Repository: ibestvina/datasloth Branch: main Commit: 6bb41fa7629a Files: 10 Total size: 55.3 KB Directory structure: gitextract_hgdrt7b0/ ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── datasloth/ │ └── __init__.py ├── examples/ │ ├── datasloth_detailed_example.ipynb │ └── datasloth_quick_example.ipynb ├── setup.cfg └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Ivan Bestvina Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ include README.rst ================================================ FILE: README.md ================================================

# DataSloth _Natural language Pandas queries and data generation powered by GPT-3_

## Installation `pip install datasloth` ## Usage In order for DataSloth to work, you must have a working [OpenAI API key](https://beta.openai.com/account/api-keys) set in your environment variable, or provide it to the DataSloth object. For more info, refer to this [guide](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). DataSloth automatically discovers all Pandas dataframes in your namespace (filtering out names starting with an underscore). Before you load any data, import DataSloth and create the `sloth`: ```python from datasloth import DataSloth sloth = DataSloth() ``` Next, load any data you want to use. Try naming your dataframes and columns in a meaningful way, as DataSloth uses these names to understand what the data is about. Once your data is loaded, simply run `sloth.query('...')` to query the data. ### Improving results To improve the results, you can set custom descriptions of your tables: `df.sloth.description = 'Verbose description of the table'` By default, table descriptions consist of information about each column in the table. You can include this default description in your custom one by adding a `{COLUMNS_SUMMARY}` placeholder. See the detailed example notebook in the examples folder for more information. ### Solving issues A lot of times, if the returned data is not correct, or not fully formatted the way you want, it helps to rephrase the question or give specific pointers to how the final data should look. To better understand where things might have gone wrong, use `show_query=True` in the `sloth.query()`, or run `sloth.show_last_query()` after the prompt has finished to print out the SQL query used (without rerunning the engine). ## Data generation DataSloth is also able to generate random data with the `generate` function. 
For example, running: ```python sloth.generate( description="people from Mars, with very space-sounding names, and strange taste in ice cream", columns=['First Name', 'Last Name', 'Date Of Birth', 'Country', 'City', 'Favourite Ice Cream'], n_rows=15 ) ``` Produces something like this: | First Name | Last Name | Date Of Birth | Country | City | Favourite Ice Cream | |-----------:|----------:|--------------:|--------:|-----------------:|--------------------:| | Glorza | Mangal | 06/12/2079 | Mars | Pryus Mater | Celestial Delight | | Yalza | Krang | 09/21/2084 | Mars | Valles Marineris | Moon Mist | | Tralza | Vomar | 04/17/2074 | Mars | Syrtis Major | Mars Mud Pie | | Dalza | Ralad | 01/02/2088 | Mars | Hellas Planitia | Alien Abduction | | Halza | Wular | 11/04/2092 | Mars | Olympus Mons | Martian Sunrise | Note that the results of the `generate` function are random, and different on each call. ================================================ FILE: README.rst ================================================ DataSloth ========= *Natural language Pandas queries and data generation powered by GPT-3* Installation ------------ ``pip install datasloth`` Usage ----- In order for DataSloth to work, you must have a working `OpenAI API key `__ set in your environment variable, or provide it to the DataSloth object. For more info, refer to this `guide `__. DataSloth automatically discovers all Pandas dataframes in your namespace (filtering out names starting with an underscode). Before you load any data, import DataSloth and create the ``sloth``: .. code:: python from datasloth import DataSloth sloth = DataSloth() Next, load any data you want to use. Try naming your dataframes and columns in a meaningful way, as DataSloth uses these names to understand what the data is about. Once your data is loaded, simply run ``sloth.query('...')`` to query the data. 
import os
import inspect
import re

import pandas as pd
from pandas.api.extensions import register_dataframe_accessor
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_datetime64_any_dtype
from pandasql import sqldf, PandaSQLException
import openai


@register_dataframe_accessor("sloth")
class SlothAccessor:
    """
    Pandas Dataframe accessor to add '.sloth.description' field to dataframes,
    and manage column summaries used by DataSloth.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        self._validate(pandas_obj)
        self._obj = pandas_obj
        # The placeholder is expanded to the generated column summary on read.
        self._description = '{COLUMNS_SUMMARY}'

    @staticmethod
    def _validate(obj) -> None:
        # Every dataframe is accepted; kept as a hook for future validation.
        pass

    @property
    def description(self) -> str:
        """Table description with the '{COLUMNS_SUMMARY}' placeholder expanded."""
        return self._description.format(COLUMNS_SUMMARY=self.columns_summary())

    @description.setter
    def description(self, value: str) -> None:
        """
        Set additional description manually to inform the language engine about this table.
        Use '{COLUMNS_SUMMARY}' to include the default column summary in the description.
        By default, description is set only to this summary. To reset it, set description to None.
        """
        if value is None:
            self._description = '{COLUMNS_SUMMARY}'
        else:
            self._description = value

    def columns_summary(self) -> str:
        """
        Returns columns summary of the dataframe, in the "table" format containing
        column names, data types and additional info about columns.
        """
        summary_lines = ['|column name|data type|info|']
        for col_name in self._obj:
            col = self._obj[col_name]
            summary_lines.append(f'|{col_name}|{col.dtype}|{column_info(col)}|')
        return '\n'.join(summary_lines)


class DataSloth:
    """Natural language Pandas queries and data generation powered by GPT-3."""

    # Few-shot prompt appended after the table summaries. The closing ``` fence
    # matches the 'stop' sequence passed to the completion call below.
    prompt_format = """
Make sure to join in tables if information from multiple tables is needed for a task.

Task: percentage of True values of column X in table Y
```
SQL query for SQLite:
SELECT (SUM(CASE WHEN X = 'True' THEN 1.0 END) / COUNT(*)) * 100 AS percentage FROM Y
```

Task: count of rows in table T where date is equal to 11th of August 1993
```
SQL query for SQLite:
SELECT COUNT(*) AS row_count FROM T WHERE date(date) = date('1993-08-11')
```

Task: {QUERY}
SQL query for SQLite:
```
"""

    def __init__(self, openai_api_key=None) -> None:
        """
        Create a DataSloth and configure the OpenAI API key.

        The key is taken from 'openai_api_key' if provided, otherwise from the
        OPENAI_API_KEY environment variable.

        Raises:
            Exception: if no API key could be found.
        """
        if openai_api_key:
            openai.api_key = openai_api_key
        else:
            openai.api_key = os.getenv("OPENAI_API_KEY")
        if not openai.api_key:
            raise Exception(
                "OpenAI API key is not set. Either provide it to DataSloth(openai_api_key='...'), "
                "set openai.api_key = '...', or set it as an env variable OPENAI_API_KEY."
            )
        # (prompt, sql_query) of the most recent query() call, for debugging.
        self.last_prompt = None
        self.last_gpt_response = None

    @staticmethod
    def dataframes_summary(env=None, ignore='^_') -> str:
        """
        Summary of all DataFrames available in the namespace, ignoring those
        matching the 'ignore' regex. Returns None when no dataframes are found.
        """
        summary_lines = ['Tables available in the database, with their additional information, are:']
        table_count = 0
        # Guard against env=None so the default call does not crash.
        for name, value in (env or {}).items():
            if isinstance(value, pd.DataFrame) and (not ignore or not re.match(ignore, name)):
                summary_lines += [
                    f"\n\nTable name: {name}",
                    value.sloth.description
                ]
                table_count += 1
        if not table_count:
            return None
        return '\n'.join(summary_lines)

    def query(self, query, env=None, show_query=False):
        """
        Query all Pandas DataFrames available in the namespace with a natural language query.

        To limit the tables used in the query, set the 'env' variable to a dict of tables
        (keys are table names, and values are table objects), or set it to globals() or
        locals(). To learn more, check pandasql docs.

        Args:
            query: natural language description of the wanted result.
            env: optional namespace dict; defaults to the caller's variables.
            show_query: when True, print the generated SQL query.

        Returns:
            A DataFrame with the result, or None if no dataframes were found or
            the generated SQL failed to execute.
        """
        env = env or get_outer_frame_variables()
        if query:
            # Lowercase only the first character so the query matches the
            # casing of the few-shot examples in prompt_format.
            query = query[0].lower() + query[1:]
        prompt = self.dataframes_summary(env)
        if not prompt:
            print('No dataframes found')
            return
        prompt += DataSloth.prompt_format.format(QUERY=query)
        response = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",  # as per OpenAI deprecations guide: https://platform.openai.com/docs/deprecations/instructgpt-models
            prompt=prompt,
            temperature=0,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n```\n"]
        )
        sql_query = response['choices'][0]['text']
        sql_query = sql_query.replace('```', '')
        self.last_prompt = (prompt, sql_query)
        if show_query:
            print(sql_query)
        try:
            result = sqldf(sql_query, env)
        except PandaSQLException:
            result = None
            print('Unsuccessful. Try rephrasing your query, or add additional table descriptions in df.sloth.description.')
            print('You can inspect the generated prompt and GPT response in sloth.show_last_prompt().')
        return result

    def generate(self, description, columns, n_rows=10):
        """
        Generates a random dataset based on the description and a list of columns.

        Args:
            description: free-form description of the data to generate.
            columns: list of column names for the generated table.
            n_rows: number of rows to return.

        Returns:
            A DataFrame with 'n_rows' rows and the requested columns. Results
            are random and different on each call.
        """
        rows = []
        while len(rows) < n_rows:
            # Ask for a few extra rows (capped at 30): some generated lines may
            # be malformed and get filtered out below.
            prompt = f'Fill the table below with {min(n_rows - len(rows) + 5, 30)} random rows about {description}\n\n'
            prompt += f"|{'|'.join(columns)}|\n"
            prompt += f"|{'|'.join(['-'*len(col) for col in columns])}|\n|"
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",  # as per OpenAI deprecations guide: https://platform.openai.com/docs/deprecations/instructgpt-models
                prompt=prompt,
                temperature=0.8,
                max_tokens=1000,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )
            response = '|' + response['choices'][0]['text']
            # Keep only well-formed markdown table rows with the right number of cells.
            new_rows = [row[1:-1].split('|') for row in response.split('\n') if not re.match('^[- |]*$', row)]
            new_rows = [row for row in new_rows if len(row) == len(columns)]
            rows += new_rows
        df = pd.DataFrame(rows, columns=columns).head(n_rows)
        return df

    def show_last_prompt(self):
        """Print the full prompt and the GPT response of the last sloth.query() call."""
        if self.last_prompt:
            print(self.last_prompt[0])
            print(f'[->]\n{self.last_prompt[1]}')

    def _last_prompt(self):
        # Backwards-compatible alias of show_last_prompt().
        self.show_last_prompt()

    def show_last_query(self):
        """Print the SQL query generated in the last sloth.query() call."""
        if self.last_prompt:
            print(self.last_prompt[1])


# Code copied from pandasql
def get_outer_frame_variables():
    """Get a dict of local and global variables of the first outer frame from another file."""
    cur_filename = inspect.getframeinfo(inspect.currentframe()).filename
    outer_frame = next(f for f in inspect.getouterframes(inspect.currentframe())
                       if f.filename != cur_filename)
    variables = {}
    variables.update(outer_frame.frame.f_globals)
    variables.update(outer_frame.frame.f_locals)
    return variables


def column_info(col):
    """Info about a specific column, different depending on its type."""
    if is_string_dtype(col) or col.dtype == 'category':
        unique = col.unique().tolist()
        # Show at most 30 unique values to keep the prompt short.
        summary = 'unique values: ' + ', '.join(map(str, unique[:30]))
        if len(unique) > 30:
            summary += '...'
    elif col.dtype == 'bool':
        summary = "values: 0, 1"
    elif is_numeric_dtype(col):
        summary = f"min={col.min()}, max={col.max()}"
    elif is_datetime64_any_dtype(col):
        summary = f"first={col.min()}, last={col.max()}"
    else:
        summary = ''
    return summary
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
\n", "
" ], "text/plain": [ " survived pclass sex age sibsp parch fare embarked class \\\n", "0 0 3 male 22.0 1 0 7.2500 S Third \n", "1 1 1 female 38.0 1 0 71.2833 C First \n", "2 1 3 female 26.0 0 0 7.9250 S Third \n", "3 1 1 female 35.0 1 0 53.1000 S First \n", "4 0 3 male 35.0 0 0 8.0500 S Third \n", "\n", " who adult_male deck embark_town alive alone \n", "0 man True NaN Southampton no False \n", "1 woman False C Cherbourg yes False \n", "2 woman False NaN Southampton yes True \n", "3 woman False C Southampton yes False \n", "4 man True NaN Southampton no True " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Main dataset to show datasloth capabilities\n", "titanic = sns.load_dataset('titanic')\n", "titanic.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT COUNT(*) AS survived_men\n", "FROM titanic\n", "WHERE sex = 'male' AND survived = 1\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survived_men
0109
\n", "
" ], "text/plain": [ " survived_men\n", "0 109" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example 1: we do not need to specify exact lables in our data. Here, 'men' is autonatically converted to 'male'.\n", "sloth.query(\"Number of men which survived the titanic\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT AVG(fare) AS avg_fare\n", "FROM titanic\n", "WHERE alone = 1 AND sex = 'male'\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
avg_fare
016.713358
\n", "
" ], "text/plain": [ " avg_fare\n", "0 16.713358" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Exmaple 2: loosely specified statistics\n", "sloth.query(\"Average fare paid by men who traveled alone\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT (SUM(CASE WHEN survived = 1 AND sex = 'male' THEN 1.0 END) / COUNT(*)) * 100 AS percentage\n", "FROM titanic\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
percentage
012.233446
\n", "
" ], "text/plain": [ " percentage\n", "0 12.233446" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example 3: more complex stats\n", "sloth.query(\"Percentage of male survivors\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT sex, (SUM(CASE WHEN survived = 1 THEN 1.0 END) / COUNT(*)) * 100 AS percentage\n", "FROM titanic\n", "GROUP BY sex\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexpercentage
0female74.203822
1male18.890815
\n", "
" ], "text/plain": [ " sex percentage\n", "0 female 74.203822\n", "1 male 18.890815" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example 4: group aggregations\n", "sloth.query(\"Calculate the percentage of survivors per sex\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclassmeal_typen_courses
01breakfast10
11lunch15
21dinner20
32breakfast5
42lunch6
52dinner7
63breakfast1
73lunch2
83dinner3
\n", "
" ], "text/plain": [ " pclass meal_type n_courses\n", "0 1 breakfast 10\n", "1 1 lunch 15\n", "2 1 dinner 20\n", "3 2 breakfast 5\n", "4 2 lunch 6\n", "5 2 dinner 7\n", "6 3 breakfast 1\n", "7 3 lunch 2\n", "8 3 dinner 3" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Introducing another dataframe into the namespace\n", "classes = pd.DataFrame({\n", " 'pclass': [1, 1, 1, 2, 2, 2, 3, 3, 3],\n", " 'meal_type': ['breakfast', 'lunch', 'dinner'] * 3, \n", " 'n_courses': [10, 15, 20, 5, 6, 7, 1, 2, 3]\n", "})\n", "classes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT sex, (SUM(CASE WHEN survived = '1' THEN 1.0 END) / COUNT(*)) * 100 AS percentage\n", "FROM titanic\n", "JOIN classes ON titanic.pclass = classes.pclass\n", "WHERE meal_type = 'breakfast' AND n_courses > 5\n", "GROUP BY sex\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexpercentage
0female96.808511
1male36.885246
\n", "
" ], "text/plain": [ " sex percentage\n", "0 female 96.808511\n", "1 male 36.885246" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example 5: automatically joining with other tables in the namescpace\n", "sloth.query(\"Calculate the percentage of survivors of people who had more than 5 courses for breakfast. Do it per sex.\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codedate
0S1912-04-10
1C1912-04-10
2Q1912-04-11
\n", "
" ], "text/plain": [ " code date\n", "0 S 1912-04-10\n", "1 C 1912-04-10\n", "2 Q 1912-04-11" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Another table, with departure dates from each port\n", "# Note that the table and column names do not explain what the information is about\n", "table_por_dep = pd.DataFrame({'code': ['S', 'C', 'Q'], 'date': pd.to_datetime(['1912-04-10', '1912-04-10', '1912-04-11'])})\n", "table_por_dep" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT COUNT(*) AS female_passengers\n", "FROM titanic\n", "WHERE sex = 'female'\n", "AND date(embarked) = date('1912-04-11')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
female_passengers
00
\n", "
" ], "text/plain": [ " female_passengers\n", "0 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sloth is not able to make the connection correctly, as it does not know that departure dates are stored in that other table\n", "sloth.query(\"Count female passengers who departed on 11th of April\", show_query=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT COUNT(*) AS female_passengers\n", "FROM titanic\n", "INNER JOIN table_por_dep ON titanic.embarked = table_por_dep.code\n", "WHERE date(table_por_dep.date) = date('1912-04-11')\n", "AND titanic.sex = 'female'\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
female_passengers
036
\n", "
" ], "text/plain": [ " female_passengers\n", "0 36" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To help, we add the table description\n", "# Note the use of a COLUMNS_SUMMARY placeholder to still keep the default description in.\n", "table_por_dep.sloth.description = \\\n", "\"Departure date table, to be joined to the main Titanic table on the 'embarked' code. \\n{COLUMNS_SUMMARY}\"\n", "\n", "sloth.query(\"Count female passengers who departed from their port on 11th of April\", show_query=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data generation" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First NameLast NameDate Of BirthCountryCityFavourite Ice Cream
0GlorzaMangal06/12/2079MarsPryus MaterCelestial Delight
1YalzaKrang09/21/2084MarsValles MarinerisMoon Mist
2TralzaVomar04/17/2074MarsSyrtis MajorMars Mud Pie
3DalzaRalad01/02/2088MarsHellas PlanitiaAlien Abduction
4HalzaWular11/04/2092MarsOlympus MonsMartian Sunrise
5KalzaLopal03/09/2073MarsAres VallisRed Planet
6MalzaBomar07/14/2081MarsTerra CimmeriaMars Bar
7NalzaKamar12/25/2085MarsUtopia PlanitiaEspresso crunch
8RalzaFomar02/11/2070MarsArsia MonsCotton candy
9SalzaSoldar05/16/2078MarsTharsis MontesButterscotch
10TalzaWomar10/28/2080MarsMangala VallesCookies and Cream
11UlzaDalad06/01/2072MarsElysium PlanitiaGreen Tea
12VulzaRopal04/14/2087MarsCydonia MensaeMint chocolate chip
13ZalzaBular07/11/2089MarsIsidis PlanitiaRocky Road
14BlorzaFomar09/08/2076MarsTempe TerraVanilla
\n", "
" ], "text/plain": [ " First Name Last Name Date Of Birth Country City \\\n", "0 Glorza Mangal 06/12/2079 Mars Pryus Mater \n", "1 Yalza Krang 09/21/2084 Mars Valles Marineris \n", "2 Tralza Vomar 04/17/2074 Mars Syrtis Major \n", "3 Dalza Ralad 01/02/2088 Mars Hellas Planitia \n", "4 Halza Wular 11/04/2092 Mars Olympus Mons \n", "5 Kalza Lopal 03/09/2073 Mars Ares Vallis \n", "6 Malza Bomar 07/14/2081 Mars Terra Cimmeria \n", "7 Nalza Kamar 12/25/2085 Mars Utopia Planitia \n", "8 Ralza Fomar 02/11/2070 Mars Arsia Mons \n", "9 Salza Soldar 05/16/2078 Mars Tharsis Montes \n", "10 Talza Womar 10/28/2080 Mars Mangala Valles \n", "11 Ulza Dalad 06/01/2072 Mars Elysium Planitia \n", "12 Vulza Ropal 04/14/2087 Mars Cydonia Mensae \n", "13 Zalza Bular 07/11/2089 Mars Isidis Planitia \n", "14 Blorza Fomar 09/08/2076 Mars Tempe Terra \n", "\n", " Favourite Ice Cream \n", "0 Celestial Delight \n", "1 Moon Mist \n", "2 Mars Mud Pie \n", "3 Alien Abduction \n", "4 Martian Sunrise \n", "5 Red Planet \n", "6 Mars Bar \n", "7 Espresso crunch \n", "8 Cotton candy \n", "9 Butterscotch \n", "10 Cookies and Cream \n", "11 Green Tea \n", "12 Mint chocolate chip \n", "13 Rocky Road \n", "14 Vanilla " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Given a table description and a list of columns, DataSloth can generate some random data\n", "sloth.generate(\n", " \"people from Mars, with very space-sounding names, and strange taste in ice cream\", \n", " ['First Name', 'Last Name', 'Date Of Birth', 'Country', 'City', 'Favourite Ice Cream'],\n", " n_rows=15\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "fa2753a9fc1c7a7f868f370d31058bd0275fd3cd078c4899cfafe3ad2d226086" } } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: examples/datasloth_quick_example.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from datasloth import DataSloth\n", "sloth = DataSloth()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
\n", "
" ], "text/plain": [ " survived pclass sex age sibsp parch fare embarked class \\\n", "0 0 3 male 22.0 1 0 7.2500 S Third \n", "1 1 1 female 38.0 1 0 71.2833 C First \n", "\n", " who adult_male deck embark_town alive alone \n", "0 man True NaN Southampton no False \n", "1 woman False C Cherbourg yes False " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import seaborn as sns\n", "titanic = sns.load_dataset('titanic')\n", "titanic.head(2)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
avg_fare
016.713358
\n", "
" ], "text/plain": [ " avg_fare\n", "0 16.713358" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sloth.query(\"Average fare paid by men who traveled alone\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
001female50.00028.7125CFirstwoman0CCherbourgno1
101female2.012151.5500SFirstchild0CSouthamptonno0
201female25.012151.5500SFirstwoman0CSouthamptonno0
\n", "
" ], "text/plain": [ " survived pclass sex age sibsp parch fare embarked class \\\n", "0 0 1 female 50.0 0 0 28.7125 C First \n", "1 0 1 female 2.0 1 2 151.5500 S First \n", "2 0 1 female 25.0 1 2 151.5500 S First \n", "\n", " who adult_male deck embark_town alive alone \n", "0 woman 0 C Cherbourg no 1 \n", "1 child 0 C Southampton no 0 \n", "2 woman 0 C Southampton no 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sloth.query(\"All first class women who did not survive\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "fa2753a9fc1c7a7f868f370d31058bd0275fd3cd078c4899cfafe3ad2d226086" } } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: setup.cfg ================================================ [metadata] description-file = README.md ================================================ FILE: setup.py ================================================ from setuptools import setup def readme(): with open('README.rst') as f: return f.read() setup( name='datasloth', version='0.4', description='Natural language Pandas queries and data generation', url='http://github.com/ibestvina/datasloth', author='Ivan Bestvina', author_email='ivan.bestvina@gmail.com', license='MIT', packages=['datasloth'], zip_safe=False, install_requires=[ 'openai', 'pandas', 'pandasql' ], long_description=readme(), )