Repository: modin-project/modin Branch: main Commit: 7ca200b08597 Files: 681 Total size: 6.8 MB Directory structure: gitextract_eudtie4f/ ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.yaml │ │ ├── feature_request.md │ │ └── question.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── actions/ │ │ ├── mamba-env/ │ │ │ └── action.yml │ │ ├── python-only/ │ │ │ └── action.yml │ │ ├── run-core-tests/ │ │ │ ├── action.yml │ │ │ ├── group_1/ │ │ │ │ └── action.yml │ │ │ ├── group_2/ │ │ │ │ └── action.yml │ │ │ ├── group_3/ │ │ │ │ └── action.yml │ │ │ └── group_4/ │ │ │ └── action.yml │ │ └── upload-coverage/ │ │ └── action.yml │ ├── dependabot.yaml │ ├── stale.yml │ └── workflows/ │ ├── ci-notebooks.yml │ ├── ci-required.yml │ ├── ci.yml │ ├── codeql/ │ │ └── codeql-config.yml │ ├── codeql.yml │ ├── fuzzydata-test.yml │ ├── publish-to-pypi.yml │ ├── push-to-main.yml │ └── sql_server/ │ └── set_up_sql_server.sh ├── .gitignore ├── .readthedocs.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── LICENSE_HEADER ├── MANIFEST.in ├── NOTICE ├── README.md ├── asv_bench/ │ ├── README.md │ ├── asv.conf.dask.json │ ├── asv.conf.json │ ├── asv.conf.unidist.json │ ├── benchmarks/ │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── csv.py │ │ │ └── parquet.py │ │ ├── scalability/ │ │ │ ├── __init__.py │ │ │ └── scalability_benchmarks.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── compatibility.py │ │ └── data_shapes.py │ └── test/ │ ├── __init__.py │ └── test_utils.py ├── ci/ │ └── teamcity/ │ ├── Dockerfile.teamcity-ci │ ├── build-docker.py │ └── comment_on_pr.py ├── codecov.yml ├── contributing/ │ ├── contributing.md │ └── pre-commit ├── docker/ │ └── Dockerfile ├── docs/ │ ├── _static/ │ │ └── custom.js │ ├── _templates/ │ │ └── layout.html │ ├── conf.py │ ├── contact.rst │ ├── development/ │ │ ├── architecture.rst │ │ ├── contributing.rst │ │ ├── index.rst │ │ ├── partition_api.rst │ │ ├── 
using_pandas_on_dask.rst │ │ ├── using_pandas_on_mpi.rst │ │ ├── using_pandas_on_python.rst │ │ └── using_pandas_on_ray.rst │ ├── ecosystem.rst │ ├── flow/ │ │ └── modin/ │ │ ├── config.rst │ │ ├── core/ │ │ │ ├── dataframe/ │ │ │ │ ├── algebra.rst │ │ │ │ ├── base/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ └── axis_partition.rst │ │ │ │ ├── index.rst │ │ │ │ └── pandas/ │ │ │ │ ├── dataframe.rst │ │ │ │ ├── index.rst │ │ │ │ ├── metadata/ │ │ │ │ │ ├── dtypes.rst │ │ │ │ │ └── index.rst │ │ │ │ └── partitioning/ │ │ │ │ ├── axis_partition.rst │ │ │ │ ├── partition.rst │ │ │ │ └── partition_manager.rst │ │ │ ├── execution/ │ │ │ │ ├── dask/ │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_dask/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── partition.rst │ │ │ │ │ ├── partition_manager.rst │ │ │ │ │ └── virtual_partition.rst │ │ │ │ ├── dispatching.rst │ │ │ │ ├── python/ │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_python/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── axis_partition.rst │ │ │ │ │ ├── partition.rst │ │ │ │ │ └── partition_manager.rst │ │ │ │ ├── ray/ │ │ │ │ │ ├── generic.rst │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_ray/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── axis_partition.rst │ │ │ │ │ ├── partition.rst │ │ │ │ │ └── partition_manager.rst │ │ │ │ └── unidist/ │ │ │ │ ├── generic.rst │ │ │ │ └── implementations/ │ │ │ │ └── pandas_on_unidist/ │ │ │ │ ├── dataframe.rst │ │ │ │ ├── index.rst │ │ │ │ └── partitioning/ │ │ │ │ ├── axis_partition.rst │ │ │ │ ├── partition.rst │ │ │ │ └── partition_manager.rst │ │ │ ├── io/ │ │ │ │ └── index.rst │ │ │ └── storage_formats/ │ │ │ ├── base/ │ │ │ │ └── query_compiler.rst │ │ │ ├── index.rst │ │ │ └── pandas/ │ │ │ ├── index.rst │ │ │ ├── parsers.rst │ │ │ └── query_compiler.rst │ │ 
├── distributed/ │ │ │ └── dataframe/ │ │ │ └── pandas.rst │ │ ├── experimental/ │ │ │ ├── batch.rst │ │ │ ├── core/ │ │ │ │ └── io/ │ │ │ │ └── index.rst │ │ │ ├── index.rst │ │ │ ├── pandas.rst │ │ │ ├── range_partitioning_groupby.rst │ │ │ ├── reshuffling_groupby.rst │ │ │ ├── sklearn.rst │ │ │ └── xgboost.rst │ │ ├── pandas/ │ │ │ ├── base.rst │ │ │ ├── dataframe.rst │ │ │ └── series.rst │ │ └── utils.rst │ ├── getting_started/ │ │ ├── examples.rst │ │ ├── faq.rst │ │ ├── installation.rst │ │ ├── quickstart.rst │ │ ├── troubleshooting.rst │ │ ├── using_modin/ │ │ │ ├── using_modin.rst │ │ │ ├── using_modin_cluster.rst │ │ │ └── using_modin_locally.rst │ │ └── why_modin/ │ │ ├── modin_vs_dask_vs_koalas.rst │ │ ├── out_of_core.rst │ │ ├── pandas.rst │ │ └── why_modin.rst │ ├── index.rst │ ├── release-procedure.md │ ├── release_notes/ │ │ ├── release_notes-0.14.0.rst │ │ ├── release_notes-0.15.0.rst │ │ ├── release_notes-0.16.0.rst │ │ └── release_notes-template.rst │ ├── requirements-doc.txt │ ├── supported_apis/ │ │ ├── dataframe_supported.rst │ │ ├── defaulting_to_pandas.rst │ │ ├── index.rst │ │ ├── io_supported.rst │ │ ├── older_pandas_compat.rst │ │ ├── series_supported.rst │ │ └── utilities_supported.rst │ └── usage_guide/ │ ├── advanced_usage/ │ │ ├── batch.rst │ │ ├── index.rst │ │ ├── modin_engines.rst │ │ ├── modin_logging.rst │ │ ├── modin_metrics.rst │ │ ├── modin_xgboost.rst │ │ ├── progress_bar.rst │ │ └── spreadsheets_api.rst │ ├── benchmarking.rst │ ├── examples/ │ │ └── index.rst │ ├── index.rst │ ├── integrations.rst │ └── optimization_notes/ │ ├── index.rst │ └── range_partitioning_ops.rst ├── environment-dev.yml ├── examples/ │ ├── data/ │ │ ├── boston_housing.csv │ │ ├── census_1k.csv │ │ ├── nyc-taxi_1k.csv │ │ ├── plasticc_test_set_1k.csv │ │ ├── plasticc_test_set_metadata_1k.csv │ │ ├── plasticc_training_set_1k.csv │ │ └── plasticc_training_set_metadata_1k.csv │ ├── docker/ │ │ └── modin-ray/ │ │ ├── Dockerfile │ │ ├── 
build-docker-image.sh │ │ ├── census.py │ │ ├── nyc-taxi.py │ │ ├── plasticc.py │ │ └── taxi.pstat │ ├── jupyter/ │ │ ├── Modin_Taxi.ipynb │ │ ├── Pandas_Taxi.ipynb │ │ └── integrations/ │ │ ├── NLTK.ipynb │ │ ├── altair.ipynb │ │ ├── bokeh.ipynb │ │ ├── huggingface.ipynb │ │ ├── matplotlib.ipynb │ │ ├── plotly.ipynb │ │ ├── seaborn.ipynb │ │ ├── sklearn.ipynb │ │ ├── statsmodels.ipynb │ │ ├── tensorflow.ipynb │ │ └── xgboost.ipynb │ ├── modin-scikit-learn-example.ipynb │ ├── quickstart.ipynb │ ├── spreadsheet/ │ │ ├── requirements.txt │ │ └── tutorial.ipynb │ └── tutorial/ │ ├── README.md │ └── jupyter/ │ ├── README.md │ └── execution/ │ ├── pandas_on_dask/ │ │ ├── Dockerfile │ │ ├── cluster/ │ │ │ └── exercise_5.ipynb │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── requirements.txt │ │ └── test/ │ │ └── test_notebooks.py │ ├── pandas_on_ray/ │ │ ├── Dockerfile │ │ ├── cluster/ │ │ │ ├── README.md │ │ │ ├── exercise_5.py │ │ │ └── modin-cluster.yaml │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── requirements.txt │ │ └── test/ │ │ └── test_notebooks.py │ ├── pandas_on_unidist/ │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── jupyter_unidist_env.yml │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── setup_kernel.py │ │ └── test/ │ │ └── test_notebooks.py │ └── test/ │ └── utils.py ├── modin/ │ ├── __init__.py │ ├── __main__.py │ ├── _version.py │ ├── config/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── envvars.py │ │ └── pubsub.py │ ├── conftest.py │ ├── core/ │ │ ├── __init__.py │ │ ├── computation/ │ │ │ ├── __init__.py │ │ │ ├── align.py │ │ │ ├── check.py │ │ │ ├── common.py │ │ │ ├── engines.py │ │ │ ├── eval.py │ │ │ ├── expr.py │ │ │ ├── ops.py │ │ │ ├── parsing.py │ │ │ └── scope.py │ │ ├── dataframe/ │ │ │ ├── 
__init__.py │ │ │ ├── algebra/ │ │ │ │ ├── __init__.py │ │ │ │ ├── binary.py │ │ │ │ ├── default2pandas/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── binary.py │ │ │ │ │ ├── cat.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ ├── datetime.py │ │ │ │ │ ├── default.py │ │ │ │ │ ├── groupby.py │ │ │ │ │ ├── list.py │ │ │ │ │ ├── resample.py │ │ │ │ │ ├── rolling.py │ │ │ │ │ ├── series.py │ │ │ │ │ ├── str.py │ │ │ │ │ └── struct.py │ │ │ │ ├── fold.py │ │ │ │ ├── groupby.py │ │ │ │ ├── map.py │ │ │ │ ├── operator.py │ │ │ │ ├── reduce.py │ │ │ │ └── tree_reduce.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ └── utils.py │ │ │ │ ├── interchange/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe_protocol/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ └── utils.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ └── axis_partition.py │ │ │ └── pandas/ │ │ │ ├── __init__.py │ │ │ ├── dataframe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe.py │ │ │ │ └── utils.py │ │ │ ├── interchange/ │ │ │ │ ├── __init__.py │ │ │ │ └── dataframe_protocol/ │ │ │ │ ├── __init__.py │ │ │ │ ├── buffer.py │ │ │ │ ├── column.py │ │ │ │ ├── dataframe.py │ │ │ │ ├── exception.py │ │ │ │ └── from_dataframe.py │ │ │ ├── metadata/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dtypes.py │ │ │ │ └── index.py │ │ │ ├── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── axis_partition.py │ │ │ │ ├── partition.py │ │ │ │ └── partition_manager.py │ │ │ └── utils.py │ │ ├── execution/ │ │ │ ├── __init__.py │ │ │ ├── dask/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_dask/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── 
__init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── dispatching/ │ │ │ │ ├── __init__.py │ │ │ │ └── factories/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dispatcher.py │ │ │ │ └── factories.py │ │ │ ├── modin_aqp.py │ │ │ ├── python/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── engine_wrapper.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_python/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── ray/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── deferred_execution.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ ├── generic/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── io.py │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── partition_manager.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_ray/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── unidist/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ ├── generic/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── io.py │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── partition_manager.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_unidist/ │ │ │ │ ├── __init__.py │ │ 
│ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ └── utils.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── column_stores/ │ │ │ │ ├── __init__.py │ │ │ │ ├── column_store_dispatcher.py │ │ │ │ ├── feather_dispatcher.py │ │ │ │ ├── hdf_dispatcher.py │ │ │ │ └── parquet_dispatcher.py │ │ │ ├── file_dispatcher.py │ │ │ ├── io.py │ │ │ ├── sql/ │ │ │ │ ├── __init__.py │ │ │ │ └── sql_dispatcher.py │ │ │ └── text/ │ │ │ ├── __init__.py │ │ │ ├── csv_dispatcher.py │ │ │ ├── excel_dispatcher.py │ │ │ ├── fwf_dispatcher.py │ │ │ ├── json_dispatcher.py │ │ │ ├── text_file_dispatcher.py │ │ │ └── utils.py │ │ └── storage_formats/ │ │ ├── __init__.py │ │ ├── base/ │ │ │ ├── __init__.py │ │ │ ├── doc_utils.py │ │ │ ├── query_compiler.py │ │ │ └── query_compiler_calculator.py │ │ └── pandas/ │ │ ├── __init__.py │ │ ├── aggregations.py │ │ ├── groupby.py │ │ ├── merge.py │ │ ├── native_query_compiler.py │ │ ├── parsers.py │ │ ├── query_compiler.py │ │ ├── query_compiler_caster.py │ │ └── utils.py │ ├── db_conn.py │ ├── distributed/ │ │ ├── __init__.py │ │ └── dataframe/ │ │ ├── __init__.py │ │ └── pandas/ │ │ ├── __init__.py │ │ └── partitions.py │ ├── error_message.py │ ├── experimental/ │ │ ├── __init__.py │ │ ├── batch/ │ │ │ ├── __init__.py │ │ │ └── pipeline.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── execution/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dask/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── implementations/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── pandas_on_dask/ │ │ │ │ │ └── __init__.py │ │ │ │ ├── ray/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── implementations/ │ │ │ │ │ └── __init__.py │ │ │ │ └── unidist/ │ │ │ │ ├── __init__.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_unidist/ │ │ │ │ └── __init__.py │ │ 
│ ├── io/ │ │ │ │ ├── __init__.py │ │ │ │ ├── glob/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── glob_dispatcher.py │ │ │ │ ├── sql/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── sql_dispatcher.py │ │ │ │ │ └── utils.py │ │ │ │ └── text/ │ │ │ │ ├── __init__.py │ │ │ │ ├── csv_glob_dispatcher.py │ │ │ │ └── custom_text_dispatcher.py │ │ │ └── storage_formats/ │ │ │ ├── __init__.py │ │ │ └── pandas/ │ │ │ ├── __init__.py │ │ │ └── parsers.py │ │ ├── fuzzydata/ │ │ │ └── __init__.py │ │ ├── pandas/ │ │ │ ├── __init__.py │ │ │ └── io.py │ │ ├── sklearn/ │ │ │ ├── __init__.py │ │ │ └── model_selection/ │ │ │ ├── __init__.py │ │ │ └── train_test_split.py │ │ ├── spreadsheet/ │ │ │ ├── __init__.py │ │ │ └── general.py │ │ ├── torch/ │ │ │ ├── __init__.py │ │ │ └── datasets.py │ │ └── xgboost/ │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── xgboost.py │ │ └── xgboost_ray.py │ ├── logging/ │ │ ├── __init__.py │ │ ├── class_logger.py │ │ ├── config.py │ │ ├── logger_decorator.py │ │ └── metrics.py │ ├── numpy/ │ │ ├── __init__.py │ │ ├── arr.py │ │ ├── array_creation.py │ │ ├── array_shaping.py │ │ ├── constants.py │ │ ├── indexing.py │ │ ├── linalg.py │ │ ├── logic.py │ │ ├── math.py │ │ ├── trigonometry.py │ │ └── utils.py │ ├── pandas/ │ │ ├── __init__.py │ │ ├── accessor.py │ │ ├── api/ │ │ │ ├── __init__.py │ │ │ └── extensions/ │ │ │ ├── __init__.py │ │ │ └── extensions.py │ │ ├── arrays/ │ │ │ └── __init__.py │ │ ├── base.py │ │ ├── dataframe.py │ │ ├── errors/ │ │ │ └── __init__.py │ │ ├── general.py │ │ ├── groupby.py │ │ ├── indexing.py │ │ ├── io.py │ │ ├── iterator.py │ │ ├── plotting.py │ │ ├── resample.py │ │ ├── series.py │ │ ├── series_utils.py │ │ ├── testing/ │ │ │ └── __init__.py │ │ ├── utils.py │ │ └── window.py │ ├── polars/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dataframe.py │ │ ├── groupby.py │ │ ├── lazyframe.py │ │ └── series.py │ ├── tests/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── docs_module/ │ │ │ │ ├── __init__.py │ │ │ │ ├── 
classes.py │ │ │ │ └── functions.py │ │ │ ├── docs_module_with_just_base/ │ │ │ │ ├── __init__.py │ │ │ │ └── classes.py │ │ │ ├── test_envvars.py │ │ │ └── test_parameter.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── storage_formats/ │ │ │ │ ├── base/ │ │ │ │ │ └── test_internals.py │ │ │ │ ├── cudf/ │ │ │ │ │ ├── test_gpu_managers.py │ │ │ │ │ └── test_internals.py │ │ │ │ └── pandas/ │ │ │ │ └── test_internals.py │ │ │ └── test_dispatcher.py │ │ ├── experimental/ │ │ │ ├── __init__.py │ │ │ ├── spreadsheet/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_general.py │ │ │ ├── test_fuzzydata.py │ │ │ ├── test_io_exp.py │ │ │ ├── test_pipeline.py │ │ │ ├── torch/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_dataloader.py │ │ │ └── xgboost/ │ │ │ ├── __init__.py │ │ │ ├── test_default.py │ │ │ ├── test_dmatrix.py │ │ │ └── test_xgboost.py │ │ ├── interchange/ │ │ │ ├── __init__.py │ │ │ └── dataframe_protocol/ │ │ │ ├── __init__.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_sanity.py │ │ │ │ └── test_utils.py │ │ │ ├── pandas/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_protocol.py │ │ │ └── test_general.py │ │ ├── numpy/ │ │ │ ├── __init__.py │ │ │ ├── test_array.py │ │ │ ├── test_array_arithmetic.py │ │ │ ├── test_array_axis_functions.py │ │ │ ├── test_array_creation.py │ │ │ ├── test_array_indexing.py │ │ │ ├── test_array_linalg.py │ │ │ ├── test_array_logic.py │ │ │ ├── test_array_math.py │ │ │ ├── test_array_shaping.py │ │ │ └── utils.py │ │ ├── pandas/ │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── airline.sas7bdat │ │ │ │ ├── blah.csv │ │ │ │ ├── every_other_row_nan.xlsx │ │ │ │ ├── excel_sheetname_title.xlsx │ │ │ │ ├── hdfs.parquet/ │ │ │ │ │ ├── part-00000-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ │ ├── part-00001-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ │ └── part-00002-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ ├── issue5159.parquet/ │ 
│ │ │ │ └── part-0000.snappy.parquet/ │ │ │ │ │ ├── par=a/ │ │ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet │ │ │ │ │ └── par=b/ │ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet │ │ │ │ ├── issue_1930.csv │ │ │ │ ├── issue_2074.csv │ │ │ │ ├── issue_2239.csv │ │ │ │ ├── issue_3119.csv │ │ │ │ ├── issue_4543.csv │ │ │ │ ├── issue_976.csv │ │ │ │ ├── modin_error_book.xlsx │ │ │ │ ├── multiple_csv/ │ │ │ │ │ ├── test_data0.csv │ │ │ │ │ └── test_data1.csv │ │ │ │ ├── newlines.csv │ │ │ │ ├── test_border_rows.xlsx │ │ │ │ ├── test_categories.csv │ │ │ │ ├── test_categories.json │ │ │ │ ├── test_data.feather │ │ │ │ ├── test_data.fwf │ │ │ │ ├── test_data.json │ │ │ │ ├── test_data.parquet │ │ │ │ ├── test_data_dir.parquet/ │ │ │ │ │ ├── part_0.parquet │ │ │ │ │ ├── part_1.parquet │ │ │ │ │ ├── part_10.parquet │ │ │ │ │ ├── part_11.parquet │ │ │ │ │ ├── part_12.parquet │ │ │ │ │ ├── part_13.parquet │ │ │ │ │ ├── part_14.parquet │ │ │ │ │ ├── part_15.parquet │ │ │ │ │ ├── part_2.parquet │ │ │ │ │ ├── part_3.parquet │ │ │ │ │ ├── part_4.parquet │ │ │ │ │ ├── part_5.parquet │ │ │ │ │ ├── part_6.parquet │ │ │ │ │ ├── part_7.parquet │ │ │ │ │ ├── part_8.parquet │ │ │ │ │ └── part_9.parquet │ │ │ │ ├── test_delim.csv │ │ │ │ ├── test_different_columns_in_rows.json │ │ │ │ ├── test_empty_rows.xlsx │ │ │ │ ├── test_emptyline.xlsx │ │ │ │ ├── test_null_col.csv │ │ │ │ ├── test_time_parsing.csv │ │ │ │ └── test_usecols.csv │ │ │ ├── dataframe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_binary.py │ │ │ │ ├── test_default.py │ │ │ │ ├── test_indexing.py │ │ │ │ ├── test_iter.py │ │ │ │ ├── test_join_sort.py │ │ │ │ ├── test_map_metadata.py │ │ │ │ ├── test_pickle.py │ │ │ │ ├── test_reduce.py │ │ │ │ ├── test_udf.py │ │ │ │ └── test_window.py │ │ │ ├── extensions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── conftest.py │ │ │ │ ├── test_api_reexport.py │ │ │ │ ├── test_base_extensions.py │ │ │ │ ├── test_dataframe_extensions.py │ │ │ │ ├── test_groupby_extensions.py │ │ │ │ 
├── test_pd_extensions.py │ │ │ │ └── test_series_extensions.py │ │ │ ├── integrations/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_lazy_import.py │ │ │ ├── internals/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_benchmark_mode.py │ │ │ ├── native_df_interoperability/ │ │ │ │ ├── __init__.py │ │ │ │ ├── conftest.py │ │ │ │ ├── test_binary.py │ │ │ │ ├── test_compiler_caster.py │ │ │ │ ├── test_copy_on_write.py │ │ │ │ ├── test_default.py │ │ │ │ ├── test_default_to_pandas_without_warnings.py │ │ │ │ ├── test_general.py │ │ │ │ ├── test_indexing.py │ │ │ │ ├── test_iter.py │ │ │ │ ├── test_join_sort.py │ │ │ │ ├── test_map_metadata.py │ │ │ │ ├── test_pickle.py │ │ │ │ ├── test_window.py │ │ │ │ └── utils.py │ │ │ ├── test_api.py │ │ │ ├── test_backend.py │ │ │ ├── test_concat.py │ │ │ ├── test_expanding.py │ │ │ ├── test_general.py │ │ │ ├── test_groupby.py │ │ │ ├── test_io.py │ │ │ ├── test_repartition.py │ │ │ ├── test_reshape.py │ │ │ ├── test_rolling.py │ │ │ ├── test_series.py │ │ │ └── utils.py │ │ ├── polars/ │ │ │ └── test_dataframe.py │ │ ├── test_dataframe_api_standard.py │ │ ├── test_docstring_urls.py │ │ ├── test_envvar_catcher.py │ │ ├── test_envvar_npartitions.py │ │ ├── test_executions_api.py │ │ ├── test_headers.py │ │ ├── test_logging.py │ │ ├── test_metrics.py │ │ ├── test_partition_api.py │ │ └── test_utils.py │ └── utils.py ├── modin-autoimport-pandas.pth ├── mypy.ini ├── requirements/ │ ├── env_unidist_linux.yml │ ├── env_unidist_win.yml │ └── requirements-no-engine.yml ├── requirements-dev.txt ├── scripts/ │ ├── __init__.py │ ├── doc_checker.py │ ├── release.py │ └── test/ │ ├── __init__.py │ ├── examples.py │ └── test_doc_checker.py ├── setup.cfg ├── setup.py ├── stress_tests/ │ ├── kaggle/ │ │ ├── kaggle10.py │ │ ├── kaggle12.py │ │ ├── kaggle13.py │ │ ├── kaggle14.py │ │ ├── kaggle17.py │ │ ├── kaggle18.py │ │ ├── kaggle19.py │ │ ├── kaggle20.py │ │ ├── kaggle22.py │ │ ├── kaggle3.py │ │ ├── kaggle4.py │ │ ├── kaggle5.py │ │ ├── kaggle6.py │ │ 
├── kaggle7.py │ │ ├── kaggle8.py │ │ └── kaggle9.py │ ├── run_stress_tests.sh │ └── test_kaggle_ipynb.py └── versioneer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ * text=auto modin/_version.py export-subst ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.yaml ================================================ name: Bug report description: Report incorrect behavior in the Modin library title: 'BUG: ' labels: ['bug 🦗', 'Triage 🩹'] body: - type: checkboxes id: checks attributes: label: Modin version checks options: - label: > I have checked that this issue has not already been reported. required: true - label: > I have confirmed this bug exists on the latest released version of Modin. required: true - label: > I have confirmed this bug exists on the main branch of Modin. (In order to do this you can follow [this guide](https://modin.readthedocs.io/en/stable/getting_started/installation.html#installing-from-the-github-main-branch).) - type: textarea id: example attributes: label: Reproducible Example description: > Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to provide a minimal, copy-pastable example. placeholder: > import modin.pandas as pd df = pd.DataFrame(range(5)) ... render: python validations: required: true - type: textarea id: problem attributes: label: Issue Description description: > Please provide a description of the issue shown in the reproducible example. validations: required: true - type: textarea id: expected-behavior attributes: label: Expected Behavior description: > Please describe or show a code example of the expected behavior. 
validations: required: true - type: textarea id: logs attributes: label: Error Logs description: > Please paste the output of any relevant error logs. value: >
```python-traceback Replace this line with the error backtrace (if applicable). ```
- type: textarea id: version attributes: label: Installed Versions description: > Please paste the output of ``pd.show_versions()`` value: >
Replace this line with the output of pd.show_versions()
validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Request a new API or feature implementation title: '' labels: 'new feature/request 💬, Triage 🩹' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. What kind of performance improvements would you like to see with this new API? ================================================ FILE: .github/ISSUE_TEMPLATE/question.md ================================================ --- name: Question about: You want to ask a question title: '' labels: 'question ❓, Triage 🩹' assignees: '' --- ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## What do these changes do? - [x] first commit message and PR title follow format outlined [here](https://modin.readthedocs.io/en/latest/development/contributing.html#commit-message-formatting) > **_NOTE:_** If you edit the PR title to match this format, you need to add another commit (even if it's empty) or amend your last commit for the CI job that checks the PR title to pick up the new PR title. - [ ] passes `flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py` - [ ] passes `black --check modin/ asv_bench/benchmarks scripts/doc_checker.py` - [ ] signed commit with `git commit -s` - [ ] Resolves #? 
- [ ] tests added and passing - [ ] module layout described at `docs/development/architecture.rst` is up-to-date ================================================ FILE: .github/actions/mamba-env/action.yml ================================================ name: "Install environment using Mamba" description: "Prepare the environment to run Modin" inputs: python-version: description: "Python version to install" default: "3.9" environment-file: description: "Conda environment yml" required: true activate-environment: description: "Conda environment to activate" default: "modin" runs: using: "composite" steps: - name: Get current week id: get-week # use current week as cache key to periodically refresh the cache, # as cache is based on requirements, but dependencies push # updated versions at some irregular pace run: echo "thisweek=$(/bin/date -u '+%Y.w%W')" >> $GITHUB_OUTPUT shell: bash - name: Cache conda id: cache-conda uses: actions/cache@v4 with: path: | ~/conda_pkgs_dir ~/.cache/pip key: ${{ runner.os }}-conda-${{ steps.get-week.outputs.thisweek }}-${{ hashFiles(inputs.environment-file) }} - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest use-mamba: true activate-environment: ${{ inputs.activate-environment }} environment-file: ${{ inputs.environment-file }} python-version: ${{ inputs.python-version }} channel-priority: strict # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 use-only-tar-bz2: false - shell: bash -l {0} run: | conda run -n ${{ inputs.activate-environment }} pip install . 
conda list -n ${{ inputs.activate-environment }} ================================================ FILE: .github/actions/python-only/action.yml ================================================ name: "Install Python only" description: "Prepare the environment to run simple tasks" inputs: python-version: description: "Python version to install" default: "3.9" runs: using: "composite" steps: - uses: actions/setup-python@v5 with: python-version: ${{ inputs.python-version }} architecture: "x64" cache: 'pip' ================================================ FILE: .github/actions/run-core-tests/action.yml ================================================ name: "Run core Modin tests" description: "Run core Modin tests like dataframe or groupby" inputs: runner: description: "Runner for tests" default: "python -m pytest" parallel: description: "How to run tests in parallel" default: "-n 2" runs: using: "composite" steps: - uses: ./.github/actions/run-core-tests/group_1 with: runner: ${{ inputs.runner }} parallel: ${{ inputs.parallel }} - uses: ./.github/actions/run-core-tests/group_2 with: runner: ${{ inputs.runner }} parallel: ${{ inputs.parallel }} - uses: ./.github/actions/run-core-tests/group_3 with: runner: ${{ inputs.runner }} parallel: ${{ inputs.parallel }} - uses: ./.github/actions/run-core-tests/group_4 with: runner: ${{ inputs.runner }} parallel: ${{ inputs.parallel }} ================================================ FILE: .github/actions/run-core-tests/group_1/action.yml ================================================ name: "Run core Modin tests - group 1" description: "Run core Modin tests like dataframe or groupby" inputs: runner: description: "Runner for tests" default: "python -m pytest" parallel: description: "How to run tests in parallel" default: "-n 2" runs: using: "composite" steps: - run: | echo "::group::Running dataframe tests (group 1)..." 
${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_binary.py \ modin/tests/pandas/dataframe/test_default.py \ modin/tests/pandas/dataframe/test_indexing.py \ modin/tests/pandas/dataframe/test_iter.py echo "::endgroup::" shell: bash -l {0} ================================================ FILE: .github/actions/run-core-tests/group_2/action.yml ================================================ name: "Run core Modin tests - group 2" description: "Run core Modin tests like dataframe or groupby" inputs: runner: description: "Runner for tests" default: "python -m pytest" parallel: description: "How to run tests in parallel" default: "-n 2" runs: using: "composite" steps: - run: | echo "::group::Running dataframe tests (group 2)..." ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_join_sort.py \ modin/tests/pandas/dataframe/test_reduce.py \ modin/tests/pandas/dataframe/test_udf.py \ modin/tests/pandas/dataframe/test_window.py \ modin/tests/pandas/dataframe/test_pickle.py \ modin/tests/pandas/test_repartition.py \ modin/tests/pandas/test_backend.py echo "::endgroup::" shell: bash -l {0} ================================================ FILE: .github/actions/run-core-tests/group_3/action.yml ================================================ name: "Run core Modin tests - group 3" description: "Run core Modin tests like dataframe or groupby" inputs: runner: description: "Runner for tests" default: "python -m pytest" parallel: description: "How to run tests in parallel" default: "-n 2" runs: using: "composite" steps: - run: | echo "::group::Running tests (group 3)..." ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_series.py \ modin/tests/pandas/dataframe/test_map_metadata.py echo "::endgroup::" shell: bash -l {0} - run: | echo "::group::Running range-partitioning tests (group 3)..." 
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_groupby.py MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_series.py -k "test_unique or test_nunique or drop_duplicates or test_resample" MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_general.py -k "test_unique" MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_map_metadata.py -k "drop_duplicates" MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_join_sort.py -k "merge" MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_default.py -k "resample" echo "::endgroup::" shell: bash -l {0} ================================================ FILE: .github/actions/run-core-tests/group_4/action.yml ================================================ name: "Run core Modin tests - group 4" description: "Run core Modin tests like dataframe or groupby" inputs: runner: description: "Runner for tests" default: "python -m pytest" parallel: description: "How to run tests in parallel" default: "-n 2" runs: using: "composite" steps: - run: | echo "::group::Running tests (group 4)..." ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_rolling.py \ modin/tests/pandas/test_expanding.py \ modin/tests/pandas/test_groupby.py \ modin/tests/pandas/test_reshape.py \ modin/tests/pandas/test_general.py echo "::endgroup::" shell: bash -l {0} - run: | echo "::group::Running concat tests (group 4)..." 
${{ inputs.runner }} modin/tests/pandas/test_concat.py # Ray and Dask versions fail with -n 2 echo "::endgroup::" shell: bash -l {0} ================================================ FILE: .github/actions/upload-coverage/action.yml ================================================ name: Upload Coverage description: Upload coverage files runs: using: "composite" steps: - run: | COVERAGE_UUID=$(python3 -c "import uuid; print(uuid.uuid4())") mv .coverage .coverage.${COVERAGE_UUID} echo "COVERAGE_UUID=${COVERAGE_UUID}" >> $GITHUB_ENV id: coverage-uuid shell: bash - uses: actions/upload-artifact@v4 with: name: coverage-data-${{ env.COVERAGE_UUID }} path: .coverage* include-hidden-files: true ================================================ FILE: .github/dependabot.yaml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "monthly" groups: github-actions: patterns: - "*" ================================================ FILE: .github/stale.yml ================================================ # Number of days of inactivity before an Issue or Pull Request becomes stale daysUntilStale: 365 # Number of days of inactivity before an Issue or Pull Request with the stale label is closed. # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale. daysUntilClose: 7 # Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled) onlyLabels: [] # Issues or Pull Requests with these labels will never be considered stale.
Set to `[]` to disable exemptLabels: [] # Set to true to ignore issues in a project (defaults to false) exemptProjects: false # Set to true to ignore issues in a milestone (defaults to false) exemptMilestones: false # Set to true to ignore issues with an assignee (defaults to false) exemptAssignees: false # Label to use when marking as stale staleLabel: stale # Comment to post when marking as stale. Set to `false` to disable markComment: > This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs within the next 7 days. Thank you for your contributions. # Comment to post when removing the stale label. # unmarkComment: > # Your comment here. # Comment to post when closing a stale Issue or Pull Request. closeComment: > Closing as stale. ================================================ FILE: .github/workflows/ci-notebooks.yml ================================================ name: ci-notebooks on: pull_request: paths: - modin/** - examples/tutorial/** - .github/workflows/ci-notebooks.yml - setup.cfg - setup.py - requirements/env_unidist_linux.yml concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. 
group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: test-tutorial-notebooks: defaults: run: shell: bash -l {0} name: test tutorial notebooks runs-on: ubuntu-latest strategy: matrix: execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only if: matrix.execution != 'pandas_on_unidist' - uses: ./.github/actions/mamba-env with: environment-file: requirements/env_unidist_linux.yml activate-environment: modin_on_unidist if: matrix.execution == 'pandas_on_unidist' - name: Cache datasets uses: actions/cache@v4 with: path: taxi.csv # update cache only if notebooks require it to be changed key: taxi-csv-dataset-${{ hashFiles('examples/tutorial/jupyter/**') }} # replace modin with . in the tutorial requirements file for `pandas_on_ray` and # `pandas_on_dask` since we need Modin built from sources - run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt if: matrix.execution != 'pandas_on_unidist' # install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask` # Override modin-spreadsheet install for now - run: | pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 if: matrix.execution != 'pandas_on_unidist' # Build Modin from sources for `pandas_on_unidist` - run: pip install -e . if: matrix.execution == 'pandas_on_unidist' # install test dependencies # NOTE: If you are changing the set of packages installed here, make sure that # the dev requirements match them. 
- run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat if: matrix.execution != 'pandas_on_unidist' - run: pip install flake8-print jupyter nbformat nbconvert if: matrix.execution == 'pandas_on_unidist' - run: pip list if: matrix.execution != 'pandas_on_unidist' - run: | conda info conda list if: matrix.execution == 'pandas_on_unidist' # setup kernel configuration for `pandas_on_unidist` execution with mpi backend - run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py if: matrix.execution == 'pandas_on_unidist' - run: jupyter kernelspec list - run: | black --check --diff examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py black --check --diff examples/tutorial/jupyter/execution/test/utils.py - run: | flake8 --enable=T examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py flake8 --enable=T examples/tutorial/jupyter/execution/test/utils.py - run: python -m pytest examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py ================================================ FILE: .github/workflows/ci-required.yml ================================================ name: ci-required on: pull_request concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. 
group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: check-pr-title: runs-on: ubuntu-latest steps: - uses: Slashgear/action-check-pr-title@v4.3.0 with: # NOTE: If you change the allowed prefixes here, update # the documentation about them in /docs/development/contributing.rst regexp: '^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:' build-docs: name: build docs runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: actions/setup-python@v5 with: python-version: "3.9" architecture: "x64" cache: "pip" cache-dependency-path: '**/requirements-doc.txt' - run: pip install -r docs/requirements-doc.txt - run: cd docs && sphinx-build -T -E -W -b html . build lint-pydocstyle: name: lint (pydocstyle) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only # The `numpydoc` version here MUST match the versions in the dev requirements files. - run: pip install pytest pytest-cov pydocstyle numpydoc==1.6.0 - run: python -m pytest scripts/test - run: pip install -e ".[all]" - run: | python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \ modin/pandas/dataframe.py modin/pandas/series.py \ modin/pandas/groupby.py \ modin/pandas/series_utils.py modin/pandas/general.py \ modin/pandas/plotting.py modin/pandas/utils.py \ modin/pandas/iterator.py modin/pandas/indexing.py \ - run: python scripts/doc_checker.py modin/core/dataframe - run: python scripts/doc_checker.py modin/core/execution/dask - run: | python scripts/doc_checker.py \ modin/pandas/accessor.py modin/pandas/general.py \ modin/pandas/groupby.py modin/pandas/indexing.py \ modin/pandas/iterator.py modin/pandas/plotting.py \ modin/pandas/series_utils.py modin/pandas/utils.py \ modin/pandas/base.py \ modin/pandas/io.py \ asv_bench/benchmarks/utils \ asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \ 
asv_bench/benchmarks/scalability/__init__.py \ modin/core/io \ modin/pandas/series.py \ modin/core/execution/python \ modin/pandas/dataframe.py \ modin/config/__init__.py \ modin/config/__main__.py \ modin/config/envvars.py \ modin/config/pubsub.py - run: python scripts/doc_checker.py modin/distributed - run: python scripts/doc_checker.py modin/utils.py - run: python scripts/doc_checker.py modin/experimental/sklearn - run: | python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \ modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \ modin/experimental/xgboost/xgboost_ray.py - run: python scripts/doc_checker.py modin/core/execution/ray - run: | python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \ modin/core/execution/dispatching/factories/dispatcher.py \ - run: python scripts/doc_checker.py scripts/doc_checker.py - run: | python scripts/doc_checker.py modin/experimental/pandas/io.py \ modin/experimental/pandas/__init__.py - run: python scripts/doc_checker.py modin/core/storage_formats/base - run: python scripts/doc_checker.py modin/core/storage_formats/pandas - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py - run: python scripts/doc_checker.py modin/logging lint-black-isort: name: lint (black and isort) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only - run: pip install black>=24.1.0 isort>=5.12 # NOTE: keep the black command here in sync with the pre-commit hook in # /contributing/pre-commit - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py - run: isort . 
--check-only ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci on: pull_request: paths: # NOTE: keep these paths in sync with the paths that trigger the # fuzzydata Github Actions in .github/workflows/fuzzydata-test.yml - .github/workflows/** - .github/actions/** - '!.github/workflows/push-to-main.yml' - asv_bench/** - modin/** - requirements/** - scripts/** - environment-dev.yml - requirements-dev.txt - setup.cfg - setup.py - versioneer.py push: schedule: - cron: "30 2 * * WED" - cron: "30 2 * * THU" concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: python-filter: runs-on: ubuntu-latest outputs: python-version: ${{ steps.choose.outputs.python-version }} steps: - id: choose run: | if [[ "${{ github.event.schedule }}" = "30 2 * * WED" ]] then echo "python-version=3.10" >> "$GITHUB_OUTPUT" elif [[ "${{ github.event.schedule }}" = "30 2 * * THU" ]] then echo "python-version=3.11" >> "$GITHUB_OUTPUT" else echo "python-version=3.9" >> "$GITHUB_OUTPUT" fi lint-mypy: needs: [python-filter] name: lint (mypy) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only with: python-version: ${{ needs.python-filter.outputs.python-version }} - run: pip install -r requirements-dev.txt - run: mypy --config-file mypy.ini lint-flake8: needs: [python-filter] name: lint (flake8) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only with: python-version: ${{ needs.python-filter.outputs.python-version }} # NOTE: If you are changing the set of packages installed here, make sure that # the dev requirements match them. 
- run: pip install flake8 flake8-print flake8-no-implicit-concat # NOTE: keep the flake8 command here in sync with the pre-commit hook in # /contributing/pre-commit - run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py test-api-and-no-engine: needs: [python-filter] name: Test API, headers and no-engine mode runs-on: ubuntu-latest defaults: run: shell: bash -l {0} steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: requirements/requirements-no-engine.yml python-version: ${{ needs.python-filter.outputs.python-version }} - run: python -m pytest modin/tests/pandas/test_api.py - run: python -m pytest modin/tests/test_executions_api.py - run: python -m pytest modin/tests/test_headers.py - run: python -m pytest modin/tests/core/test_dispatcher.py::test_add_option - uses: ./.github/actions/upload-coverage test-clean-install: needs: [lint-flake8, python-filter] strategy: matrix: os: - ubuntu - windows runs-on: ${{ matrix.os }}-latest defaults: run: shell: bash -l {0} name: test-clean-install-${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only with: python-version: ${{ needs.python-filter.outputs.python-version }} - run: python -m pip install -e ".[all]" - name: Ensure Ray and Dask engines start up run: | MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" - name: Ensure MPI engine start up # Install a working MPI implementation beforehand so mpi4py can link to it run: | sudo apt-get update sudo apt-get install software-properties-common sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu jammy main universe restricted multiverse" sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe restricted multiverse" sudo add-apt-repository "deb http://security.ubuntu.com/ubuntu jammy-security main universe restricted multiverse" 
sudo apt-get update sudo apt-get install libmpich-dev=4.0-3 libmpich12=4.0-3 mpich=4.0-3 python -m pip install -e ".[mpi]" # mpi4py 4.1 does not work with the mpich versions above. # TODO(https://github.com/modin-project/modin/issues/7615): figure out # the correct libmpich versions for mpi4py >= 4.1 python -m pip install "mpi4py<4.1" MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" if: matrix.os == 'ubuntu' test-internals: needs: [lint-flake8, python-filter] runs-on: ubuntu-latest defaults: run: shell: bash -l {0} name: test-internals steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{ needs.python-filter.outputs.python-version }} - name: Internals tests run: python -m pytest modin/tests/core/test_dispatcher.py - run: python -m pytest modin/tests/config - run: python -m pytest modin/tests/test_envvar_catcher.py - run: python -m pytest modin/tests/core/storage_formats/base/test_internals.py - run: python -m pytest modin/tests/core/storage_formats/pandas/test_internals.py - run: python -m pytest modin/tests/test_envvar_npartitions.py - run: python -m pytest modin/tests/test_utils.py - run: python -m pytest asv_bench/test/test_utils.py - run: python -m pytest modin/tests/interchange/dataframe_protocol/base - run: python -m pytest modin/tests/test_dataframe_api_standard.py - run: python -m pytest modin/tests/test_logging.py - run: python -m pytest modin/tests/test_metrics.py - run: python -m pytest modin/tests/pandas/extensions - uses: ./.github/actions/upload-coverage test-defaults: needs: [lint-flake8, python-filter] runs-on: ubuntu-latest defaults: run: shell: bash -l {0} strategy: matrix: execution: [BaseOnPython] env: MODIN_TEST_DATASET_SIZE: "small" name: Test ${{ matrix.execution }} execution, Python ${{ needs.python-filter.outputs.python-version }}" steps: - uses: actions/checkout@v4 - uses: 
./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{ needs.python-filter.outputs.python-version }} - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - name: xgboost tests run: | # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. # Per the thread https://github.com/conda-forge/miniforge/issues/513, # remove unused conda packages and caches to avoid `Found incorrect # download: joblib` error from mamba. mamba clean --all mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge python -m pytest modin/tests/experimental/xgboost/test_default.py --execution=${{ matrix.execution }} - run: python -m pytest -n 2 modin/tests/core/storage_formats/base/test_internals.py --execution=${{ matrix.execution }} - uses: ./.github/actions/run-core-tests with: runner: python -m pytest --execution=${{ matrix.execution }} - uses: ./.github/actions/upload-coverage test-asv-benchmarks: if: github.event_name == 'pull_request' needs: [lint-flake8] runs-on: ubuntu-latest defaults: run: shell: bash -l {0} env: MODIN_ENGINE: ray MODIN_MEMORY: 1000000000 MODIN_TEST_DATASET_SIZE: small name: test-asv-benchmarks steps: - uses: actions/checkout@v4 with: fetch-depth: 1 - uses: conda-incubator/setup-miniconda@v3 with: auto-activate-base: true activate-environment: "" miniforge-variant: Miniforge3 miniforge-version: latest use-mamba: true - name: Running benchmarks run: | git remote add upstream https://github.com/modin-project/modin.git git fetch upstream if git diff upstream/main --name-only | grep -q "^asv_bench/"; then cd asv_bench mamba env create -f ../environment-dev.yml conda activate modin pip install .. 
asv machine --yes # check Modin on Ray asv run --quick --dry-run --python=same --strict --show-stderr --launch-method=spawn \ -b ^benchmarks -b ^io -b ^scalability | tee benchmarks.log # check pure pandas MODIN_ASV_USE_IMPL=pandas asv run --quick --dry-run --python=same --strict --show-stderr --launch-method=spawn \ -b ^benchmarks -b ^io | tee benchmarks.log else echo "Benchmarks did not run, no changes detected" fi if: always() - name: Publish benchmarks artifact uses: actions/upload-artifact@v4 with: name: Benchmarks log path: asv_bench/benchmarks.log include-hidden-files: true if: failure() execution-filter: # Choose which executions we want to run all tests for on a pull request. # We always test 'native' and 'python' executions completely because they # are fast, but we only test ray, dask, and unidist, if we think this pull # request is affecting how we execute with those engines specifically. runs-on: ubuntu-latest outputs: ray: ${{ steps.filter.outputs.ray }} dask: ${{ steps.filter.outputs.dask }} unidist: ${{ steps.filter.outputs.unidist }} engines: ${{ steps.engines.outputs.engines }} experimental: ${{ steps.experimental.outputs.experimental }} steps: - uses: actions/checkout@v4 - uses: dorny/paths-filter@v3 id: filter with: filters: | shared: &shared - 'modin/core/execution/dispatching/**' ray: - *shared - 'modin/core/execution/ray/**' dask: - *shared - 'modin/core/execution/dask/**' unidist: - *shared - 'modin/core/execution/unidist/**' experimental: - 'modin/experimental/**' - uses: actions/setup-python@v5 - id: engines run: | python -c "import sys, json; print('engines=' + json.dumps(['python', 'native'] + (sys.argv[1] == 'true' and ['ray'] or []) + (sys.argv[2] == 'true' and ['dask'] or []) ))" \ "${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT test-all-unidist: needs: [lint-flake8, execution-filter, python-filter] if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true' runs-on: 
ubuntu-latest defaults: run: shell: bash -l {0} strategy: matrix: python-version: [ "${{ needs.python-filter.outputs.python-version }}" ] unidist-backend: ["mpi"] env: MODIN_ENGINE: "Unidist" UNIDIST_BACKEND: ${{matrix.unidist-backend}} # Only test reading from SQL server and postgres on ubuntu for now. # Eventually, we should test on Windows, too, but we will have to set up # the servers differently. MODIN_TEST_READ_FROM_SQL_SERVER: true MODIN_TEST_READ_FROM_POSTGRES: true name: test-ubuntu (engine unidist ${{matrix.unidist-backend}}, python ${{matrix.python-version}}) services: moto: image: motoserver/moto:5.0.13 ports: - 5000:5000 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: requirements/env_unidist_linux.yml activate-environment: modin_on_unidist python-version: ${{matrix.python-version}} - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - name: Set up postgres # Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. This solution is from # https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3 run: | sudo docker pull postgres sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres - run: mpiexec -n 1 python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py - run: mpiexec -n 1 python -m pytest modin/tests/test_partition_api.py - uses: ./.github/actions/run-core-tests with: runner: mpiexec -n 1 python -m pytest parallel: "" - run: mpiexec -n 1 python -m pytest modin/tests/numpy - run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh - run: ./.github/workflows/sql_server/set_up_sql_server.sh # need an extra argument "genv" to set environment variables for mpiexec. We need # these variables to test writing to the mock s3 filesystem. 
- uses: nick-fields/retry@v3 # to avoid issues with non-stable `to_csv` tests for unidist on MPI backend. # for details see: https://github.com/modin-project/modin/pull/6776 with: timeout_minutes: 15 max_attempts: 3 command: | conda run --no-capture-output -n modin_on_unidist mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key \ -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/tests/pandas/test_io.py --verbose - run: | mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \ python -m pytest modin/tests/experimental/test_io_exp.py - run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py - run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py - run: | python -m pip install lazy_import mpiexec -n 1 python -m pytest modin/tests/pandas/integrations/ - uses: ./.github/actions/upload-coverage test-all: needs: [lint-flake8, execution-filter, python-filter] strategy: matrix: os: - ubuntu - windows python-version: [ "${{ needs.python-filter.outputs.python-version }}" ] # On push, run the tests for all engines. Otherwise, for pull requests, # only run tests for engines that depend on files changed in this PR. engine: ${{ fromJSON( (github.event_name == 'push' && '["python", "ray", "dask", "native"]') || needs.execution-filter.outputs.engines ) }} test_task: - group_1 - group_2 - group_3 - group_4 exclude: # python and native engines only have one task group that contains all the tests - engine: "python" test_task: "group_2" - engine: "native" test_task: "group_2" - engine: "python" test_task: "group_3" - engine: "native" test_task: "group_3" - engine: "python" test_task: "group_4" - engine: "native" test_task: "group_4" runs-on: ${{ matrix.os }}-latest defaults: run: shell: bash -l {0} env: MODIN_ENGINE: ${{matrix.engine}} # Only test reading from SQL server and postgres on ubuntu for now. 
# Eventually, we should test on Windows, too, but we will have to set up # the servers differently. MODIN_TEST_READ_FROM_SQL_SERVER: ${{ matrix.os == 'ubuntu' }} MODIN_TEST_READ_FROM_POSTGRES: ${{ matrix.os == 'ubuntu' }} name: test-${{ matrix.os }} (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}}) services: # Using workaround https://github.com/actions/runner/issues/822#issuecomment-1524826092 moto: # we only need moto service on Ubuntu and for group_4 task, or for native or python engine. image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4')) && 'motoserver/moto:5.0.13' || '' }} ports: - 5000:5000 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - name: Set native storage format run: echo "MODIN_STORAGE_FORMAT=Native" >> $GITHUB_ENV if: matrix.engine == 'native' - name: Limit ray memory run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV if: matrix.os == 'ubuntu' && matrix.engine == 'ray' - name: Tell Modin to use existing ray cluster run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV if: matrix.os == 'windows' && matrix.engine == 'ray' - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - name: Start local ray cluster # Try a few times to start ray to work around # https://github.com/modin-project/modin/issues/4562 uses: nick-fields/retry@v3 with: timeout_minutes: 5 max_attempts: 5 command: ray start --head --port=6379 --object-store-memory=1000000000 if: matrix.os == 'windows' && matrix.engine == 'ray' - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev if: matrix.os == 'ubuntu' - name: Set up postgres # Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. 
This solution is from # https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3 run: | sudo docker pull postgres sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres if: matrix.os == 'ubuntu' # BEGIN partitioned execution tests. We run these tests along with group 1, # or if we are on the "python" engine, which only has a single group. We # skip these tests on the "native" engine, which does not use partitions. - run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py if: matrix.engine != 'native' && (matrix.engine == 'python' || matrix.test_task == 'group_1') - run: python -m pytest modin/tests/test_partition_api.py # Skip this test for python because we do not define unwrap_partitions() # for python execution. if: matrix.engine != 'native' && matrix.engine != 'python' && matrix.test_task == 'group_1' - name: xgboost tests run: | # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge python -m pytest -n 2 \ modin/tests/experimental/xgboost/test_default.py \ modin/tests/experimental/xgboost/test_xgboost.py \ modin/tests/experimental/xgboost/test_dmatrix.py if: matrix.engine != 'native' && matrix.os != 'windows' && (matrix.engine == 'python' || matrix.test_task == 'group_1') - run: python -m pytest -n 2 modin/tests/experimental/test_pipeline.py if: matrix.engine != 'native' && (matrix.engine == 'python' || matrix.test_task == 'group_1') # END partitioned execution tests. # BEGIN test groups. # Run all the tests in the corresponding group for this instance of the # test matrix. For example, if we are in the matrix's 'group_4', run the # tests for 'group_4'. For each of 'native' and 'python' engines, we run # all tests in a single job, so we ignore the grouping. 
- uses: ./.github/actions/run-core-tests/group_1 with: # When running with Ray engine on Windows using 2 pytest workers tests are failing in CI. # See https://github.com/modin-project/modin/issues/7387. parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }} if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_1' - uses: ./.github/actions/run-core-tests/group_2 with: # When running with Ray engine on Windows using 2 pytest workers tests are failing in CI. # See https://github.com/modin-project/modin/issues/7387. parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }} if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_2' - uses: ./.github/actions/run-core-tests/group_3 with: # When running with Ray engine on Windows using 2 pytest workers tests are failing in CI. # See https://github.com/modin-project/modin/issues/7387. parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }} if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_3' - uses: ./.github/actions/run-core-tests/group_4 with: # When running with Ray engine on Windows using 2 pytest workers tests are failing in CI. # See https://github.com/modin-project/modin/issues/7387. parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }} if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' - run: python -m pytest -n 2 modin/tests/numpy # Native execution does not support the modin Numpy API. if: matrix.engine == 'python' || matrix.test_task == 'group_4' # END test groups. # BEGIN some tests that we run along with group 4 for engines other than # 'native' and 'python'. 'native' and 'python' jobs will run these tests # along with all other tests in a single group. 
- run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4') - run: ./.github/workflows/sql_server/set_up_sql_server.sh if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4') # Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail. - run: python -m pytest modin/tests/pandas/test_io.py --verbose timeout-minutes: 60 if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/experimental/test_io_exp.py if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/polars/test_dataframe.py - run: | python -m pip install lazy_import python -m pytest modin/tests/pandas/integrations/ if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4' # END tests that run on group 4, or in the single group for 'native' and # 'python' engines. - uses: ./.github/actions/upload-coverage - name: Stop local ray cluster run: ray stop if: matrix.os == 'windows' && matrix.engine == 'ray' test-sanity: # The "sanity" tests run on each pull request to test that a subset of the # full tests work with the slower engines (ray, dask, and unidist-MPI).
needs: [lint-flake8, execution-filter, python-filter] # If we don't need to run any sanity tests, the job matrix that we generate # here gives a single job with all the matrix fields empty (that is, os, # execution, etc. are not set, so we treat them as ""). # So, if the matrix is going to be empty, we need to skip this job # completely. This bizarre behavior is not in the official documentation # of GitHub Actions matrices, but someone does mention it here: # https://stackoverflow.com/a/77118991 if: | github.event_name == 'pull_request' && ( needs.execution-filter.outputs.ray != 'true' || needs.execution-filter.outputs.dask != 'true' || needs.execution-filter.outputs.unidist != 'true' ) strategy: matrix: os: - ubuntu - windows python-version: [ "${{ needs.python-filter.outputs.python-version }}" ] running-all-ray-tests: [ "${{ needs.execution-filter.outputs.ray }}" ] running-all-dask-tests: [ "${{needs.execution-filter.outputs.dask}}" ] running-all-unidist-tests: [ "${{needs.execution-filter.outputs.unidist}}" ] execution: [ray, dask, unidist] # If we're going to run all ray tests because we've detected a # change to the ray engine, we don't need to run these sanity tests # on ray. Likewise for dask and unidist.
exclude: - running-all-ray-tests: 'true' execution: ray - running-all-dask-tests: 'true' execution: dask - running-all-unidist-tests: 'true' execution: unidist runs-on: ${{ matrix.os }}-latest defaults: run: shell: bash -l {0} env: MODIN_ENGINE: ${{ matrix.execution }} UNIDIST_BACKEND: "mpi" PARALLEL: ${{ matrix.execution != 'unidist' && matrix.os != 'windows' && '-n 2' || '' }} PYTEST_COMMAND: >- ${{ ( (matrix.execution == 'ray' || matrix.execution == 'dask') && 'python -m pytest' ) || ( matrix.execution == 'unidist' && 'mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest' ) || 'UNKNOWN_PYTEST_COMMAND' }} name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution }}, python ${{matrix.python-version}}) services: moto: image: ${{ matrix.os != 'windows' && 'motoserver/moto:5.0.13' || '' }} ports: - 5000:5000 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: ${{ matrix.os == 'ubuntu' && matrix.execution == 'unidist' && 'requirements/env_unidist_linux.yml' || matrix.os == 'windows' && matrix.execution == 'unidist' && 'requirements/env_unidist_win.yml' || 'environment-dev.yml' }} activate-environment: ${{ matrix.execution == 'unidist' && 'modin_on_unidist' || 'modin' }} python-version: ${{matrix.python-version}} - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev if: matrix.os != 'windows' - name: Limit ray memory run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV if: matrix.os != 'windows' && matrix.execution == 'ray' - name: Tell Modin to use existing ray cluster run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV if: matrix.os == 'windows' && matrix.execution == 'ray' - name: Start local ray cluster # Try a few times to start ray to work around # https://github.com/modin-project/modin/issues/4562 uses: nick-fields/retry@v3 with: timeout_minutes: 5 max_attempts: 5 
command: ray start --head --port=6379 --object-store-memory=1000000000 if: matrix.os == 'windows' && matrix.execution == 'ray' - run: MODIN_BENCHMARK_MODE=True $PYTEST_COMMAND modin/tests/pandas/internals/test_benchmark_mode.py - run: $PYTEST_COMMAND $PARALLEL modin/tests/test_partition_api.py - run: $PYTEST_COMMAND modin/tests/pandas/extensions - name: xgboost tests run: | # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge $PYTEST_COMMAND $PARALLEL \ modin/tests/experimental/xgboost/test_default.py \ modin/tests/experimental/xgboost/test_xgboost.py \ modin/tests/experimental/xgboost/test_dmatrix.py if: matrix.os != 'windows' && needs.execution-filter.outputs.experimental == 'true' - run: $PYTEST_COMMAND $PARALLEL modin/tests/experimental/test_pipeline.py if: matrix.os != 'windows' && matrix.execution != 'unidist' && needs.execution-filter.outputs.experimental == 'true' - name: "test DF: binary, default, iter" run: | $PYTEST_COMMAND $PARALLEL \ modin/tests/pandas/dataframe/test_binary.py \ modin/tests/pandas/dataframe/test_default.py \ modin/tests/pandas/dataframe/test_iter.py if: matrix.os != 'windows' - name: "test DF: reduce, udf, window, pickle" run: | $PYTEST_COMMAND $PARALLEL \ modin/tests/pandas/dataframe/test_reduce.py \ modin/tests/pandas/dataframe/test_udf.py \ modin/tests/pandas/dataframe/test_window.py \ modin/tests/pandas/dataframe/test_pickle.py if: matrix.os != 'windows' - run: $PYTEST_COMMAND modin/tests/pandas/test_series.py if: matrix.execution == 'ray' - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_series.py if: matrix.execution != 'ray' - run: $PYTEST_COMMAND modin/tests/pandas/dataframe/test_map_metadata.py if: matrix.execution == 'ray' - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/dataframe/test_map_metadata.py if: matrix.execution != 'ray' - name: "test rolling, 
expanding, reshape, general, concat" run: | $PYTEST_COMMAND $PARALLEL \ modin/tests/pandas/test_rolling.py \ modin/tests/pandas/test_expanding.py \ modin/tests/pandas/test_reshape.py \ modin/tests/pandas/test_general.py \ modin/tests/pandas/test_concat.py if: matrix.os != 'windows' - run: $PYTEST_COMMAND $PARALLEL modin/tests/numpy - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose if: matrix.execution != 'unidist' - uses: nick-fields/retry@v3 # to avoid issues with non-stable `to_csv` tests for unidist on MPI backend. # for details see: https://github.com/modin-project/modin/pull/6776 with: timeout_minutes: 15 max_attempts: 3 command: conda run --no-capture-output -n modin_on_unidist $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose if: matrix.execution == 'unidist' - run: $PYTEST_COMMAND modin/tests/experimental/test_io_exp.py - run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/test_general.py - run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py - name: Stop local ray cluster run: ray stop if: matrix.os == 'windows' && matrix.execution == 'ray' - uses: ./.github/actions/upload-coverage test-experimental: needs: [lint-flake8, python-filter] runs-on: ubuntu-latest defaults: run: shell: bash -l {0} env: MODIN_ENGINE: "python" MODIN_EXPERIMENTAL: "True" name: test experimental services: moto: image: motoserver/moto:5.0.13 ports: - 5000:5000 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{ needs.python-filter.outputs.python-version }} - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - run: python -m pytest -n 2 modin/tests/pandas/dataframe/test_map_metadata.py - run: python -m pytest -n 2 modin/tests/pandas/test_series.py # Do not add 
parallelism (`-n` argument) here - it will cause mock S3 service to fail. - run: python -m pytest modin/tests/pandas/test_io.py --verbose - uses: ./.github/actions/upload-coverage test-spreadsheet: needs: [lint-flake8, python-filter] runs-on: ubuntu-latest defaults: run: shell: bash -l {0} strategy: matrix: python-version: [ "${{ needs.python-filter.outputs.python-version }}" ] engine: ["ray", "dask"] env: MODIN_EXPERIMENTAL: "True" MODIN_ENGINE: ${{matrix.engine}} name: test-spreadsheet (engine ${{matrix.engine}}, python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py test-native-dataframe-interoperability: needs: [ lint-flake8] if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' }} runs-on: ubuntu-latest defaults: run: shell: bash -l {0} strategy: matrix: python-version: ["3.9"] env: # Test interoperability between PandasOnPython dataframes/series and # native dataframes/series. 
MODIN_ENGINE: "Python" name: test-native-dataframe-interoperability python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - run: python -m pytest modin/tests/pandas/native_df_interoperability/ -n 2 - uses: ./.github/actions/upload-coverage merge-coverage-artifacts: needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity, test-native-dataframe-interoperability] if: always() # we need to run it regardless of some job being skipped, like in PR runs-on: ubuntu-latest defaults: run: shell: bash -l {0} steps: - name: Merge Artifacts uses: actions/upload-artifact/merge@v4 with: name: coverage-data pattern: coverage-data-* include-hidden-files: true delete-merged: true upload-coverage: needs: [merge-coverage-artifacts, python-filter] if: always() # we need to run it regardless of some job being skipped, like in PR runs-on: ubuntu-latest defaults: run: shell: bash -l {0} steps: - uses: actions/checkout@v4 - uses: ./.github/actions/python-only with: python-version: ${{ needs.python-filter.outputs.python-version }} - name: Download coverage data uses: actions/download-artifact@v4 with: name: coverage-data - run: pip install coverage - name: Combine coverage run: python -m coverage combine - name: Generate coverage report in xml format run: python -m coverage xml - uses: codecov/codecov-action@v4 with: fail_ci_if_error: ${{ github.event_name == 'push' }} # do not care about uploads in PR token: ${{ secrets.CODECOV_TOKEN }} # this token is available at https://app.codecov.io/account/github/modin-project/ ================================================ FILE: .github/workflows/codeql/codeql-config.yml ================================================ name: "Modin CodeQL config" paths: - modin/** ================================================ FILE: .github/workflows/codeql.yml 
================================================ name: "CodeQL" on: push: branches: [ "main" ] pull_request: branches: [ "main" ] concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ python ] steps: - name: Checkout uses: actions/checkout@v4 - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality config-file: ./.github/workflows/codeql/codeql-config.yml - name: Autobuild uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" ================================================ FILE: .github/workflows/fuzzydata-test.yml ================================================ name: fuzzy on: pull_request: paths: # NOTE: keep these paths in sync with the paths that trigger the CI Github # Actions in .github/workflows/ci.yml - .github/workflows/** - '!.github/workflows/push-to-main.yml' - asv_bench/** - modin/** - requirements/** - scripts/** - environment-dev.yml - requirements-dev.txt - setup.cfg - setup.py - versioneer.py concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. 
group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: test-fuzzydata: runs-on: ubuntu-latest defaults: run: shell: bash -l {0} strategy: matrix: python-version: ["3.9"] engine: ["ray", "dask"] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}}) run: python -m pytest modin/tests/experimental/test_fuzzydata.py -Wignore::UserWarning --log-file=/tmp/fuzzydata-test-wf-${{matrix.engine}}/run.log --log-file-level=INFO env: MODIN_ENGINE: ${{matrix.engine}} - uses: actions/upload-artifact@v4 if: success() || failure() with: name: fuzzydata-test-workflow-${{matrix.engine}} path: /tmp/fuzzydata-test-wf-${{matrix.engine}}/* # Must match output dir in test_fuzzydata.py if-no-files-found: error include-hidden-files: true ================================================ FILE: .github/workflows/publish-to-pypi.yml ================================================ name: Publish Modin wheel to PyPI on: schedule: - cron: "42 0 * * WED" push: tags: - '*' workflow_dispatch: jobs: build-n-publish: name: Build and publish Modin wheel to PyPI environment: release runs-on: ubuntu-latest permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true - name: Checkout latest git tag run: git checkout $(git describe --tags "$(git rev-list --tags --max-count=1)") if: github.event_name == 'push' - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.9.x" - name: Install/update tools run: python3 -m pip install --upgrade build wheel - name: Build a pure Python wheel run: python3 setup.py sdist bdist_wheel - uses: actions/upload-artifact@v4 with: name: modin-wheel-and-source-tarball path: ./dist/ 
include-hidden-files: true - name: Publish Modin wheel to PyPI if: github.event_name == 'push' uses: pypa/gh-action-pypi-publish@release/v1 ================================================ FILE: .github/workflows/push-to-main.yml ================================================ name: push-to-main on: push: branches: - main concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true jobs: test-ray-master: runs-on: ubuntu-latest defaults: run: # `shell: bash -l {0}` - special way to activate modin environment shell: bash -l {0} services: moto: image: motoserver/moto:5.0.13 ports: - 5000:5000 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml - name: install Ray nightly build # Use --force-reinstall to always reinstall ray and its dependencies. 
# botocore isn't compatible with urllib3>=2; see #6094 for details run: pip install --force-reinstall "urllib3<2" https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl - name: Conda environment run: | conda info conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - name: Run parallelizable Modin Tests run: > python -m pytest -n 2 modin/tests/pandas/dataframe/test_binary.py modin/tests/pandas/dataframe/test_default.py modin/tests/pandas/dataframe/test_indexing.py modin/tests/pandas/dataframe/test_iter.py modin/tests/pandas/dataframe/test_join_sort.py modin/tests/pandas/dataframe/test_map_metadata.py modin/tests/pandas/dataframe/test_reduce.py modin/tests/pandas/dataframe/test_udf.py modin/tests/pandas/dataframe/test_window.py modin/tests/pandas/test_series.py modin/tests/numpy/test_array.py modin/tests/numpy/test_array_creation.py modin/tests/numpy/test_array_arithmetic.py modin/tests/numpy/test_array_axis_functions.py modin/tests/numpy/test_array_logic.py modin/tests/numpy/test_array_linalg.py modin/tests/numpy/test_array_indexing.py modin/tests/numpy/test_array_math.py modin/tests/numpy/test_array_shaping.py modin/tests/pandas/test_rolling.py modin/tests/pandas/test_expanding.py modin/tests/pandas/test_concat.py modin/tests/pandas/test_groupby.py modin/tests/pandas/test_reshape.py modin/tests/pandas/test_general.py - name: Run non-parallelizable Modin Tests run: > python -m pytest modin/tests/pandas/test_io.py modin/tests/experimental/test_io_exp.py test-docs: runs-on: ubuntu-latest defaults: run: shell: bash -l {0} name: test docs steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml - run: sudo apt update && sudo apt install -y libhdf5-dev - name: Docstring URL validity check run: python -m pytest modin/tests/test_docstring_urls.py ================================================ FILE: .github/workflows/sql_server/set_up_sql_server.sh 
================================================ # This script sets up a SQL server listening at 0.0.0.0:1433. # If any step fails, we can't set up a valid SQL server for unit tests. set -e # Pull the 2019 SQL server docker container image by following: # https://docs.microsoft.com/en-us/sql/linux/quickstart-install-connect-docker?view=sql-server-ver15&pivots=cs1-powershell#pullandrun2019 sudo docker pull mcr.microsoft.com/mssql/server:2019-latest sudo docker run -d --name example_sql_server -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=Strong.Pwd-123' -p 1433:1433 mcr.microsoft.com/mssql/server:2019-latest # Wait 10 seconds because if we don't the server typically will not be ready # to accept connections by the time we want to make them. sleep 10 ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST scripts/gh-users-cache.json # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ docs/flow/modin/configs_help.csv # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/ .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf *.DS_Store # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # vscode settings .vscode/ # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Cscope and Tags tags cscope.files cscope.out # PYTest Benchmarks .benchmarks/ # Dask workspace dask-worker-space/ node_modules # 
Asv stuff asv_bench/.asv/ asv_bench/modin/ # Sublime stuff *.sublime-workspace *.sublime-project ================================================ FILE: .readthedocs.yaml ================================================ # .readthedocs.yaml # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-20.04 tools: python: "3.9" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py formats: all python: install: - requirements: docs/requirements-doc.txt ================================================ FILE: CODEOWNERS ================================================ # These owners will be the default owners for everything in # the repo unless a later match takes precedence. * @modin-project/modin-core @devin-petersohn @mvashishtha @RehanSD @YarShev @vnlitvinov @anmyachev @dchigarev ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at conduct@gr-oss.io. 
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. # Certain code used and distributed in this package is forked from pandas # (https://github.com/pandas-dev/pandas). The pandas LICENSE # below applies to those certain forked components in this project: BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. Copyright (c) 2011-2025, Open source contributors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: LICENSE_HEADER ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include versioneer.py include modin/_version.py include modin/tests/pandas/data/*.csv ================================================ FILE: NOTICE ================================================ Modin Copyright (c) 2018-2024 Modin Developers. ================================================ FILE: README.md ================================================

Scale your pandas workflows by changing one line of code

|

Dev Community & Support

|

Forums

|

Socials

|

Docs

| |:---: | :---: | :---: | :---: | | [![Slack](https://img.shields.io/badge/Slack-4A154B?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA) | [![Stack Overflow](https://img.shields.io/badge/-Stackoverflow-FE7A16?style=for-the-badge&logo=stack-overflow&logoColor=white)](https://stackoverflow.com/questions/tagged/modin) | Twitter Follow | |

PyPI version

### What is Modin? Modin is a drop-in replacement for [pandas](https://github.com/pandas-dev/pandas). While pandas is single-threaded, Modin lets you instantly speed up your workflows by scaling pandas so it uses all of your cores. Modin works especially well on larger datasets, where pandas becomes painfully slow or runs [out of memory](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html). Also, Modin comes with the [additional APIs](https://modin.readthedocs.io/en/latest/usage_guide/advanced_usage/index.html#additional-apis) to improve user experience. By simply replacing the import statement, Modin offers users effortless speed and scale for their pandas workflows: In the GIFs below, Modin (left) and pandas (right) perform *the same pandas operations* on a 2GB dataset. The only difference between the two notebook examples is the import statement.
The charts below show the speedup you get by replacing pandas with Modin based on the examples above. The example notebooks can be found [here](examples/jupyter). To learn more about the speedups you could get with Modin and try out some examples on your own, check out our [10-minute quickstart guide](https://modin.readthedocs.io/en/latest/getting_started/quickstart.html)! ### Installation #### From PyPI Modin can be installed with `pip` on Linux, Windows and MacOS: ```bash pip install "modin[all]" # (Recommended) Install Modin with Ray and Dask engines. ``` If you want to install Modin with a specific engine, we recommend: ```bash pip install "modin[ray]" # Install Modin dependencies and Ray. pip install "modin[dask]" # Install Modin dependencies and Dask. pip install "modin[mpi]" # Install Modin dependencies and MPI through unidist. ``` To get Modin on MPI through unidist (as of unidist 0.5.0) fully working it is required to have a working MPI implementation installed beforehand. Otherwise, installation of `modin[mpi]` may fail. Refer to the [Installing with pip](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-pip) section of the unidist documentation for more details about installation. **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: `ray` instead of `ray[default]`. This means that the dashboard and cluster launcher are no longer installed by default. If you need those, consider installing `ray[default]` along with `modin[ray]`. Modin automatically detects which engine(s) you have installed and uses that for scheduling computation. #### From conda-forge Installing from [conda forge](https://github.com/conda-forge/modin-feedstock) using `modin-all` will install Modin and three engines: [Ray](https://github.com/ray-project/ray), [Dask](https://github.com/dask/dask) and [MPI through unidist](https://github.com/modin-project/unidist). 
```bash conda install -c conda-forge modin-all ``` Each engine can also be installed individually (and also as a combination of several engines): ```bash conda install -c conda-forge modin-ray # Install Modin dependencies and Ray. conda install -c conda-forge modin-dask # Install Modin dependencies and Dask. conda install -c conda-forge modin-mpi # Install Modin dependencies and MPI through unidist. ``` **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: `ray-core` instead of `ray-default`. This means that the dashboard and cluster launcher are no longer installed by default. If you need those, consider installing `ray-default` along with `modin-ray`. Refer to the [Installing with conda](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-conda) section of the unidist documentation for more details on how to install a specific MPI implementation to run on. To speed up conda installation we recommend using libmamba solver. To do this install it in a base environment: ```bash conda install -n base conda-libmamba-solver ``` and then use it during installation either like: ```bash conda install -c conda-forge modin-ray --experimental-solver=libmamba ``` or starting from conda 22.11 and libmamba solver 22.12 versions: ```bash conda install -c conda-forge modin-ray --solver=libmamba ``` #### Choosing a Compute Engine If you want to choose a specific compute engine to run on, you can set the environment variable `MODIN_ENGINE` and Modin will do computation with that engine: ```bash export MODIN_ENGINE=ray # Modin will use Ray export MODIN_ENGINE=dask # Modin will use Dask export MODIN_ENGINE=unidist # Modin will use Unidist ``` If you want to choose the Unidist engine, you should set the additional environment variable ``UNIDIST_BACKEND``. 
Currently, Modin only supports MPI through unidist: ```bash export UNIDIST_BACKEND=mpi # Unidist will use MPI backend ``` This can also be done within a notebook/interpreter before you import Modin: ```python import modin.config as modin_cfg import unidist.config as unidist_cfg modin_cfg.Engine.put("ray") # Modin will use Ray modin_cfg.Engine.put("dask") # Modin will use Dask modin_cfg.Engine.put('unidist') # Modin will use Unidist unidist_cfg.Backend.put('mpi') # Unidist will use MPI backend ``` _Note: You should not change the engine after your first operation with Modin as it will result in undefined behavior._ #### Which engine should I use? On Linux, MacOS, and Windows you can install and use either Ray, Dask or MPI through unidist. No special knowledge is required to use any of these engines, as Modin abstracts away all of the complexity, so feel free to pick any of them! ### Pandas API Coverage

| pandas Object | Modin's Ray Engine Coverage | Modin's Dask Engine Coverage | Modin's Unidist Engine Coverage | |-------------------|:------------------------------------------------------------------------------------:|:---------------:|:---------------:| | `pd.DataFrame` | | | | | `pd.Series` | | | | `pd.read_csv` | ✅ | ✅ | ✅ | | `pd.read_table` | ✅ | ✅ | ✅ | | `pd.read_parquet` | ✅ | ✅ | ✅ | | `pd.read_sql` | ✅ | ✅ | ✅ | | `pd.read_feather` | ✅ | ✅ | ✅ | | `pd.read_excel` | ✅ | ✅ | ✅ | | `pd.read_json` | [✳️](https://github.com/modin-project/modin/issues/554) | [✳️](https://github.com/modin-project/modin/issues/554) | [✳️](https://github.com/modin-project/modin/issues/554) | | `pd.read_` | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) |

Some pandas APIs are easier to implement than others, so if something is missing feel free to open an issue! ### More about Modin For the complete documentation on Modin, visit our [ReadTheDocs](https://modin.readthedocs.io/en/latest/index.html) page. #### Scale your pandas workflow by changing a single line of code. _Note: In local mode (without a cluster), Modin will create and manage a local (Dask or Ray) cluster for the execution._ To use Modin, you do not need to specify how to distribute the data, or even know how many cores your system has. In fact, you can continue using your previous pandas notebooks while experiencing a considerable speedup from Modin, even on a single machine. Once you've changed your import statement, you're ready to use Modin just like you would with pandas! #### Faster pandas, even on your laptop The `modin.pandas` DataFrame is an extremely light-weight parallel DataFrame. Modin transparently distributes the data and computation so that you can continue using the same pandas API while working with more data faster. Because it is so light-weight, Modin provides speed-ups of up to 4x on a laptop with 4 physical cores. In pandas, you are only able to use one core at a time when you are doing computation of any kind. With Modin, you are able to use all of the CPU cores on your machine. Even with a traditionally synchronous task like `read_csv`, we see large speedups by efficiently distributing the work across your entire machine. ```python import modin.pandas as pd df = pd.read_csv("my_dataset.csv") ``` #### Modin can handle the datasets that pandas can't Often data scientists have to switch between different tools for operating on datasets of different sizes. Processing large dataframes with pandas is slow, and pandas does not support working with dataframes that are too large to fit into the available memory. 
As a result, pandas workflows that work well for prototyping on a few MBs of data do not scale to tens or hundreds of GBs (depending on the size of your machine). Modin supports operating on data that does not fit in memory, so that you can comfortably work with hundreds of GBs without worrying about substantial slowdown or memory errors. With [cluster](https://modin.readthedocs.io/en/latest/getting_started/using_modin/using_modin_cluster.html) and [out of core](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html) support, Modin is a DataFrame library with both great single-node performance and high scalability in a cluster. #### Modin Architecture We designed [Modin's architecture](https://modin.readthedocs.io/en/latest/development/architecture.html) to be modular so we can plug in different components as they develop and improve: Modin's architecture ### Other Resources #### Getting Started with Modin - [Documentation](https://modin.readthedocs.io/en/latest/) - [10-min Quickstart Guide](https://modin.readthedocs.io/en/latest/getting_started/quickstart.html) - [Examples and Tutorials](https://modin.readthedocs.io/en/latest/getting_started/examples.html) - [Videos and Blogposts](https://modin.readthedocs.io/en/latest/getting_started/examples.html#talks-podcasts) - [Benchmarking Modin](https://modin.readthedocs.io/en/latest/usage_guide/benchmarking.html) #### Modin Community - [Slack](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA) - [Twitter](https://twitter.com/modin_project) - [Mailing List](https://groups.google.com/g/modin-dev) - [GitHub Issues](https://github.com/modin-project/modin/issues) - [StackOverflow](https://stackoverflow.com/questions/tagged/modin) #### Learn More about Modin - [Frequently Asked Questions (FAQs)](https://modin.readthedocs.io/en/latest/getting_started/faq.html) - [Troubleshooting Guide](https://modin.readthedocs.io/en/latest/getting_started/troubleshooting.html) - 
[Development Guide](https://modin.readthedocs.io/en/latest/development/index.html) - Modin is built on many years of research and development at UC Berkeley. Check out these selected papers to learn more about how Modin works: - [Flexible Rule-Based Decomposition and Metadata Independence in Modin](https://people.eecs.berkeley.edu/~totemtang/paper/Modin.pdf) (VLDB 2021) - [Dataframe Systems: Theory, Architecture, and Implementation](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2021/EECS-2021-193.pdf) (PhD Dissertation 2021) - [Towards Scalable Dataframe Systems](https://arxiv.org/pdf/2001.00888.pdf) (VLDB 2020) #### Getting Involved ***`modin.pandas` is currently under active development. Requests and contributions are welcome!*** For more information on how to contribute to Modin, check out the [Modin Contribution Guide](https://modin.readthedocs.io/en/latest/development/contributing.html). ### License [Apache License 2.0](LICENSE) ================================================ FILE: asv_bench/README.md ================================================ # Modin ASV benchmarks ## Here are some scenarios in which [ASV](https://asv.readthedocs.io/en/stable/index.html) can be used: * Check the impact of the new patch on the performance of a certain set of operations: `asv continuous -f 1.05 src/main HEAD -b TimeGroupBy --launch-method=spawn` * Check for presence of errors inside of benchmarks after changing them or writing new ones: `asv run --quick --show-stderr --python=same --launch-method=spawn` * Run entire benchmark suite to get the current times: `asv run --launch-method=spawn` * Check the range of commits for performance degradation: ``` asv run [start_hash]..[end_hash] --launch-method=spawn asv publish asv preview ``` For more consistent results, you may need to use the following parameters, whose description is in the [ASV docs](https://asv.readthedocs.io/en/stable/benchmarks.html?highlight=sample_time#timing-benchmarks): * `-a sample_time=1` * `-a warmup_time=1` 
### Notes about using Modin on Ray with Asv: * `--launch-method=forkserver` is not working; * Each set of parameters for each test is launched in its own process, which brings a large overhead, since for each process redis server and other necessary processes from ray initialization are started and destroyed. ## Adding new benchmark Basic information on writing benchmarks is present [in ASV documentation](https://asv.readthedocs.io/en/stable/writing_benchmarks.html) Benchmarks from `benchmarks/benchmarks.py`, `benchmarks/scalability/scalability_benchmarks.py` or `benchmarks/io/csv.py` could be used as a starting point. Requirements: * the benchmark should be able to run both on Modin and on Pandas when the appropriate value of the environment variable `MODIN_ASV_USE_IMPL` is selected. * the size of the benchmark dataset should depend on the environment variable `MODIN_TEST_DATASET_SIZE`. ## Changing existing benchmark It should be remembered that the hash calculated from the benchmark source code is used to display the results. When changing the benchmark, the old results will no longer be displayed in the dashboard. In general, this is the correct behavior so as not to get a situation when incomparable numbers are displayed in the dashboard. But it should be noted that there could be changes in the source code when it is still correct to compare the "before" and "after" versions, for example, name of a variable changed, comment added, etc. In this case you must either run a new version of the benchmark for all the commits ever accounted for or manually change the hash in the corresponding result files. ## Pipeline for displaying results in a dashboard Step 1: checking benchmarks for validity, runs in PRs CI. During the test, the benchmarks are run once on small data. 
The implementation can be found in the `test-asv-benchmarks` job of [ci.yml](https://github.com/modin-project/modin/blob/main/.github/workflows/ci.yml). Step 2: running benchmarks with saving the results in [modin-bench@master](https://github.com/modin-project/modin-bench). The launch takes place on an internal server using a specific TeamCity configuration. The description of the server can be found in the ["Benchmark list"](https://modin.org/modin-bench/#summarylist?sort=0&dir=asc) tab, on the left when you hover the mouse over the machine name. This step starts as scheduled (now every half hour), subject to the presence of new commits in the Modin `main` branch. Command to run benchmarks: `asv run HASHFILE:hashfile.txt --show-stderr --machine xeon-e5 --launch-method=spawn`. The file `hashfile.txt` contains the last Modin commit hash. Writing to `modin-bench@master` triggers step 3 of the pipeline. Step 3: converting the results to html representation, which is saved in [modin-bench@gh-pages](https://github.com/modin-project/modin-bench). The implementation can be found in the `deploy-gh-pages` job of [push.yml](https://github.com/modin-project/modin-bench/blob/master/.github/workflows/push.yml). Basic actions for step 2: * set up environment variables: * export MODIN_TEST_DATASET_SIZE=Big * export MODIN_CPUS=44 * set up git client * prepare json file with machine description * This file should be placed in the user's home directory. * ASV does not always automatically create the file with the description of the machine correctly (e.g. due to being run in a container). It is recommended to create a file using [asv machine](https://asv.readthedocs.io/en/stable/commands.html?highlight=machine%20description#asv-machine) command, and manually check the result. 
[Example](https://github.com/modin-project/modin-bench/blob/master/results/xeon-e5/machine.json) * copy old result to folder where new result will appear (conflict resolution will be performed by ASV itself instead of git) * push performance result to modin-bench repository ================================================ FILE: asv_bench/asv.conf.dask.json ================================================ { // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "modin", // The project's homepage "project_url": "https://modin.readthedocs.io/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "..", // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["main"], // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[dask]"], // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. "environment_type": "conda", // timeout in seconds for installing any dependencies in environment // defaults to 10 min "install_timeout": 6000, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. 
"pythons": ["3.9"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order "conda_channels": ["conda-forge", "defaults"], // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". "html_dir": ".asv/html", } ================================================ FILE: asv_bench/asv.conf.json ================================================ { // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "modin", // The project's homepage "project_url": "https://modin.readthedocs.io/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "..", // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["main"], // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[ray]"], // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. "environment_type": "conda", // timeout in seconds for installing any dependencies in environment // defaults to 10 min "install_timeout": 6000, // the base URL to show a commit for the project. 
"show_commit_url": "https://github.com/modin-project/modin/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. "pythons": ["3.9"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order "conda_channels": ["conda-forge", "defaults"], // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". "html_dir": ".asv/html", } ================================================ FILE: asv_bench/asv.conf.unidist.json ================================================ { // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "modin", // The project's homepage "project_url": "https://modin.readthedocs.io/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "..", // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["main"], // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[unidist]"], // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. 
"environment_type": "conda", // timeout in seconds for installing any dependencies in environment // defaults to 10 min "install_timeout": 6000, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. "pythons": ["3.9"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order "conda_channels": ["conda-forge", "defaults"], // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". "html_dir": ".asv/html", } ================================================ FILE: asv_bench/benchmarks/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Modin benchmarks.""" ================================================ FILE: asv_bench/benchmarks/benchmarks.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """General Modin benchmarks.""" # define `MODIN_CPUS` env var to control the number of partitions # it should be defined before modin.pandas import (in case of using os.environ) # define `MODIN_ASV_USE_IMPL` env var to choose library for using in performance # measurements import math import numpy as np from .utils import ( GROUPBY_NGROUPS, IMPL, RAND_HIGH, RAND_LOW, execute, gen_nan_data, generate_dataframe, get_benchmark_shapes, random_booleans, random_columns, random_string, translator_groupby_ngroups, ) class BaseTimeGroupBy: def setup(self, shape, ngroups=5, groupby_ncols=1): ngroups = translator_groupby_ngroups(ngroups, shape) self.df, self.groupby_columns = generate_dataframe( "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups, ) class TimeGroupByMultiColumn(BaseTimeGroupBy): param_names = ["shape", "ngroups", "groupby_ncols"] params = [ get_benchmark_shapes("TimeGroupByMultiColumn"), GROUPBY_NGROUPS, [6], ] def time_groupby_agg_quan(self, *args, **kwargs): 
execute(self.df.groupby(by=self.groupby_columns).agg("quantile")) def time_groupby_agg_mean(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean())) class TimeGroupByDefaultAggregations(BaseTimeGroupBy): param_names = ["shape", "ngroups"] params = [ get_benchmark_shapes("TimeGroupByDefaultAggregations"), GROUPBY_NGROUPS, ] def time_groupby_count(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).count()) def time_groupby_size(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).size()) def time_groupby_sum(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).sum()) def time_groupby_mean(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).mean()) class TimeGroupByDictionaryAggregation(BaseTimeGroupBy): param_names = ["shape", "ngroups", "operation_type"] params = [ get_benchmark_shapes("TimeGroupByDictionaryAggregation"), GROUPBY_NGROUPS, ["reduce", "aggregation"], ] operations = { "reduce": ["sum", "count", "prod"], "aggregation": ["quantile", "std", "median"], } def setup(self, shape, ngroups, operation_type): super().setup(shape, ngroups) self.cols_to_agg = self.df.columns[1:4] operations = self.operations[operation_type] self.agg_dict = { c: operations[i % len(operations)] for i, c in enumerate(self.cols_to_agg) } def time_groupby_dict_agg(self, *args, **kwargs): execute(self.df.groupby(by=self.groupby_columns).agg(self.agg_dict)) class TimeJoin: param_names = ["shapes", "how", "sort"] params = [ get_benchmark_shapes("TimeJoin"), ["left", "inner"], [False], ] def setup(self, shapes, how, sort): self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH) self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH) def time_join(self, shapes, how, sort): # join dataframes on index to get the predictable shape execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort)) class TimeJoinStringIndex: param_names = 
["shapes", "sort"] params = [ get_benchmark_shapes("TimeJoinStringIndex"), [True, False], ] def setup(self, shapes, sort): assert shapes[0] % 100 == 0, "implementation restriction" level1 = IMPL.Index([f"i-{i}" for i in range(10)], dtype=object).values level2 = IMPL.Index( [f"i-{i}" for i in range(shapes[0] // 100)], dtype=object ).values codes1 = np.arange(10).repeat(shapes[0] // 100) codes2 = np.tile(np.arange(shapes[0] // 100), 10) index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) self.df_multi = IMPL.DataFrame( np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"] ) self.key1 = np.tile(level1.take(codes1), 10) self.key2 = np.tile(level2.take(codes2), 10) self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH) # just to keep source shape self.df = self.df.drop(columns=self.df.columns[-2:]) self.df["key1"] = self.key1 self.df["key2"] = self.key2 execute(self.df) self.df_key1 = IMPL.DataFrame( np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"] ) self.df_key2 = IMPL.DataFrame( np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"] ) def time_join_dataframe_index_multi(self, shapes, sort): execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)) def time_join_dataframe_index_single_key_bigger(self, shapes, sort): execute(self.df.join(self.df_key2, on="key2", sort=sort)) def time_join_dataframe_index_single_key_small(self, shapes, sort): execute(self.df.join(self.df_key1, on="key1", sort=sort)) class TimeMergeDefault: param_names = ["shapes", "how", "sort"] params = [ get_benchmark_shapes("TimeMergeDefault"), ["left", "inner"], [True, False], ] def setup(self, shapes, how, sort): self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH) self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH) def time_merge(self, shapes, how, sort): execute(IMPL.merge(self.df1, self.df2, how=how, sort=sort)) class TimeMerge: param_names = 
["shapes", "how", "sort"] params = [ get_benchmark_shapes("TimeMerge"), ["left", "inner"], [True, False], ] def setup(self, shapes, how, sort): self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH) self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH) def time_merge(self, shapes, how, sort): # merge dataframes by index to get the predictable shape execute( self.df1.merge( self.df2, left_index=True, right_index=True, how=how, sort=sort ) ) def time_merge_dataframe_empty_right(self, shapes, how, sort): # Getting an empty dataframe using `iloc` should be very fast, # so the impact on the time of the merge operation should be negligible. execute(IMPL.merge(self.df1, self.df2.iloc[:0], how=how, sort=sort)) def time_merge_dataframe_empty_left(self, shapes, how, sort): # Getting an empty dataframe using `iloc` should be very fast, # so the impact on the time of the merge operation should be negligible. execute(IMPL.merge(self.df1.iloc[:0], self.df2, how=how, sort=sort)) class TimeMergeCategoricals: param_names = ["shapes", "data_type"] params = [ get_benchmark_shapes("MergeCategoricals"), ["object", "category"], ] def setup(self, shapes, data_type): assert len(shapes) == 2 assert shapes[1] == 2 size = (shapes[0],) self.left = IMPL.DataFrame( { "X": np.random.choice(range(0, 10), size=size), "Y": np.random.choice(["one", "two", "three"], size=size), } ) self.right = IMPL.DataFrame( { "X": np.random.choice(range(0, 10), size=size), "Z": np.random.choice(["jjj", "kkk", "sss"], size=size), } ) if data_type == "category": self.left = self.left.assign(Y=self.left["Y"].astype("category")) execute(self.left) self.right = self.right.assign(Z=self.right["Z"].astype("category")) execute(self.right) def time_merge_categoricals(self, shapes, data_type): execute(IMPL.merge(self.left, self.right, on="X")) class TimeConcat: param_names = ["shapes", "how", "axis", "ignore_index"] params = [ get_benchmark_shapes("TimeConcat"), ["inner", "outer"], [0, 1], 
class TimeBinaryOp:
    """Time a named binary DataFrame method applied to a second frame."""

    params = [
        get_benchmark_shapes("TimeBinaryOp"),
        ["mul"],
        [0, 1],
    ]
    param_names = ["shapes", "binary_op", "axis"]

    def setup(self, shapes, binary_op, axis):
        """Generate both operands and look up the bound method on the first."""
        lhs_shape = shapes[0]
        self.df1 = generate_dataframe("int", *lhs_shape, RAND_LOW, RAND_HIGH)
        rhs_shape = shapes[1]
        self.df2 = generate_dataframe("int", *rhs_shape, RAND_LOW, RAND_HIGH)
        # Resolve the method once so the lookup is not part of the timing.
        self.op = getattr(self.df1, binary_op)

    def time_binary_op(self, shapes, binary_op, axis):
        """Apply the bound operation to the second frame along ``axis``."""
        outcome = self.op(self.df2, axis=axis)
        execute(outcome)


class TimeBinaryOpSeries:
    """Time a named binary Series method using the first column of two frames."""

    params = [
        get_benchmark_shapes("TimeBinaryOpSeries"),
        ["mul"],
    ]
    param_names = ["shapes", "binary_op"]

    def setup(self, shapes, binary_op):
        """Extract one column from each generated frame and bind the method."""
        first = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
        second = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
        self.series1 = first[first.columns[0]]
        self.series2 = second[second.columns[0]]
        self.op = getattr(self.series1, binary_op)
        # Both operands go through ``execute`` here, before any timed call.
        for operand in (self.series1, self.series2):
            execute(operand)

    def time_binary_op_series(self, shapes, binary_op):
        """Apply the bound operation to the second series."""
        outcome = self.op(self.series2)
        execute(outcome)
class TimeSetItem(BaseTimeSetItem):
    """Time ``df[label] = value`` using the state prepared by ``BaseTimeSetItem``."""

    params = [
        get_benchmark_shapes("TimeSetItem"),
        [1],
        ["zero", "middle", "last"],
        [True, False],
    ]

    def _assign(self, value):
        # Shared body of both timed methods: overwrite the column(s) at
        # ``self.loc`` and hand the frame to ``execute``.
        self.df[self.loc] = value
        execute(self.df)

    def time_setitem_qc(self, *args, **kwargs):
        """Assign ``self.item`` (derived from the frame itself)."""
        self._assign(self.item)

    def time_setitem_raw(self, *args, **kwargs):
        """Assign ``self.item_raw`` (the ``to_numpy()`` form of the item)."""
        self._assign(self.item_raw)


class TimeInsert(BaseTimeSetItem):
    """Time ``DataFrame.insert`` using the state prepared by ``BaseTimeSetItem``."""

    params = [
        get_benchmark_shapes("TimeInsert"),
        [1],
        ["zero", "middle", "last"],
        [True, False],
    ]

    def _insert(self, value):
        # Shared body of both timed methods: insert ``value`` at the
        # positional location under a freshly generated column name.
        self.df.insert(loc=self.iloc, column=random_string(), value=value)
        execute(self.df)

    def time_insert_qc(self, *args, **kwargs):
        """Insert ``self.item`` (derived from the frame itself)."""
        self._insert(self.item)

    def time_insert_raw(self, *args, **kwargs):
        """Insert ``self.item_raw`` (the ``to_numpy()`` form of the item)."""
        self._insert(self.item_raw)
class TimeSortValues:
    """Time ``sort_values`` over a randomly chosen set of columns."""

    params = [
        get_benchmark_shapes("TimeSortValues"),
        [1, 2, 10, 100],
        [False, True],
    ]
    param_names = ["shape", "columns_number", "ascending_list"]

    def setup(self, shape, columns_number, ascending_list):
        """Pick the sort columns and the ``ascending`` argument.

        With ``ascending_list`` set, ``ascending`` is one value per column as
        returned by ``random_booleans``; otherwise it is a single ``bool``.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        self.columns = random_columns(self.df.columns, columns_number)
        if ascending_list:
            self.ascending = random_booleans(columns_number)
        else:
            self.ascending = bool(random_booleans(1)[0])

    def time_sort_values(self, shape, columns_number, ascending_list):
        """Sort by the prepared columns and ordering."""
        ordered = self.df.sort_values(self.columns, ascending=self.ascending)
        execute(ordered)


class TimeDrop:
    """Time ``drop`` of leading labels along either axis."""

    params = [
        get_benchmark_shapes("TimeDrop"),
        [0, 1],
        [1, 0.8],
    ]
    param_names = ["shape", "axis", "drop_ncols"]

    def setup(self, shape, axis, drop_ncols):
        """Select the labels to drop.

        A float ``drop_ncols`` is a fraction of the axis length; any other
        value is used as an absolute count.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(drop_ncols, float):
            n_to_drop = int(len(self.df.axes[axis]) * drop_ncols)
        else:
            n_to_drop = drop_ncols
        self.labels = self.df.axes[axis][:n_to_drop]

    def time_drop(self, shape, axis, drop_ncols):
        """Drop the prepared labels along ``axis``."""
        remaining = self.df.drop(self.labels, axis=axis)
        execute(remaining)


class TimeHead:
    """Time ``head`` with an absolute or fractional row count."""

    params = [
        get_benchmark_shapes("TimeHead"),
        [5, 0.8],
    ]
    param_names = ["shape", "head_count"]

    def setup(self, shape, head_count):
        """Resolve ``head_count``; a float is a fraction of the row count."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(head_count, float):
            self.head_count = int(head_count * len(self.df.index))
        else:
            self.head_count = head_count

    def time_head(self, shape, head_count):
        """Take the first ``self.head_count`` rows."""
        top = self.df.head(self.head_count)
        execute(top)


class TimeTail:
    """Time ``tail`` with an absolute or fractional row count."""

    params = [
        get_benchmark_shapes("TimeTail"),
        [5, 0.8],
    ]
    param_names = ["shape", "tail_count"]

    def setup(self, shape, tail_count):
        """Resolve ``tail_count``; a float is a fraction of the row count."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(tail_count, float):
            self.tail_count = int(tail_count * len(self.df.index))
        else:
            self.tail_count = tail_count

    def time_tail(self, shape, tail_count):
        """Take the last ``self.tail_count`` rows."""
        bottom = self.df.tail(self.tail_count)
        execute(bottom)
class TimeFillnaSeries:
    """Time ``Series.fillna`` with a scalar, dict or Series fill value."""

    param_names = ["value_type", "shape", "limit"]
    params = [
        ["scalar", "dict", "Series"],
        get_benchmark_shapes("TimeFillnaSeries"),
        [None, 0.8],
    ]

    def setup(self, value_type, shape, limit):
        """Build the series to fill and the keyword arguments for ``fillna``.

        Parameters
        ----------
        value_type : {"scalar", "dict", "Series"}
            Kind of object passed as ``value``.
        shape : sequence
            Forwarded to ``gen_nan_data``; ``shape[0]`` sizes the fill value.
        limit : float or None
            Fraction of ``shape[0]`` converted to an integer ``limit``;
            a falsy value means no limit.

        Raises
        ------
        ValueError
            If ``value_type`` is not one of the supported kinds.
        """
        self.series = gen_nan_data(*shape)

        if value_type == "scalar":
            self.value = 18.19
        elif value_type == "dict":
            self.value = {k: k * 1.23 for k in range(shape[0])}
        elif value_type == "Series":
            self.value = IMPL.Series(
                [k * 1.23 for k in range(shape[0])], index=IMPL.RangeIndex(shape[0])
            )
        else:
            # An explicit exception (instead of ``assert False``) still fires
            # under ``python -O`` and says which parameter was wrong.
            raise ValueError(f"value_type: {value_type} isn't supported")

        limit = int(limit * shape[0]) if limit else None
        self.kw = {"value": self.value, "limit": limit}

    def time_fillna(self, value_type, shape, limit):
        """Fill into a new object."""
        execute(self.series.fillna(**self.kw))

    def time_fillna_inplace(self, value_type, shape, limit):
        """Fill in place, then pass the mutated series to ``execute``."""
        self.series.fillna(inplace=True, **self.kw)
        execute(self.series)


class TimeFillnaDataFrame:
    """Time ``DataFrame.fillna`` with a scalar, dict, Series or DataFrame value."""

    param_names = ["value_type", "shape", "limit"]
    params = [
        ["scalar", "dict", "DataFrame", "Series"],
        get_benchmark_shapes("TimeFillnaDataFrame"),
        [None, 0.8],
    ]

    def setup(self, value_type, shape, limit):
        """Build the frame to fill and the keyword arguments for ``fillna``.

        Parameters
        ----------
        value_type : {"scalar", "dict", "Series", "DataFrame"}
            Kind of object passed as ``value``.
        shape : sequence
            Forwarded to ``gen_nan_data``; ``shape[0]`` sizes the DataFrame
            fill value and scales ``limit``.
        limit : float or None
            Fraction of ``shape[0]`` converted to an integer ``limit``;
            a falsy value means no limit.

        Raises
        ------
        ValueError
            If ``value_type`` is not one of the supported kinds.
        """
        self.df = gen_nan_data(*shape)
        columns = self.df.columns

        if value_type == "scalar":
            self.value = 18.19
        elif value_type == "dict":
            # One fill value per column label.
            self.value = {k: i * 1.23 for i, k in enumerate(columns)}
        elif value_type == "Series":
            self.value = IMPL.Series(
                [i * 1.23 for i in range(len(columns))], index=columns
            )
        elif value_type == "DataFrame":
            # Fill frame with the same columns and ``shape[0]`` rows.
            self.value = IMPL.DataFrame(
                {
                    k: [i + j * 1.23 for j in range(shape[0])]
                    for i, k in enumerate(columns)
                },
                index=IMPL.RangeIndex(shape[0]),
                columns=columns,
            )
        else:
            # An explicit exception (instead of ``assert False``) still fires
            # under ``python -O`` and says which parameter was wrong.
            raise ValueError(f"value_type: {value_type} isn't supported")

        limit = int(limit * shape[0]) if limit else None
        self.kw = {"value": self.value, "limit": limit}

    def time_fillna(self, value_type, shape, limit):
        """Fill into a new object."""
        execute(self.df.fillna(**self.kw))

    def time_fillna_inplace(self, value_type, shape, limit):
        """Fill in place, then pass the mutated frame to ``execute``."""
        self.df.fillna(inplace=True, **self.kw)
        execute(self.df)


class BaseTimeValueCounts:
    """Shared setup for the ``value_counts`` benchmarks."""

    def setup(self, shape, ngroups=5, subset=1):
        """Generate a frame plus the list of columns to count over.

        ``ngroups`` is first passed through ``translator_groupby_ngroups``;
        ``generate_dataframe`` then returns both the frame and the subset
        columns, stored as ``self.df`` and ``self.subset``.
        """
        ngroups = translator_groupby_ngroups(ngroups, shape)
        self.df, self.subset = generate_dataframe(
            "int",
            *shape,
            RAND_LOW,
            RAND_HIGH,
            groupby_ncols=subset,
            count_groups=ngroups,
        )


class TimeValueCountsFrame(BaseTimeValueCounts):
    """Time ``DataFrame.value_counts`` over a subset of columns."""

    param_names = ["shape", "ngroups", "subset"]
    params = [
        get_benchmark_shapes("TimeValueCountsFrame"),
        GROUPBY_NGROUPS,
        [2, 10],
    ]

    def time_value_counts(self, *args, **kwargs):
        """Count unique rows restricted to ``self.subset``."""
        execute(self.df.value_counts(subset=self.subset))


class TimeValueCountsSeries(BaseTimeValueCounts):
    """Time ``Series.value_counts`` with and without binning."""

    param_names = ["shape", "ngroups", "bins"]
    params = [
        get_benchmark_shapes("TimeValueCountsSeries"),
        GROUPBY_NGROUPS,
        [None, 3],
    ]

    def setup(self, shape, ngroups, bins):
        """Reuse the base setup, then keep only the first subset column."""
        super().setup(ngroups=ngroups, shape=shape)
        # ``self.df`` holds a single column from here on, despite its name.
        self.df = self.df[self.subset[0]]

    def time_value_counts(self, shape, ngroups, bins):
        """Count values, optionally grouped into ``bins`` bins."""
        execute(self.df.value_counts(bins=bins))
class TimeIndexingColumns:
    """Time selecting the first two columns via ``iloc``, ``loc`` and ``[]``."""

    params = [get_benchmark_shapes("TimeIndexing")]
    param_names = ["shape"]

    def setup(self, shape):
        """Prepare positional and label indexers for the same two columns."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        self.numeric_indexer = [0, 1]
        selected_labels = self.df.columns[self.numeric_indexer]
        self.labels_indexer = selected_labels.tolist()

    def time_iloc(self, shape):
        """Select the columns by position."""
        picked = self.df.iloc[:, self.numeric_indexer]
        execute(picked)

    def time_loc(self, shape):
        """Select the columns by label through ``loc``."""
        picked = self.df.loc[:, self.labels_indexer]
        execute(picked)

    def time___getitem__(self, shape):
        """Select the columns by label through plain ``[]`` indexing."""
        picked = self.df[self.labels_indexer]
        execute(picked)


class TimeMultiIndexing:
    """Time a label-slice ``loc`` on a frame with MultiIndex rows and columns."""

    params = [get_benchmark_shapes("TimeMultiIndexing")]
    param_names = ["shape"]

    def setup(self, shape):
        """Replace both axes with two-level product indexes.

        Each axis is built from the first half of the original labels crossed
        with two string labels; the frame is then sorted along the columns.
        """
        frame = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        half_rows = shape[0] // 2
        half_cols = shape[1] // 2

        row_levels = [frame.index[:half_rows], ["bar", "foo"]]
        new_index = IMPL.MultiIndex.from_product(row_levels)
        col_levels = [frame.columns[:half_cols], ["buz", "fuz"]]
        new_columns = IMPL.MultiIndex.from_product(col_levels)

        frame.index = new_index
        frame.columns = new_columns
        self.df = frame.sort_index(axis=1)

    def time_multiindex_loc(self, shape):
        """Slice from the third to the second-to-last label on both axes."""
        row_slice = slice(self.df.index[2], self.df.index[-2])
        col_slice = slice(self.df.columns[2], self.df.columns[-2])
        execute(self.df.loc[row_slice, col_slice])
class TimeAstype:
    """Time ``astype`` applied to one column or to the whole frame."""

    params = [
        get_benchmark_shapes("TimeAstype"),
        ["float64", "category"],
        ["one", "all"],
    ]
    param_names = ["shape", "dtype", "astype_ncolumns"]

    def setup(self, shape, dtype, astype_ncolumns):
        """Build the ``astype`` argument.

        ``"all"`` passes ``dtype`` directly; ``"one"`` maps only ``"col1"``.
        Any other value raises ``ValueError``.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if astype_ncolumns not in ("all", "one"):
            raise ValueError(f"astype_ncolumns: {astype_ncolumns} isn't supported")
        self.astype_arg = dtype if astype_ncolumns == "all" else {"col1": dtype}

    def time_astype(self, shape, dtype, astype_ncolumns):
        """Convert using the prepared argument."""
        converted = self.df.astype(self.astype_arg)
        execute(converted)


class TimeDescribe:
    """Time ``DataFrame.describe``."""

    params = [
        get_benchmark_shapes("TimeDescribe"),
    ]
    param_names = ["shape"]

    def setup(self, shape):
        """Generate the frame to summarise."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_describe(self, shape):
        """Compute the summary and hand it to ``execute``."""
        summary = self.df.describe()
        execute(summary)


class TimeProperties:
    """Time access to the ``shape``, ``columns`` and ``index`` properties."""

    params = [
        get_benchmark_shapes("TimeProperties"),
    ]
    param_names = ["shape"]

    def setup(self, shape):
        """Generate the frame whose properties are read."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_shape(self, shape):
        """Read and return ``df.shape``."""
        frame_shape = self.df.shape
        return frame_shape

    def time_columns(self, shape):
        """Read and return ``df.columns``."""
        column_labels = self.df.columns
        return column_labels

    def time_index(self, shape):
        """Read and return ``df.index``."""
        row_labels = self.df.index
        return row_labels
self.array_list = self.array.tolist() execute(self.data) def time_getitem_scalar(self, shape, index, index_structure): # not calling execute as execute function fails for scalar self.data[self.index_to_query] def time_getitem_slice(self, shape, index, index_structure): execute(self.data[: self.index_to_query]) def time_getitem_list_like(self, shape, index, index_structure): execute(self.data[[self.index_to_query]]) def time_getitem_array(self, shape, index, index_structure): execute(self.data[self.array]) def time_getitem_lists(self, shape, index, index_structure): execute(self.data[self.array_list]) def time_iloc_array(self, shape, index, index_structure): execute(self.data.iloc[self.array]) def time_iloc_list_like(self, shape, index, index_structure): execute(self.data.iloc[[self.index_to_query]]) def time_iloc_scalar(self, shape, index, index_structure): # not calling execute as execute function fails for scalar self.data.iloc[self.index_to_query] def time_iloc_slice(self, shape, index, index_structure): execute(self.data.iloc[: self.index_to_query]) def time_loc_array(self, shape, index, index_structure): execute(self.data.loc[self.array]) def time_loc_list_like(self, shape, index, index_structure): execute(self.data.loc[[self.index_to_query]]) def time_loc_scalar(self, shape, index, index_structure): self.data.loc[self.index_to_query] def time_loc_slice(self, shape, index, index_structure): execute(self.data.loc[: self.index_to_query]) class TimeReindex: param_names = ["shape"] params = [get_benchmark_shapes("TimeReindex")] def setup(self, shape): rows, cols = shape rng = IMPL.date_range(start="1/1/1970", periods=rows, freq="1min") self.df = IMPL.DataFrame( np.random.rand(rows, cols), index=rng, columns=range(cols) ) self.df["foo"] = "bar" self.rng_subset = IMPL.Index(rng[::2]) self.df2 = IMPL.DataFrame( index=range(rows), data=np.random.rand(rows, cols), columns=range(cols) ) level1 = IMPL.Index( [f"i-{i}" for i in range(rows // 10)], dtype=object 
class TimeReindexMethod:
    """Time ``Series.reindex`` with a fill ``method`` on date/period indexes."""

    param_names = ["shape", "method", "constructor"]
    params = [
        get_benchmark_shapes("TimeReindexMethod"),
        ["pad", "backfill"],
        [IMPL.date_range, IMPL.period_range],
    ]

    def setup(self, shape, method, constructor):
        """Build the full index and a series holding every second entry."""
        n_points = shape[0]
        self.idx = constructor("1/1/2000", periods=n_points, freq="1min")
        full_series = IMPL.Series(np.random.randn(n_points), index=self.idx)
        # Keep every other row so reindexing back to ``self.idx`` has gaps.
        self.ts = full_series[::2]
        execute(self.ts)

    def time_reindex_method(self, shape, method, constructor):
        """Reindex onto the full index, filling gaps with ``method``."""
        restored = self.ts.reindex(self.idx, method=method)
        execute(restored)
class TimeFillnaMethodDataframe:
    """Time ``DataFrame.fillna(method=...)`` on a reindexed time-indexed frame."""

    param_names = ["shape", "method"]
    params = [get_benchmark_shapes("TimeFillnaMethodDataframe"), ["pad", "backfill"]]

    def setup(self, shape, method):
        """Create float64 and float32 frames by reindexing a half-sampled frame."""
        n_rows = shape[0]
        self.idx = IMPL.date_range("1/1/2000", periods=n_rows, freq="1min")
        dense = IMPL.DataFrame(np.random.randn(*shape), index=self.idx)
        every_other = dense[::2]
        # Reindex the half-sampled frame back onto the full date index.
        self.df_ts_reindexed = every_other.reindex(self.idx)
        self.df_ts_float32 = self.df_ts_reindexed.astype("float32")
        execute(self.df_ts_reindexed)
        execute(self.df_ts_float32)

    def time_reindexed(self, shape, method):
        """Fill the reindexed frame as produced by ``reindex``."""
        filled = self.df_ts_reindexed.fillna(method=method)
        execute(filled)

    def time_float_32(self, shape, method):
        """Fill the float32 copy of the reindexed frame."""
        filled = self.df_ts_float32.fillna(method=method)
        execute(filled)
class TimeDropDuplicatesDataframe:
    """Time ``DataFrame.drop_duplicates`` over the string key columns."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDropDuplicatesDataframe")]

    def setup(self, shape):
        """Build ``cols - 1`` repeated string key columns plus one value column."""
        rows, cols = shape
        n_unique = rows // 10
        n_repeats = 10
        # Every key column holds each of the ``n_unique`` labels repeated
        # ``n_repeats`` times; the trailing "value" column is random normal.
        columns_data = {
            "key" + str(position + 1): IMPL.Index(
                [f"i-{i}" for i in range(n_unique)], dtype=object
            ).values.repeat(n_repeats)
            for position in range(cols - 1)
        }
        columns_data["value"] = np.random.randn(n_unique * n_repeats)
        self.df = IMPL.DataFrame(columns_data)
        execute(self.df)

    def time_drop_dups(self, shape):
        """Drop duplicates judged on every column except the last."""
        key_columns = self.df.columns[:-1]
        execute(self.df.drop_duplicates(key_columns))

    def time_drop_dups_inplace(self, shape):
        """Same as ``time_drop_dups`` but mutating the frame in place."""
        key_columns = self.df.columns[:-1]
        self.df.drop_duplicates(key_columns, inplace=True)
        execute(self.df)


class TimeDropDuplicatesSeries:
    """Time ``Series.drop_duplicates`` on a tiled string series."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDropDuplicatesSeries")]

    def setup(self, shape):
        """Build a series of ``rows // 10`` distinct labels tiled 10 times."""
        n_unique = shape[0] // 10
        labels = IMPL.Index([f"i-{i}" for i in range(n_unique)], dtype=object).values
        self.series = IMPL.Series(np.tile(labels, 10))
        execute(self.series)

    def time_drop_dups(self, shape):
        """Drop duplicates into a new series."""
        deduplicated = self.series.drop_duplicates()
        execute(deduplicated)

    def time_drop_dups_string(self, shape):
        """Drop duplicates in place, then pass the series to ``execute``."""
        self.series.drop_duplicates(inplace=True)
        execute(self.series)


class TimeDatetimeAccessor:
    """Time the ``.dt`` accessor on a timedelta series."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDatetimeAccessor")]

    def setup(self, shape):
        """Build an hourly timedelta series with ``shape[0]`` entries."""
        deltas = IMPL.timedelta_range("1 days", periods=shape[0], freq="h")
        self.series = IMPL.Series(deltas)
        execute(self.series)

    def time_dt_accessor(self, shape):
        """Obtain the accessor object itself."""
        accessor = self.series.dt
        execute(accessor)

    def time_timedelta_days(self, shape):
        """Read the ``days`` component."""
        days = self.series.dt.days
        execute(days)

    def time_timedelta_seconds(self, shape):
        """Read the ``seconds`` component."""
        seconds = self.series.dt.seconds
        execute(seconds)


class BaseCategories:
    """Shared setup producing a categorical string series in ``self.ts``."""

    def setup(self, shape):
        """Draw ``rows`` random labels out of ``rows // 10`` and cast to category."""
        rows = shape[0]
        codes = np.random.randint(0, rows // 10, size=rows)
        labels = [f"s{i:04d}" for i in codes]
        self.ts = IMPL.Series(labels).astype("category")
        execute(self.ts)
class TimeRemoveCategories(BaseCategories):
    """Time ``cat.remove_categories`` removing every second category."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeRemoveCategories")]

    def time_remove_categories(self, shape):
        """Remove the categories found at even positions."""
        to_remove = self.ts.cat.categories[::2]
        execute(self.ts.cat.remove_categories(to_remove))


class BaseReshape:
    """Shared setup producing a two-level-indexed random frame in ``self.df``."""

    def setup(self, shape):
        """Build the frame with a MultiIndex made from two integer arrays."""
        rows, cols = shape
        repeats = 10
        n_groups = rows // repeats
        # First level: each group id repeated; second level: the tiled group
        # ids rotated by 25 positions.
        outer = np.arange(n_groups).repeat(repeats)
        inner = np.roll(np.tile(np.arange(n_groups), repeats), 25)
        row_index = IMPL.MultiIndex.from_arrays([outer, inner])
        self.df = IMPL.DataFrame(np.random.randn(rows, cols), index=row_index)
        execute(self.df)


class TimeStack(BaseReshape):
    """Time ``stack`` on a previously unstacked frame."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeStack")]

    def setup(self, shape):
        """Run the base setup, then unstack level 1 outside the timed code."""
        super().setup(shape)
        unstacked = self.df.unstack(1)
        self.udf = unstacked
        execute(self.udf)

    def time_stack(self, shape):
        """Stack the unstacked frame."""
        stacked = self.udf.stack()
        execute(stacked)


class TimeUnstack(BaseReshape):
    """Time ``unstack`` of index level 1."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeUnstack")]

    def time_unstack(self, shape):
        """Unstack level 1 of the prepared frame."""
        unstacked = self.df.unstack(1)
        execute(unstacked)


class TimeReplace:
    """Time ``DataFrame.replace`` with an int-to-Timestamp mapping."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeReplace")]

    def setup(self, shape):
        """Build the replacement mapping and a random integer frame."""
        rows, cols = shape
        make_timestamp = getattr(IMPL, "Timestamp")
        # Map every integer below ``rows`` to the Timestamp built from it.
        self.to_replace = {value: make_timestamp(value) for value in range(rows)}
        self.df = IMPL.DataFrame(np.random.randint(rows, size=(rows, cols)))
        execute(self.df)

    def time_replace(self, shape):
        """Apply the mapping to the whole frame."""
        replaced = self.df.replace(self.to_replace)
        execute(replaced)


class TimeGroups:
    """Time ``groups`` / ``indices`` of a series grouped by its own values."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeGroups")]

    def setup(self, shape):
        """Build a random integer series with values in ``[0, 100)``."""
        values = np.random.randint(0, 100, size=shape[0])
        self.series = IMPL.Series(values)
        execute(self.series)

    # returns a pretty dict thus not calling execute
    def time_series_groups(self, shape):
        """Access the ``groups`` attribute of the group-by object."""
        grouped = self.series.groupby(self.series)
        grouped.groups

    # returns a dict thus not calling execute
    def time_series_indices(self, shape):
        """Access the ``indices`` attribute of the group-by object."""
        grouped = self.series.groupby(self.series)
        grouped.indices
class TimeMaskBool:
    """Time ``DataFrame.mask`` with a boolean frame of the same shape."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeMaskBool")]

    def setup(self, shape):
        """Build a random frame and the mask marking its negative entries."""
        self.df = IMPL.DataFrame(np.random.randn(*shape))
        self.mask = self.df < 0
        execute(self.df)
        execute(self.mask)

    def time_frame_mask(self, shape):
        """Apply the prepared mask."""
        masked = self.df.mask(self.mask)
        execute(masked)


class TimeIsnull:
    """Time ``IMPL.isnull`` on a frame mixing NaN and 1.0."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeIsnull")]

    def setup(self, shape):
        """Fill a ``shape[0]`` x ``shape[1]`` frame by sampling NaN and 1.0."""
        candidates = np.array([np.nan, 1.0])
        dimensions = (shape[0], shape[1])
        values = np.random.choice(candidates, dimensions)
        self.df = IMPL.DataFrame(values)
        execute(self.df)

    def time_isnull(self, shape):
        """Compute the null mask of the frame."""
        null_mask = IMPL.isnull(self.df)
        execute(null_mask)


class TimeDropna:
    """Time ``DataFrame.dropna`` for both ``how`` modes and both axes."""

    param_names = ["how", "axis", "shape"]
    params = (["all", "any"], [0, 1], get_benchmark_shapes("TimeDropna"))

    def setup(self, how, axis, shape):
        """Build a random frame with a NaN block and an extra string column."""
        row, col = shape
        self.df = IMPL.DataFrame(np.random.randn(row, col))
        # Blank out a rectangular region so there is something to drop.
        nan_rows = slice(row // 20, row // 10)
        nan_cols = slice(col // 3, col // 2)
        self.df.iloc[nan_rows, nan_cols] = np.nan
        self.df["foo"] = "bar"
        execute(self.df)

    def time_dropna(self, how, axis, shape):
        """Drop rows or columns according to ``how`` and ``axis``."""
        cleaned = self.df.dropna(how=how, axis=axis)
        execute(cleaned)


class TimeEquals:
    """Time ``DataFrame.equals`` of a frame against itself."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeEquals")]

    def setup(self, shape):
        """Build a random frame whose bottom-right cell is NaN."""
        self.df = IMPL.DataFrame(np.random.randn(*shape))
        self.df.iloc[-1, -1] = np.nan
        execute(self.df)

    # returns a boolean thus not calling execute
    def time_frame_float_equal(self, shape):
        """Compare the frame with itself; the result is discarded."""
        frame = self.df
        frame.equals(frame)
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """IO Modin benchmarks.""" ================================================ FILE: asv_bench/benchmarks/io/csv.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class BaseReadCsv:
    """Shared ``setup_cache``/``setup`` for the ``read_csv`` benchmarks.

    Subclasses provide a ``data_type`` class attribute, which is forwarded
    to ``prepare_io_data``.
    """

    # test data file should be created only once
    def setup_cache(self, test_filename="io_test_file"):
        """Create the input files and return what ``prepare_io_data`` yields."""
        shapes = get_benchmark_shapes(self.__class__.__name__)
        return prepare_io_data(test_filename, self.data_type, shapes)

    def setup(self, test_filenames, shape, *args, **kwargs):
        """Initialise the engine if needed and remember the id of ``shape``."""
        # ray init
        if ASV_USE_IMPL == "modin":
            IMPL.DataFrame([])
        self.shape_id = get_shape_id(shape)


class TimeReadCsvSkiprows(BaseReadCsv):
    """Time ``read_csv`` with several kinds of ``skiprows`` argument."""

    shapes = get_benchmark_shapes("TimeReadCsvSkiprows")
    # The array-valued entries are sized from the row count of the first shape.
    skiprows_mapping = {
        # Callable form: ``x % 2`` is truthy for odd row indices.
        "lambda_even_rows": lambda x: x % 2,
        "range_uniform": np.arange(1, shapes[0][0] // 10),
        "range_step2": np.arange(1, shapes[0][0], 2),
    }
    data_type = "str_int"

    params = [
        shapes,
        [None, "lambda_even_rows", "range_uniform", "range_step2"],
    ]
    param_names = ["shape", "skiprows"]

    def setup(self, test_filenames, shape, skiprows):
        """Resolve the ``skiprows`` parameter name to the actual argument."""
        super().setup(test_filenames, shape, skiprows)
        if skiprows:
            self.skiprows = self.skiprows_mapping[skiprows]
        else:
            self.skiprows = None

    def time_skiprows(self, test_filenames, shape, skiprows):
        """Read the file for this shape with the resolved ``skiprows``."""
        path = test_filenames[self.shape_id]
        execute(IMPL.read_csv(path, skiprows=self.skiprows))


class TimeReadCsvTrueFalseValues(BaseReadCsv):
    """Time ``read_csv`` with custom ``true_values`` / ``false_values``."""

    data_type = "true_false_int"
    params = [get_benchmark_shapes("TimeReadCsvTrueFalseValues")]
    param_names = ["shape"]

    def time_true_false_values(self, test_filenames, shape):
        """Read the file for this shape, mapping the listed boolean spellings."""
        path = test_filenames[self.shape_id]
        frame = IMPL.read_csv(
            path,
            true_values=["Yes", "true"],
            false_values=["No", "false"],
        )
        execute(frame)
= IMPL.date_range("2000", periods=df.shape[0], freq="ms") for col in self._timestamp_columns: df[col] = date_column return df def setup_cache(self, test_filename="io_test_file_csv_names_dtype"): # filenames with a metadata of saved dataframes cache = {} for shape in self.shapes: for dtype in self._dtypes_params: df = generate_dataframe( "int", *shape, RAND_LOW, RAND_HIGH, impl="pandas" ) if dtype == "Int64_Timestamp": df = self._add_timestamp_columns(df) file_id = self._get_file_id(shape, dtype) cache[file_id] = ( f"{test_filename}_{file_id}.csv", df.columns.to_list(), df.dtypes.to_dict(), ) df.to_csv(cache[file_id][0], index=False) return cache def setup(self, cache, shape, names, dtype): # ray init if ASV_USE_IMPL == "modin": IMPL.DataFrame([]) file_id = self._get_file_id(shape, dtype) self.filename, self.names, self.dtype = cache[file_id] self.parse_dates = None if dtype == "Int64_Timestamp": # cached version of dtype should not change self.dtype = self.dtype.copy() for col in self._timestamp_columns: del self.dtype[col] self.parse_dates = self._timestamp_columns def time_read_csv_names_dtype(self, cache, shape, names, dtype): execute( IMPL.read_csv( self.filename, names=self.names, header=0, dtype=self.dtype, parse_dates=self.parse_dates, ) ) from ..utils import setup # noqa: E402, F401 ================================================ FILE: asv_bench/benchmarks/io/parquet.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
class TimeReadParquet:
    """Time ``read_parquet`` on files generated once per benchmark run."""

    shapes = get_benchmark_shapes("TimeReadParquet")
    data_type = "str_int"

    params = [
        shapes,
    ]
    param_names = ["shape"]

    # test data file should be created only once
    def setup_cache(self, test_filename="io_test_file"):
        """Create the input files and return what ``prepare_io_data_parquet`` yields."""
        shapes = get_benchmark_shapes(self.__class__.__name__)
        return prepare_io_data_parquet(test_filename, self.data_type, shapes)

    def setup(self, test_filenames, shape):
        """Initialise the engine if needed and remember the id of ``shape``."""
        # ray init
        if ASV_USE_IMPL == "modin":
            IMPL.DataFrame([])
        self.shape_id = get_shape_id(shape)

    def time_read_parquet(self, test_filenames, shape):
        """Read the file that corresponds to this shape."""
        path = test_filenames[self.shape_id]
        frame = IMPL.read_parquet(
            path,
        )
        execute(frame)
You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Benchmarks measuring how Modin performance scales when MODIN_CPUS are changed."""

================================================
FILE: asv_bench/benchmarks/scalability/scalability_benchmarks.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""These benchmarks are supposed to be run only for modin, since they do not make sense for pandas."""

import modin.pandas as pd

try:
    from modin.pandas.io import from_pandas
except ImportError:
    from modin.pandas.utils import from_pandas

try:
    from modin.pandas.io import to_numpy, to_pandas
except ImportError:
    try:
        from modin.utils import to_numpy, to_pandas
    except ImportError:
        # This provides compatibility with older versions of the Modin, allowing us to test old commits.
        from modin.pandas.utils import to_pandas

        # `to_numpy` is not imported on this path; bind it explicitly so that
        # `TimeToNumPy` can detect its absence instead of hitting a NameError
        # inside the timed function.
        to_numpy = None

import pandas

from ..utils import (
    RAND_HIGH,
    RAND_LOW,
    execute,
    gen_data,
    generate_dataframe,
    get_benchmark_shapes,
)


class TimeFromPandas:
    """Time conversion of a pandas DataFrame into a Modin one for several `cpus` values."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeFromPandas"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """Build the pandas source frame and make ``NPartitions.get`` return `cpus`."""
        self.data = pandas.DataFrame(gen_data("int", *shape, RAND_LOW, RAND_HIGH))
        from modin.config import NPartitions

        # Replaces the getter on the config class itself, so the override stays
        # in place after this benchmark finishes.
        NPartitions.get = lambda: cpus
        # trigger ray init
        pd.DataFrame([])

    def time_from_pandas(self, shape, cpus):
        """Convert the prepared pandas frame and wait for the result to materialize."""
        execute(from_pandas(self.data))


class TimeToPandas:
    """Time conversion of a Modin DataFrame into a pandas one for several `cpus` values."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeToPandas"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """Make ``NPartitions.get`` return `cpus`, then build the Modin source frame."""
        from modin.config import NPartitions

        NPartitions.get = lambda: cpus
        self.data = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH, impl="modin")

    def time_to_pandas(self, shape, cpus):
        """Convert the prepared Modin frame to pandas."""
        # to_pandas is already synchronous
        to_pandas(self.data)


class TimeToNumPy:
    """Time conversion of a Modin DataFrame into a NumPy array for several `cpus` values."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeToNumPy"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """
        Make ``NPartitions.get`` return `cpus`, then build the Modin source frame.

        Raises
        ------
        NotImplementedError
            If no ``to_numpy`` helper could be imported from the Modin version under
            test; ASV reports a benchmark whose ``setup`` raises this as skipped.
        """
        if to_numpy is None:
            raise NotImplementedError(
                "`to_numpy` is not available in this version of Modin"
            )
        from modin.config import NPartitions

        NPartitions.get = lambda: cpus
        self.data = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH, impl="modin")

    def time_to_numpy(self, shape, cpus):
        """Convert the prepared Modin frame to a NumPy array."""
        # to_numpy is already synchronous
        to_numpy(self.data)


from ..utils import setup  # noqa: E402, F401

================================================
FILE: asv_bench/benchmarks/utils/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin benchmarks utils.""" from .common import ( IMPL, execute, gen_data, gen_nan_data, generate_dataframe, get_shape_id, prepare_io_data, prepare_io_data_parquet, random_booleans, random_columns, random_string, setup, translator_groupby_ngroups, ) from .compatibility import ASV_USE_IMPL, ASV_USE_STORAGE_FORMAT from .data_shapes import GROUPBY_NGROUPS, RAND_HIGH, RAND_LOW, get_benchmark_shapes __all__ = [ "ASV_USE_IMPL", "ASV_USE_STORAGE_FORMAT", "RAND_LOW", "RAND_HIGH", "GROUPBY_NGROUPS", "get_benchmark_shapes", "IMPL", "execute", "get_shape_id", "gen_data", "gen_nan_data", "generate_dataframe", "prepare_io_data", "prepare_io_data_parquet", "random_string", "random_columns", "random_booleans", "translator_groupby_ngroups", "setup", ] ================================================ FILE: asv_bench/benchmarks/utils/common.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language
# governing permissions and limitations under the License.

"""
The module contains the functionality that is used when benchmarking Modin commits.

In the case of using utilities from the main Modin code, there is a chance that when
benchmarking old commits, the utilities changed, which in turn can unexpectedly affect
the performance results, hence some utility functions are duplicated here.
"""

import logging
import uuid
from typing import Optional, Union

import numpy as np
import pandas

import modin.pandas

from .compatibility import ASV_DATASET_SIZE, ASV_USE_ENGINE, ASV_USE_IMPL
from .data_shapes import RAND_HIGH, RAND_LOW

POSSIBLE_IMPL = {
    "modin": modin.pandas,
    "pandas": pandas,
}
IMPL = POSSIBLE_IMPL[ASV_USE_IMPL]


def translator_groupby_ngroups(groupby_ngroups: Union[str, int], shape: tuple) -> int:
    """
    Translate a string representation of the number of groups, into a number.

    Parameters
    ----------
    groupby_ngroups : str or int
        Number of groups that will be used in `groupby` operation.
    shape : tuple
        Same as pandas.Dataframe.shape.

    Returns
    -------
    int

    Notes
    -----
    Only the "huge_amount_groups" alias is translated, and only for the "big"
    dataset size; any other value is returned unchanged.
    """
    if ASV_DATASET_SIZE == "big" and groupby_ngroups == "huge_amount_groups":
        return min(shape[0] // 2, 5000)
    return groupby_ngroups


class weakdict(dict):  # noqa: GL08
    __slots__ = ("__weakref__",)


data_cache = dict()
dataframes_cache = dict()


def gen_nan_data(
    nrows: int, ncols: int
) -> Union[
    modin.pandas.DataFrame, pandas.DataFrame, modin.pandas.Series, pandas.Series
]:
    """
    Generate nan data with caching.

    The generated data are saved in the dictionary and on a subsequent call,
    if the keys match, saved data will be returned. Therefore, we need
    to carefully monitor the changing of saved data and make its copy if needed.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.

    Returns
    -------
    modin.pandas.DataFrame or pandas.DataFrame or modin.pandas.Series or pandas.Series
        DataFrame or Series with shape (nrows, ncols) or (nrows,), respectively.

    Raises
    ------
    AssertionError
        If `ncols` is less than 1.
    """
    cache_key = (ASV_USE_IMPL, nrows, ncols)
    if cache_key in data_cache:
        return data_cache[cache_key]

    logging.info("Generating nan data {} rows and {} columns".format(nrows, ncols))

    if ncols > 1:
        columns = [f"col{x}" for x in range(ncols)]
        data = IMPL.DataFrame(np.nan, index=IMPL.RangeIndex(nrows), columns=columns)
    elif ncols == 1:
        data = IMPL.Series(np.nan, index=IMPL.RangeIndex(nrows))
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Number of columns (ncols) should be >= 1")

    data_cache[cache_key] = data
    return data


def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
    """
    Generate int data.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
    """
    data = {
        "col{}".format(i): np.random.randint(rand_low, rand_high, size=nrows)
        for i in range(ncols)
    }
    return data


def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
    """
    Generate int data and string data.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
        One of the columns with string values.
    """
    data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
    # convert values in arbitrary column to string type
    key = list(data)[0]
    data[key] = [f"str_{x}" for x in data[key]]
    return data


def gen_true_false_int_data(
    nrows: int, ncols: int, rand_low: int, rand_high: int
) -> dict:
    """
    Generate int data and string data "true" and "false" values.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`. ``ncols // 2`` of the columns hold integers
        (``nrows // 2`` values each), the remaining ``ncols - ncols // 2`` columns hold
        "Yes"/"true"/"No"/"false" strings (``nrows - nrows // 2`` values each).

    Notes
    -----
    For an odd `nrows` the two groups of columns have different lengths.
    """
    data = gen_int_data(nrows // 2, ncols // 2, rand_low, rand_high)
    data_true_false = {
        "tf_col{}".format(i): np.random.choice(
            ["Yes", "true", "No", "false"], size=(nrows - nrows // 2)
        )
        for i in range(ncols - ncols // 2)
    }
    data.update(data_true_false)
    return data


def gen_data(
    data_type: str,
    nrows: int,
    ncols: int,
    rand_low: int,
    rand_high: int,
) -> dict:
    """
    Generate data with caching.

    The generated data are saved in the dictionary and on a subsequent call,
    if the keys match, saved data will be returned. Therefore, we need
    to carefully monitor the changing of saved data and make its copy if needed.

    Parameters
    ----------
    data_type : {"int", "str_int", "true_false_int"}
        Type of data generation.
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.

    Notes
    -----
    Returned data type depends on the `data_type` parameter in the next way:
    - `data_type`=="int" - all columns will be contain only integer values;
    - `data_type`=="str_int" some of the columns will be of string type;
    - `data_type`=="true_false_int" half of the columns will be filled with
      string values representing "true" and "false" values and another half - with
      integers.

    On a cache miss the freshly generated dict is returned while a ``weakdict``
    shallow copy of it is stored; on a cache hit the stored object itself is
    returned, so callers must not modify the result in place.
    """
    type_to_generator = {
        "int": gen_int_data,
        "str_int": gen_str_int_data,
        "true_false_int": gen_true_false_int_data,
    }

    cache_key = (data_type, nrows, ncols, rand_low, rand_high)
    if cache_key in data_cache:
        return data_cache[cache_key]

    logging.info(
        "Generating {} data {} rows and {} columns [{}-{}]".format(
            data_type, nrows, ncols, rand_low, rand_high
        )
    )
    assert data_type in type_to_generator
    data_generator = type_to_generator[data_type]

    data = data_generator(nrows, ncols, rand_low, rand_high)
    data_cache[cache_key] = weakdict(data)

    return data


def generate_dataframe(
    data_type: str,
    nrows: int,
    ncols: int,
    rand_low: int,
    rand_high: int,
    groupby_ncols: Optional[int] = None,
    count_groups: Optional[int] = None,
    gen_unique_key: bool = False,
    cache_prefix: Optional[str] = None,
    impl: Optional[str] = None,
) -> Union[modin.pandas.DataFrame, pandas.DataFrame]:
    """
    Generate DataFrame with caching.

    The generated dataframes are saved in the dictionary and on a subsequent call,
    if the keys match, one of the saved dataframes will be returned. Therefore, we need
    to carefully monitor that operations that change the dataframe work with its copy.

    Parameters
    ----------
    data_type : str
        Type of data generation;
        supported types: {"int", "str_int"}.
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.
    groupby_ncols : int, default: None
        Number of columns for which `groupby` will be called in the future;
        to get more stable performance results, we need to have the same number of values
        in each group every benchmarking time.
    count_groups : int, default: None
        Count of groups in groupby columns.
    gen_unique_key : bool, default: False
        Generate `col1` column where all elements are unique.
    cache_prefix : str, optional
        Prefix to add to the cache key of the requested frame.
    impl : str, optional
        Implementation used to create the dataframe;
        supported implemetations: {"modin", "pandas"}.

    Returns
    -------
    modin.pandas.DataFrame or pandas.DataFrame [and list]

    Notes
    -----
    The list of groupby columns names returns when groupby columns are generated.
    """
    assert not (
        (groupby_ncols is None) ^ (count_groups is None)
    ), "You must either specify both parameters 'groupby_ncols' and 'count_groups' or none of them."

    if groupby_ncols and count_groups:
        ncols -= groupby_ncols

    if impl is None:
        impl = ASV_USE_IMPL

    cache_key = (
        impl,
        data_type,
        nrows,
        ncols,
        rand_low,
        rand_high,
        groupby_ncols,
        count_groups,
        gen_unique_key,
    )

    if cache_prefix is not None:
        cache_key = (cache_prefix, *cache_key)

    if cache_key in dataframes_cache:
        return dataframes_cache[cache_key]

    logging.info(
        "Allocating {} DataFrame {}: {} rows and {} columns [{}-{}]".format(
            impl, data_type, nrows, ncols, rand_low, rand_high
        )
    )
    # Work on a shallow copy: on a cache hit `gen_data` hands back the very object
    # kept in `data_cache`, and the columns added below must not leak into it
    # (they would otherwise show up in every later frame built from that entry).
    data = dict(gen_data(data_type, nrows, ncols, rand_low, rand_high))

    if groupby_ncols and count_groups:
        groupby_columns = [f"groupby_col{x}" for x in range(groupby_ncols)]
        for groupby_col in groupby_columns:
            # each group id repeats `nrows // count_groups` times
            data[groupby_col] = np.tile(np.arange(count_groups), nrows // count_groups)

    if gen_unique_key:
        data["col1"] = np.arange(nrows)

    df = POSSIBLE_IMPL[impl].DataFrame(data)

    if groupby_ncols and count_groups:
        dataframes_cache[cache_key] = df, groupby_columns
        return df, groupby_columns

    dataframes_cache[cache_key] = df
    return df


def random_string() -> str:
    """
    Create a 36-character random string.

    Returns
    -------
    str
    """
    return str(uuid.uuid4())


def random_columns(df_columns: list, columns_number: int) -> list:
    """
    Pick sublist of random columns from a given sequence.

    Parameters
    ----------
    df_columns : list
        Columns to choose from.
    columns_number : int
        How many columns to pick.

    Returns
    -------
    list
    """
    return list(np.random.choice(df_columns, size=columns_number))


def random_booleans(number: int) -> list:
    """
    Create random list of booleans with `number` elements.

    Parameters
    ----------
    number : int
        Count of booleans in result list.

    Returns
    -------
    list
    """
    return list(np.random.choice([True, False], size=number))


def execute(df: Union[modin.pandas.DataFrame, pandas.DataFrame]):
    """
    Make sure the calculations are finished.

    Parameters
    ----------
    df : modin.pandas.DataFrame or pandas.Datarame
        DataFrame to be executed.

    Notes
    -----
    Nothing is done for the "pandas" implementation. For "modin", the partition
    manager's ``wait_partitions`` is used when it exists and there is at least one
    partition; otherwise call queues are drained and each partition is waited on
    with the engine-specific ``wait``.
    """
    if ASV_USE_IMPL == "modin":
        partitions = df._query_compiler._modin_frame._partitions.flatten()
        mgr_cls = df._query_compiler._modin_frame._partition_mgr_cls
        if len(partitions) and hasattr(mgr_cls, "wait_partitions"):
            mgr_cls.wait_partitions(partitions)
            return

        # compatibility with old Modin versions
        # Plain loops are used so that every partition is processed regardless of
        # what `drain_call_queue` / `wait` return.
        for partition in partitions:
            partition.drain_call_queue()
        if ASV_USE_ENGINE == "ray":
            from ray import wait

            for partition in partitions:
                wait([partition._data])
        elif ASV_USE_ENGINE == "dask":
            from dask.distributed import wait

            for partition in partitions:
                wait(partition._data)
        elif ASV_USE_ENGINE == "python":
            pass

    elif ASV_USE_IMPL == "pandas":
        pass


def get_shape_id(shape: tuple) -> str:
    """
    Join shape numbers into a string with `_` delimiters.

    Parameters
    ----------
    shape : tuple
        Same as pandas.Dataframe.shape.

    Returns
    -------
    str
    """
    return "_".join(str(element) for element in shape)


def prepare_io_data(test_filename: str, data_type: str, shapes: list):
    """
    Prepare data for IO tests with caching.

    Parameters
    ----------
    test_filename : str
        Unique file identifier that is used to distinguish data
        for different tests.
    data_type : {"int", "str_int", "true_false_int"}
        Type of data generation.
    shapes : list
        Data shapes to prepare.

    Returns
    -------
    test_filenames : dict
        Dictionary that maps dataset shape to the file on disk.
    """
    test_filenames = {}
    for shape in shapes:
        shape_id = get_shape_id(shape)
        test_filenames[shape_id] = f"{test_filename}_{shape_id}_{data_type}.csv"
        # files are always written with pandas, whatever implementation is benchmarked
        df = generate_dataframe(data_type, *shape, RAND_LOW, RAND_HIGH, impl="pandas")
        df.to_csv(test_filenames[shape_id], index=False)

    return test_filenames


def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
    """
    Prepare data for IO tests with caching.

    Parameters
    ----------
    test_filename : str
        Unique file identifier that is used to distinguish data
        for different tests.
    data_type : "str_int"
        Type of data generation.
    shapes : list
        Data shapes to prepare.

    Returns
    -------
    test_filenames : dict
        Dictionary that maps dataset shape to the file on disk.
    """
    test_filenames = {}
    for shape in shapes:
        shape_id = get_shape_id(shape)
        test_filenames[shape_id] = f"{test_filename}_{shape_id}_{data_type}.parquet"
        # files are always written with pandas, whatever implementation is benchmarked
        df = generate_dataframe(data_type, *shape, RAND_LOW, RAND_HIGH, impl="pandas")
        df.to_parquet(test_filenames[shape_id], index=False)

    return test_filenames


def setup(*args, **kwargs):  # noqa: GL08
    # This function just needs to be imported into each benchmark file to
    # set up the random seed before each function. ASV run it automatically.
    # https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    np.random.seed(42)

================================================
FILE: asv_bench/benchmarks/utils/compatibility.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Compatibility layer for parameters used by ASV.""" import os import modin.pandas as pd try: from modin.config import NPartitions NPARTITIONS = NPartitions.get() except ImportError: NPARTITIONS = pd.DEFAULT_NPARTITIONS try: from modin.config import AsvImplementation, Engine, StorageFormat, TestDatasetSize ASV_USE_IMPL = AsvImplementation.get() ASV_DATASET_SIZE = TestDatasetSize.get() or "Small" ASV_USE_ENGINE = Engine.get() ASV_USE_STORAGE_FORMAT = StorageFormat.get() except ImportError: # The same benchmarking code can be run for different versions of Modin, so in # case of an error importing important variables, we'll just use predefined values ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin") ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small") ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray") ASV_USE_STORAGE_FORMAT = os.environ.get("MODIN_STORAGE_FORMAT", "Pandas") ASV_USE_IMPL = ASV_USE_IMPL.lower() ASV_DATASET_SIZE = ASV_DATASET_SIZE.lower() ASV_USE_ENGINE = ASV_USE_ENGINE.lower() ASV_USE_STORAGE_FORMAT = ASV_USE_STORAGE_FORMAT.lower() assert ASV_USE_IMPL in ("modin", "pandas") assert ASV_DATASET_SIZE in ("big", "small") assert ASV_USE_ENGINE in ("ray", "dask", "python", "unidist") assert ASV_USE_STORAGE_FORMAT in ("pandas") ================================================ FILE: asv_bench/benchmarks/utils/data_shapes.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Define data shapes.""" import json import os from .compatibility import ASV_DATASET_SIZE RAND_LOW = 0 RAND_HIGH = 100 BINARY_OP_DATA_SIZE = { "big": [ [[5000, 5000], [5000, 5000]], # the case extremely inefficient # [[20, 500_000], [10, 1_000_000]], [[500_000, 20], [1_000_000, 10]], ], "small": [[[250, 250], [250, 250]], [[10_000, 20], [25_000, 10]]], } UNARY_OP_DATA_SIZE = { "big": [ [5000, 5000], # the case extremely inefficient # [10, 1_000_000], [1_000_000, 10], ], "small": [[250, 250], [10_000, 10]], } SERIES_DATA_SIZE = { "big": [[100_000, 1]], "small": [[10_000, 1]], } BINARY_OP_SERIES_DATA_SIZE = { "big": [ [[500_000, 1], [1_000_000, 1]], [[500_000, 1], [500_000, 1]], ], "small": [[[5_000, 1], [10_000, 1]]], } DEFAULT_GROUPBY_NGROUPS = { "big": [100, "huge_amount_groups"], "small": [5], } GROUPBY_NGROUPS = DEFAULT_GROUPBY_NGROUPS[ASV_DATASET_SIZE] _DEFAULT_CONFIG_T = [ ( UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [ # Pandas storage format benchmarks "TimeGroupByMultiColumn", "TimeGroupByDefaultAggregations", "TimeGroupByDictionaryAggregation", "TimeSetItem", "TimeInsert", "TimeArithmetic", "TimeSortValues", "TimeDrop", "TimeHead", "TimeTail", "TimeExplode", "TimeFillna", "TimeFillnaDataFrame", "TimeValueCountsFrame", "TimeValueCountsSeries", "TimeIndexing", "TimeMultiIndexing", 
"TimeResetIndex", "TimeAstype", "TimeDescribe", "TimeProperties", "TimeReindex", "TimeReindexMethod", "TimeFillnaMethodDataframe", "TimeDropDuplicatesDataframe", "TimeStack", "TimeUnstack", "TimeRepr", "TimeMaskBool", "TimeIsnull", "TimeDropna", "TimeEquals", # IO benchmarks "TimeReadCsvSkiprows", "TimeReadCsvTrueFalseValues", "TimeReadCsvNamesDtype", "TimeReadParquet", # Scalability benchmarks "TimeFromPandas", "TimeToPandas", "TimeToNumPy", ], ), ( BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [ # Pandas storage format benchmarks "TimeJoin", "TimeMerge", "TimeMergeDefault", "TimeConcat", "TimeAppend", "TimeBinaryOp", "TimeLevelAlign", ], ), ( SERIES_DATA_SIZE[ASV_DATASET_SIZE], [ # Pandas storage format benchmarks "TimeFillnaSeries", "TimeGroups", "TimeIndexingNumericSeries", "TimeFillnaMethodSeries", "TimeDatetimeAccessor", "TimeSetCategories", "TimeRemoveCategories", "TimeDropDuplicatesSeries", ], ), ( BINARY_OP_SERIES_DATA_SIZE[ASV_DATASET_SIZE], [ # Pandas storage format benchmarks "TimeBinaryOpSeries", ], ), ] DEFAULT_CONFIG = {} DEFAULT_CONFIG["MergeCategoricals"] = ( [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]] ) DEFAULT_CONFIG["TimeJoinStringIndex"] = ( [[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]] ) DEFAULT_CONFIG["TimeReplace"] = ( [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]] ) for config in (_DEFAULT_CONFIG_T,): for _shape, _names in config: DEFAULT_CONFIG.update({_name: _shape for _name in _names}) # Correct shapes in the case when the operation ended with a timeout error if ASV_DATASET_SIZE == "big": DEFAULT_CONFIG["TimeMergeDefault"] = [ [[1000, 1000], [1000, 1000]], [[500_000, 20], [1_000_000, 10]], ] DEFAULT_CONFIG["TimeLevelAlign"] = [ [[2500, 2500], [2500, 2500]], [[250_000, 20], [500_000, 10]], ] DEFAULT_CONFIG["TimeStack"] = [ [1500, 1500], [100_000, 10], ] DEFAULT_CONFIG["TimeUnstack"] = DEFAULT_CONFIG["TimeStack"] CONFIG_FROM_FILE = None def get_benchmark_shapes(bench_id: str): """ Get custom 
benchmark shapes from a json file stored in MODIN_ASV_DATASIZE_CONFIG. If `bench_id` benchmark is not found in the file, then the default value will be used. Parameters ---------- bench_id : str Unique benchmark identifier that is used to get shapes. Returns ------- list Benchmark shapes. """ global CONFIG_FROM_FILE if not CONFIG_FROM_FILE: try: from modin.config import AsvDataSizeConfig filename = AsvDataSizeConfig.get() except ImportError: filename = os.environ.get("MODIN_ASV_DATASIZE_CONFIG", None) if filename: # should be json with open(filename) as _f: CONFIG_FROM_FILE = json.load(_f) if CONFIG_FROM_FILE and bench_id in CONFIG_FROM_FILE: return CONFIG_FROM_FILE[bench_id] return DEFAULT_CONFIG[bench_id] ================================================ FILE: asv_bench/test/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: asv_bench/test/test_utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from unittest.mock import Mock, mock_open, patch

import numpy as np
import pytest
from benchmarks.utils import data_shapes, execute, get_benchmark_shapes

import modin.pandas as pd
from modin.config import AsvDataSizeConfig


@pytest.mark.parametrize(
    "asv_config_content, result",
    [
        (
            '{"TimeJoin": [[[10, 10], [15, 15]], [[11, 11], [13, 13]]], \
                "TimeGroupBy": [[11, 11], [13, 13]]}',
            [
                [
                    # binary shapes
                    [[10, 10], [15, 15]],
                    [[11, 11], [13, 13]],
                ],
                [
                    # unary shapes
                    [11, 11],
                    [13, 13],
                ],
            ],
        ),
    ],
)
# Start from an empty loaded config so that `get_benchmark_shapes` reads the file.
@patch.object(data_shapes, "CONFIG_FROM_FILE", new=None)
def test_get_benchmark_shapes(asv_config_content, result):
    """Check that shapes listed in the configured json file are returned as is."""
    AsvDataSizeConfig.put("mock_filename")
    # `open` is mocked, so "mock_filename" never has to exist on disk.
    with patch("builtins.open", mock_open(read_data=asv_config_content)):
        assert result[0] == get_benchmark_shapes("TimeJoin")
        assert result[1] == get_benchmark_shapes("TimeGroupBy")


@pytest.mark.parametrize(
    "asv_config_content, result",
    [
        (
            '{"TimeJoin": [[[10, 10], [15, 15]]]',
            [[100, 100]],
        ),
    ],
)
@patch.object(data_shapes, "CONFIG_FROM_FILE", new=None)
def test_get_benchmark_shapes_default(asv_config_content, result):
    """Check the fallback to ``DEFAULT_CONFIG`` when no config file is set."""
    # `asv_config_content` is not read by this test body.
    AsvDataSizeConfig.put(None)
    with patch.object(data_shapes, "DEFAULT_CONFIG", new={"TimeJoin": result}):
        assert result == get_benchmark_shapes("TimeJoin")


def test_execute():
    """Check that `execute` passes the frame's partitions to ``wait_partitions`` once."""
    df = pd.DataFrame(np.random.rand(100, 64))
    partitions = df._query_compiler._modin_frame._partitions.flatten()
    mgr_cls = df._query_compiler._modin_frame._partition_mgr_cls

    with patch.object(mgr_cls, "wait_partitions", new=Mock()):
        execute(df)
        mgr_cls.wait_partitions.assert_called_once()
        assert (mgr_cls.wait_partitions.call_args[0] == partitions).all()

================================================
FILE: ci/teamcity/Dockerfile.teamcity-ci
================================================
# Create images from this container like this (in modin repo root):
#
# git rev-parse HEAD > ci/teamcity/git-rev
#
# tar cf ci/teamcity/modin.tar .
#
# docker build --build-arg ENVIRONMENT=environment-dev.yml -t modin-project/teamcity-ci:${BUILD_NUMBER} -f ci/teamcity/Dockerfile.teamcity-ci ci/teamcity

FROM rayproject/ray:latest
ARG ENVIRONMENT=environment-dev.yml

ADD modin.tar /modin
ADD git-rev /modin/git-rev
WORKDIR /modin
RUN sudo chown -R ray /modin

# Make RUN commands use `bash --login`:
SHELL ["/bin/bash", "--login", "-c"]

# Initialize conda in bash config files:
RUN conda init bash
ENV PATH /home/ray/anaconda3/envs/modin/bin:$PATH

RUN conda config --set channel_priority strict
RUN conda update python -y
RUN conda env create -f ${ENVIRONMENT}
RUN conda install curl PyGithub

# Activate the environment, and make sure it's activated:
# The following line also removed conda initialization from
# ~/.bashrc so conda starts complaining that it should be
# initialized for bash. But it is necessary to do it because
# activation is not always executed when "docker exec" is used
# and then conda initialization overwrites PATH with its base
# environment where python doesn't have any packages installed.
RUN echo "conda activate modin" > ~/.bashrc
RUN echo "Make sure environment is activated"
RUN conda list -n modin

================================================
FILE: ci/teamcity/build-docker.py
================================================
import os
import sys


def execute_command(cmd):
    """
    Run `cmd` through ``os.system`` and stop the script if it fails.

    Parameters
    ----------
    cmd : str
        Command line to run.

    Raises
    ------
    SystemExit
        If the exit status of the command is not 0.
    """
    status = os.system(cmd)
    # `os.system` returns a wait status; extract the exit code from it
    ec = os.WEXITSTATUS(status)
    if ec != 0:
        raise SystemExit('Command "{}" failed'.format(cmd))


if sys.platform.startswith("linux"):
    # Record the current commit and archive it for the image build
    execute_command("git rev-parse HEAD > git-rev")
    execute_command(
        "(cd ../.. && git archive -o ci/teamcity/modin.tar $(cat ci/teamcity/git-rev))"
    )
    base_image = "ray-project/deploy"
    requirements = "requirements-dev.txt"
    execute_command(
        "docker build -f Dockerfile.modin-base --build-arg BASE_IMAGE={} -t modin-project/modin-base .".format(
            base_image
        )
    )
else:
    raise SystemExit(
        "TeamCity CI in Docker containers is supported only on Linux at the moment."
    )

# NOTE(review): this passes a REQUIREMENTS build arg, while Dockerfile.teamcity-ci
# declares `ARG ENVIRONMENT` - confirm which one is intended.
execute_command(
    "docker build -f Dockerfile.teamcity-ci --build-arg REQUIREMENTS={} -t modin-project/teamcity-ci .".format(
        requirements
    )
)

# Remove the temporary build inputs; not reached if a command above failed,
# because `execute_command` raises SystemExit.
if sys.platform.startswith("linux"):
    execute_command("rm ./modin.tar ./git-rev")

================================================
FILE: ci/teamcity/comment_on_pr.py
================================================
"""
Post the comment like the following to the PR:
```
:robot: TeamCity test results bot :robot:
```
"""

import os
import sys

from github import Github

# Check if this is a pull request or not based on the environment variable
try:
    pr_id = int(os.environ["GITHUB_PR_NUMBER"].split("/")[-1])
except Exception:
    # no usable PR number - nothing to comment on
    sys.exit(0)

engine = os.environ["MODIN_ENGINE"]

header = """

TeamCity {} test results bot

\n\n""".format( engine.title() ) if engine == "ray": pytest_outputs = ["ray_tests.log"] elif engine == "dask": pytest_outputs = ["dask_tests.log"] elif engine == "python": pytest_outputs = ["python_tests.log"] else: raise Exception("Unknown Engine, set `MODIN_ENGINE` environment variable") full_comment = "" # Do not include coverage info in PR comment split_by_first = ( "----------- coverage: platform linux, python 3.7.5-final-0 -----------" ) split_by_second = "--------------------------------------------------------------------------------------" tests_failed = False for out in pytest_outputs: content = open(out, "r").read() full_comment += "".join( "".join( [ i.split(split_by_first)[0], i.split(split_by_first)[-1].split(split_by_second)[-1], ] ) for i in content.split("+ python3 -m pytest ") ) tests_failed = tests_failed or ("FAILURES" in full_comment) if len(full_comment) > 65_000: full_comment = ( full_comment[-65_000:] + "\n\nRemaining output truncated\n\n" ) full_comment = "
Tests Logs\n\n\n```\n" + full_comment full_comment += "\n```\n\n
\n" if not tests_failed: header += '

Tests PASSed

\n\n' else: header += '

Tests FAILed

\n\n' full_comment = header + full_comment token = os.environ["GITHUB_TOKEN"] g = Github(token) repo = g.get_repo("modin-project/modin") pr = repo.get_pull(pr_id) if any( i.user.login == "modin-bot" and "TeamCity {} test results bot".format(engine).lower() in i.body.lower() for i in pr.get_issue_comments() ): pr_comment_list = [ i for i in list(pr.get_issue_comments()) if i.user.login == "modin-bot" and "TeamCity {} test results bot".format(engine).lower() in i.body.lower() ] assert len(pr_comment_list) == 1, "Too many comments from modin-bot already" pr_comment_list[0].edit(full_comment) else: pr.create_issue_comment(full_comment) ================================================ FILE: codecov.yml ================================================ comment: false coverage: status: project: default: branches: - main target: 85% patch: default: target: 30% ================================================ FILE: contributing/contributing.md ================================================ # Modin dev onboarding 1. [Set up git](https://docs.github.com/en/get-started/quickstart/set-up-git) 1. [install anaconda](https://www.anaconda.com/products/individual#macos). Once installed, you should reopen your terminal to find "(base)" next to your prompt: ![](conda_prompt.png) 1. [Generate an SSH key](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent) for GitHub 1. Fork the [modin repo](https://github.com/modin-project/modin) on GitHub 1. Clone the forked repo in a local directory of your choice: ``` git clone ${PATH_TO_REPO} ``` where the path can be found here: ![](clone_my_modin.png) 4. Inside the cloned "modin" directory, add a remote branch called "upstream": ``` git remote add upstream git@github.com:modin-project/modin.git ``` where the upstream link comes from here: ![](clone_upstream_modin.png) 1. Fetch the upstream branch: ``` git fetch upstream ``` 1. 
Set the default remote branch for your local main branch. ``` git branch --set-upstream-to=upstream/main main ``` 1. Install modin from local source code, and install all its dependencies: ``` pip install -e ".[all]" ``` 1. Install ipython: ``` pip install ipython ``` 1. If you ever want to install modin at a release version (not the editable version from your machine): ``` pip install modin ``` 1. If you want a specific version: ``` pip install modin==0.11 ``` 1. To upgrade modin to the newest available version: ``` pip install -U modin ``` 1. Now go back to local modin. ``` pip install -e . ``` 1. Try out modin in ipython: ``` ipython import modin modin.__version__ ``` You should see the Modin version, which consists of the version, the last commit number, and the last commit hash. 1. List Modin versions: ``` git tag ``` 1. Get a summary of a particular release: ``` git tag -l --format='%(contents)' 0.11.0 ``` 1. Check out the developer requirements in `requirements-dev.txt`. Install them with: ``` pip install -r requirements-dev.txt ``` 1. Try a unit test: ``` pytest modin/tests/pandas/test_concat.py ``` 1. [Add a GPG key](https://docs.github.com/en/authentication/managing-commit-signature-verification/adding-a-new-gpg-key-to-your-github-account) to your GitHub account. Your commits need to be signed with a GPG key. For macOS, you can use [Mac GPG](https://gpgtools.org/). 1. (Optional) We recommend a few workflow settings: 1. If you use Visual Studio Code, auto-format with [black](https://black.readthedocs.io/en/stable/) every time you save changes: 1. Install [Microsoft's Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) 1. Open your VSCode settings, in `Code -> Preferences -> Settings`. 1. Search for "python formatting provider" and select "black" from the dropdown menu. 1. Again in settings, search for "format on save" and enable the "Editor: Format on Save" option. 2. Add a pre-commit hook: 1. 
In your modin repository, copy [this pre-commit file](pre-commit) to `.git/hooks/pre-commit` 1. Every time you try to commit, git will try to run black, isort, flake8 and mypy, and abort the commit if any of them fails. This lets you make sure your commits pass these checks before you push to GitHub. 1. To bypass the pre-commit hook (e.g. if you don't want to create a pull request, or you already know your code will pass the tests), commit with the flag `--no-verify`. ================================================ FILE: contributing/pre-commit ================================================ #!/bin/sh # # Called by "git commit" with no arguments. The hook should # exit with non-zero status after issuing an appropriate message if # it wants to stop the commit. # set -e printf "running black. This script will preempt the commit if black fails.\n" black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py printf 'black passed!\n' printf "running isort. This script will preempt the commit if isort fails.\n" isort . --check-only printf 'isort passed!\n' printf "running flake8. This script will preempt the commit if flake8 fails.\n" flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py printf "flake8 passed!\n" printf "running mypy. 
This script will preempt the commit if mypy fails.\n" mypy --config-file mypy.ini printf "mypy passed!\n" printf "pre-commit hook finished!\n" ================================================ FILE: docker/Dockerfile ================================================ FROM continuumio/miniconda3 RUN conda install -c conda-forge psutil setproctitle RUN pip install modin ================================================ FILE: docs/_static/custom.js ================================================ document.addEventListener("DOMContentLoaded", function () { var script = document.createElement("script"); script.type = "module"; script.id = "runllm-widget-script" script.src = "https://widget.runllm.com"; script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-name", "Modin"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); script.setAttribute("runllm-assistant-id", "164"); script.async = true; document.head.appendChild(script); }); ================================================ FILE: docs/_templates/layout.html ================================================ {% extends "!layout.html" %} {% block footer %} {{ super() }} {% endblock %} ================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/stable/config import os # -- Project information ----------------------------------------------------- import sys import types import ray # stub ray.remote to be a no-op so it doesn't shadow docstrings def noop_decorator(*args, **kwargs): if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): # This is the case where the decorator is just @ray.remote without parameters. 
return args[0] return lambda cls_or_func: cls_or_func ray.remote = noop_decorator # fake modules if they're missing for mod_name in ( "xgboost", "unidist", "unidist.config", ): try: __import__(mod_name) except ImportError: sys.modules[mod_name] = types.ModuleType( mod_name, f"fake {mod_name} for building docs" ) if not hasattr(sys.modules["xgboost"], "Booster"): sys.modules["xgboost"].Booster = type("Booster", (object,), {}) if not hasattr(sys.modules["unidist"], "remote"): sys.modules["unidist"].remote = noop_decorator if not hasattr(sys.modules["unidist"], "core"): sys.modules["unidist"].core = type("core", (object,), {}) if not hasattr(sys.modules["unidist"].core, "base"): sys.modules["unidist"].core.base = type("base", (object,), {}) if not hasattr(sys.modules["unidist"].core.base, "object_ref"): sys.modules["unidist"].core.base.object_ref = type("object_ref", (object,), {}) if not hasattr(sys.modules["unidist"].core.base.object_ref, "ObjectRef"): sys.modules["unidist"].core.base.object_ref.ObjectRef = type("ObjectRef", (object,), {}) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import modin from modin.config.__main__ import export_config_help configs_file_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "flow/modin/configs_help.csv") ) # Export configs help to create configs table in the docs/flow/modin/config.rst export_config_help(configs_file_path) project = "Modin" copyright = "2018-2024, Modin Developers." author = "Modin contributors" # The short X.Y version version = "{}".format(modin.__version__) # The full version, including alpha/beta/rc tags release = version # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
# Sphinx extension modules to load, given as importable module names.
# Entries are either bundled with Sphinx ('sphinx.ext.*') or custom ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx.ext.intersphinx",
    "sphinx.ext.todo",
    "sphinx.ext.mathjax",
    "sphinx.ext.githubpages",
    "sphinx.ext.graphviz",
    "sphinxcontrib.plantuml",
    "sphinx_issues",
]

# Directories (relative to this file) searched for templates.
templates_path = ["_templates"]

# Suffix of source files; a list such as ['.rst', '.md'] is also accepted.
source_suffix = ".rst"

# Name of the document that holds the master toctree.
master_doc = "index"

# Language of the content Sphinx generates on its own.  It is also used for
# gettext-based translation, where it is usually set from the command line.
language = "en"

# Patterns, relative to the source directory, that are skipped when looking
# for source files.  They affect html_static_path and html_extra_path too.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

html_static_path = ["_static"]
html_js_files = ["custom.js"]

# Pygments style used for syntax highlighting.
pygments_style = "sphinx"

# -- Options for HTML output -------------------------------------------------

# Maps git branches to Sphinx themes
default_html_theme = "pydata_sphinx_theme"
current_branch = "nature"

# Theme used for HTML and HTML Help pages (see the Sphinx documentation for
# the list of builtin themes), followed by the branding assets.
html_theme = "pydata_sphinx_theme"
html_favicon = "img/MODIN_ver2.ico"
html_logo = "img/MODIN_ver2.png"
html_context = {"default_mode": "light"}

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
# html_theme_options = { "navbar_end": ["navbar-icon-links"], "sidebarwidth": 270, "collapse_navigation": False, "navigation_depth": 4, "show_toc_level": 2, "github_url": "https://github.com/modin-project/modin", "icon_links": [ { "name": "PyPI", "url": "https://pypi.org/project/modin", "icon": "fab fa-python", }, { "name": "conda-forge", "url": "https://anaconda.org/conda-forge/modin", "icon": "fas fa-circle-notch", }, { "name": "Join the Slack", "url": "https://modin.org/slack.html", "icon": "fab fa-slack", }, { "name": "Mailing List", "url": "https://groups.google.com/forum/#!forum/modin-dev", "icon": "fas fa-envelope-square", }, ], "navigation_with_keys": True, } # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # The default pydata_sphinx_theme sidebar templates are # sidebar-nav-bs.html and search-field.html. html_sidebars = {} issues_github_path = "modin-project/modin" ================================================ FILE: docs/contact.rst ================================================ Contact ======= Slack ----- Join our `Slack`_ community to connect with Modin users and contributors, discuss, and ask questions about all things Modin-related. Mailing List ------------ General questions, potential contributors, and ideas can be directed to the `developer mailing list`_. It is an open Google Group, so feel free to join anytime! If you are unsure about where to ask or post something, the mailing list is a good place to ask as well. Issues ------ Bug reports and feature requests can be directed to the issues_ page of the Modin GitHub repo. .. _Slack: https://modin.org/slack.html .. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev .. 
_issues: https://github.com/modin-project/modin/issues ================================================ FILE: docs/development/architecture.rst ================================================ System Architecture =================== In this section, we will lay out the overall system architecture for Modin, as well as go into detail about the component design, implementation and other important details. This document also contains important reference information for those interested in contributing new functionality, bugfixes and enhancements. High-Level Architectural View ----------------------------- The diagram below outlines the general layered view to the components of Modin with a short description of each major section of the documentation following. .. image:: /img/modin_architecture.png :align: center Modin is logically separated into different layers that represent the hierarchy of a typical Database Management System. Abstracting out each component allows us to individually optimize and swap out components without affecting the rest of the system. We can implement, for example, new compute kernels that are optimized for a certain type of data and can simply plug it in to the existing infrastructure by implementing a small interface. It can still be distributed by our choice of compute engine with the logic internally. System View ----------- A top-down view of Modin’s architecture is detailed below: .. image:: /img/10000_meter.png :align: center The user - Data Scientist interacts with the Modin system by sending interactive or batch commands through API and Modin executes them using various execution engines: Ray, Dask and MPI are currently supported. Subsystem/Container View ------------------------ If we click down to the next level of details we will see that inside Modin the layered architecture is implemented using several interacting components: .. 
image:: /img/component_view.png :align: center For simplicity, the other execution systems (Dask and MPI) are omitted and only Ray execution is shown. * Dataframe subsystem is the backbone of the dataframe holding and query compilation. It is responsible for dispatching the ingress/egress to the appropriate module, getting the pandas API and calling the query compiler to convert calls to the internal intermediate Dataframe Algebra. * Data Ingress/Egress Module is working in conjunction with Dataframe and Partitions subsystem to read data split into partitions and send data into the appropriate node for storing. * Query Planner is the subsystem that translates the pandas API to intermediate Dataframe Algebra representation DAG and performs an initial set of optimizations. * Query Executor is responsible for getting the Dataframe Algebra DAG, performing further optimizations based on a selected storage format and mapping or compiling the Dataframe Algebra DAG to an actual execution sequence. * Storage formats module is responsible for mapping the abstract operation to an actual executor call, e.g. pandas, custom format. * Orchestration subsystem is responsible for spawning and controlling the actual execution environment for the selected execution. It spawns the actual nodes, fires up the execution environment, e.g. Ray, monitors the state of executors and provides telemetry. Component View -------------- User queries which perform data transformation, data ingress or data egress pass through the Modin components detailed below. The path the query takes is mostly similar across execution systems. Data Transformation ''''''''''''''''''' .. image:: /img/generic_data_transform.svg :align: center Query Compiler """""""""""""" The :ref:`Query Compiler ` receives queries from the pandas API layer. The API layer is responsible for ensuring a clean input to the Query Compiler. 
The Query Compiler must have knowledge of the compute kernels and in-memory format of the data in order to efficiently compile the query. The Query Compiler is responsible for sending the compiled query to the Core Modin Dataframe. In this design, the Query Compiler does not have information about where or when the query will be executed, and gives the control of the partition layout to the Modin Dataframe. In the interest of reducing the pandas API, the Query Compiler layer closely follows the pandas API, but cuts out a large majority of the repetition. .. _auto-switch architecture: Automatic Engine Switching and Casting """""""""""""""""""""""""""""""""""""" QueryCompilers which are derived from QueryCompilerCaster can participate in automatic casting when different query compilers, representing different underlying engines, are used together in a function. A relative "cost" of casting is used to determine which query compiler everything should be moved to. Each query compiler must implement the functions `move_to_cost`, `move_to_me_cost`, `max_cost` and `stay_cost` to provide information and query costs associated with different decision points in cost optimization. With the exception of `max_cost`, these methods need to return a QCCoercionCost in the range of 0-1000. These functions have precise meanings: * `move_to_cost` is the transmission cost of moving the data, including known serialization costs from the perspective of that particular compiler. Colloquially, the question being asked of the query compiler is, "What is the normalized cost of moving my data to the other engine?" * `move_to_me_cost` is the execution cost for the data and operation on the proposed *destination* query compiler. Since this method is called before the data has been migrated, this is a class method and the destination query_compiler may have very limited information on the possible cost after migration. 
Factors that may be considered here include available memory, CPU, and the unique characteristics of the engine. The question being asked is, "If this data were moved to me, what would be the normalized execution cost to perform that operation?" * `stay_cost` is the execution cost on the current query compiler (where the data is). The question asked of the query compiler is, "If I were to keep this data on my engine, what would be the normalized execution cost?" * `max_cost` is the maximum cost allowed by this query compiler across all data movements. This method sets a normalized upper bound for situations where multiple data frames from different engines all need to move to the same engine. The value returned by this method can exceed QCCoercionCost.COST_IMPOSSIBLE. There are generally two places where automatic casting is considered: when two or more DataFrames on different engines are participating in an operation (such as pd.concat) or at registered functions for particular engines through the `register_function_for_pre_op_switch` and `register_function_for_post_op_switch` methods. Core Modin Dataframe """""""""""""""""""" At this layer, operations can be performed lazily. Currently, Modin executes most operations eagerly in an attempt to behave as pandas does. Some operations, e.g. ``transpose``, are expensive and create full copies of the data in-memory. In these cases, we can wait until another operation triggers computation. In the future, we plan to add additional query planning and laziness to Modin to ensure that queries are performed efficiently. The structure of the Core Modin Dataframe is extensible, such that any operation that could be better optimized for a given execution can be overridden and optimized in that way. This layer has a significantly reduced API from the QueryCompiler and the user-facing API. Each of these APIs represents a single way of performing a given operation or behavior. 
Core Modin Dataframe API """""""""""""""""""""""" More documentation can be found internally in the code_. This API is not complete, but represents an overwhelming majority of operations and behaviors. This API can be implemented by other distributed/parallel DataFrame libraries and plugged in to Modin as well. Create an issue_ or discuss on our `Slack `_ for more information! The :doc:`Core Modin Dataframe ` is responsible for the data layout and shuffling, partitioning, and serializing the tasks that get sent to each partition. Other implementations of the Modin Dataframe interface will have to handle these as well. Partition Manager """"""""""""""""" The Partition Manager can change the size and shape of the partitions based on the type of operation. For example, certain operations are complex and require access to an entire column or row. The Partition Manager can convert the block partitions to row partitions or column partitions. This gives Modin the flexibility to perform operations that are difficult in row-only or column-only partitioning schemas. Another important component of the Partition Manager is the serialization and shipment of compiled queries to the Partitions. It maintains metadata for the length and width of each partition, so when operations only need to operate on or extract a subset of the data, it can ship those queries directly to the correct partition. This is particularly important for some operations in pandas which can accept different arguments and operations for different columns, e.g. ``fillna`` with a dictionary. This abstraction separates the actual data movement and function application from the Dataframe layer to keep the Core Dataframe API small and separately optimize the data movement and metadata management. Partitions """""""""" Partitions are responsible for managing a subset of the Dataframe. As mentioned below, the Dataframe is partitioned both row and column-wise. 
This gives Modin scalability in both directions and flexibility in data layout. There are a number of optimizations in Modin that are implemented in the partitions. Partitions are specific to the execution framework and in-memory format of the data, allowing Modin to exploit potential optimizations across both. These optimizations are explained further on the pages specific to the execution framework. Execution Engine '''''''''''''''' This layer performs computation on partitions of the data. The Modin Dataframe is designed to work with `task parallel`_ frameworks, but integration with data parallel frameworks should be possible with some effort. Storage Format '''''''''''''' The :doc:`storage format ` describes the in-memory partition type. The base storage format in Modin is pandas. In the default case, the Modin Dataframe operates on partitions that contain ``pandas.DataFrame`` objects. Data Ingress '''''''''''' .. note:: Data ingress operations (e.g. ``read_csv``) in Modin load data from the source into partitions and vice versa for data egress (e.g. ``to_csv``) operation. Improved performance is achieved by reading/writing in partitions in parallel. Data ingress starts with a function in the pandas API layer (e.g. ``read_csv``). Then the user's query is passed to the :doc:`Factory Dispatcher `, which defines a factory specific for the execution. The factory for execution contains an IO class (e.g. ``PandasOnRayIO``) whose responsibility is to perform a parallel read/write from/to a file. This IO class contains class methods with interfaces and names that are similar to pandas IO functions (e.g. ``PandasOnRayIO.read_csv``). The IO class declares the Modin Dataframe and Query Compiler classes specific for the execution engine and storage format to ensure the correct object is constructed. 
It also declares IO methods that are mix-ins containing a combination of the engine-specific class for deploying remote tasks, the class for parsing the given file format and the class handling the chunking of the format-specific file on the head node (see dispatcher classes implementation :doc:`details `). The output from the IO class data ingress function is a :doc:`Modin Dataframe `. .. image:: /img/generic_data_ingress.svg :align: center Data Egress ''''''''''' Data egress operations (e.g. ``to_csv``) are similar to data ingress operations up to execution-specific IO class functions construction. Data egress functions of the IO class are defined slightly differently from data ingress functions and are created specifically for the engine, since partitions already have information about their storage format. Using the IO class, data is exported from partitions to the target file. .. image:: /img/generic_data_egress.svg :align: center Supported Execution Engines and Storage Formats ''''''''''''''''''''''''''''''''''''''''''''''' This is a list of execution engines and in-memory formats supported in Modin. If you would like to contribute a new execution engine or in-memory format, please see the documentation page on :doc:`contributing `. - :doc:`pandas on Ray ` - Uses the Ray_ execution framework. - The storage format is `pandas` and the in-memory partition type is a pandas DataFrame. - For more information on the execution path, see the :doc:`pandas on Ray ` page. - :doc:`pandas on Dask ` - Uses the `Dask Futures`_ execution framework. - The storage format is `pandas` and the in-memory partition type is a pandas DataFrame. - For more information on the execution path, see the :doc:`pandas on Dask ` page. - :doc:`pandas on MPI ` - Uses MPI_ through the Unidist_ execution framework. - The storage format is `pandas` and the in-memory partition type is a pandas DataFrame. - For more information on the execution path, see the :doc:`pandas on Unidist ` page. 
- :doc:`pandas on Python ` - Uses native python execution - mainly used for debugging. - The storage format is `pandas` and the in-memory partition type is a pandas DataFrame. - For more information on the execution path, see the :doc:`pandas on Python ` page. - pandas on Snowflake - Uses the Snowpark Python library to transpile pandas API calls to SQL queries. - The storage format is the custom-defined `Snowflake` format; data remains within Snowflake warehouses until retrieved by pandas API calls. - For more information on pandas on Snowflake, refer to Snowflake's `documentation `_ (external link). .. _directory-tree: DataFrame Partitioning ---------------------- The Modin DataFrame architecture follows in the footsteps of modern architectures for database and high performance matrix systems. We chose a partitioning schema that partitions along both columns and rows because it gives Modin flexibility and scalability in both the number of columns and the number of rows. The following figure illustrates this concept. .. image:: /img/block_partitions_diagram.png :align: center Currently, the main in-memory format of each partition is a `pandas DataFrame`_ (:doc:`pandas storage format `). Index ----- We currently use the ``pandas.Index`` object for indexing both columns and rows. In the future, we will implement a distributed, pandas-compatible Index object in order to remove this scaling limitation from the system. Most workloads will not be affected by this scalability limit since it only appears when operating on more than tens of billions of columns or rows. **Important note**: If you are using the default index (``pandas.RangeIndex``) there is a fixed memory overhead (~200 bytes) and there will be no scalability issues with the index. API --- The API is the outer-most layer that faces users. The following classes contain Modin's implementation of the pandas API: .. 
toctree:: /flow/modin/pandas/base /flow/modin/pandas/dataframe /flow/modin/pandas/series Module/Class View ----------------- Modin's modules layout is shown below. Click on the links to deep dive into Modin's internal implementation details. The documentation covers most modules, with more docs being added everyday! .. parsed-literal:: ├───.github ├───asv_bench ├───ci ├───docker ├───docs ├───examples ├───modin │ ├─── :doc:`config ` | ├─── :doc:`utils ` │ ├───core │ │ ├─── :doc:`dataframe ` │ │ │ ├─── :doc:`algebra ` │ │ │ ├─── :doc:`base ` │ │ │ └─── :doc:`pandas ` │ │ ├───execution │ │ │ ├───dask │ │ │ │ ├───common │ │ │ │ └───implementations │ │ │ │ └─── :doc:`pandas_on_dask ` │ │ │ ├─── :doc:`dispatching ` │ │ │ ├───python │ │ │ │ └───implementations │ │ │ │ └─── :doc:`pandas_on_python ` │ │ │ ├───ray │ │ │ │ ├───common │ │ │ │ ├─── :doc:`generic ` │ │ │ │ └───implementations │ │ │ │ └─── :doc:`pandas_on_ray ` │ │ │ └───unidist │ │ │ ├───common │ │ │ ├─── :doc:`generic ` │ │ │ └───implementations │ │ │ └─── :doc:`pandas_on_unidist ` │ │ ├─── :doc:`io ` │ │ └─── :doc:`storage_formats ` │ │ ├─── :doc:`base ` │ │ └─── :doc:`pandas ` │ ├───distributed │ │ ├───dataframe │ │ │ └─── :doc:`pandas ` │ ├─── :doc:`experimental ` │ │ ├───core | | | └─── :doc:`io ` │ │ ├─── :doc:`pandas ` │ │ ├─── :doc:`sklearn ` │ │ ├───spreadsheet │ │ ├─── :doc:`xgboost ` │ │ └─── :doc:`batch ` │ └───pandas │ ├─── :doc:`dataframe ` │ └─── :doc:`series ` ├───requirements ├───scripts └───stress_tests .. _pandas Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html .. _Ray: https://github.com/ray-project/ray .. _Unidist: https://github.com/modin-project/unidist .. _MPI: https://www.mpi-forum.org/ .. _code: https://github.com/modin-project/modin/blob/main/modin/core/dataframe .. _Dask: https://github.com/dask/dask .. _Dask Futures: https://docs.dask.org/en/latest/futures.html .. _issue: https://github.com/modin-project/modin/issues .. 
_task parallel: https://en.wikipedia.org/wiki/Task_parallelism .. _experimental features: /usage_guide/advanced_usage/index.html ================================================ FILE: docs/development/contributing.rst ================================================ Contributing ============ Getting Started --------------- If you're interested in getting involved in the development of Modin, but aren't sure where to start, take a look at the issues tagged `Good first issue`_ or Documentation_. These are issues that would be good for getting familiar with the codebase and better understanding some of the more complex components of the architecture. There is documentation here about the :doc:`architecture ` that you will want to review in order to get started. Also, feel free to join the discussions on the `developer mailing list`_. If you want a quick guide to getting your development environment set up, please use `the contributing instructions on GitHub`_. Certificate of Origin --------------------- To keep a clear track of who did what, we use a `sign-off` procedure (same requirements for using the signed-off-by process as the Linux kernel has https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html) on patches or pull requests that are being sent. The sign-off is a simple line at the end of the explanation for the patch, which certifies that you wrote it or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify the below: CERTIFICATE OF ORIGIN V 1.1 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ "By making a contribution to this project, I certify that: 1.) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or 2.) 
The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or 3.) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. 4.) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved." .. code-block:: bash This is my commit message Signed-off-by: Awesome Developer Code without a proper signoff cannot be merged into the main branch. Note: You must use your real name (sorry, no pseudonyms or anonymous contributions.) The text can either be manually added to your commit body, or you can add either ``-s`` or ``--signoff`` to your usual ``git commit`` commands: .. code-block:: bash git commit --signoff -m "This is my commit message" git commit -s -m "This is my commit message" This will use your default git configuration which is found in .git/config. To change this, you can use the following commands: .. code-block:: bash git config --global user.name "Awesome Developer" git config --global user.email "awesome.developer.@example.org" If you have authored a commit that is missing the signed-off-by line, you can amend your commits and push them to GitHub. .. code-block:: bash git commit --amend --signoff If you've pushed your changes to GitHub already you'll need to force push your branch after this with ``git push -f``. 
Commit Message formatting ------------------------- We request that your first commit follow a particular format, and we **require** that your PR title follow the format. The format is: .. code-block:: bash FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations The ``FEAT`` component represents the type of commit. This component of the commit message can be one of the following: * FEAT: A new feature that is added * DOCS: Documentation improvements or updates * FIX: A bugfix contribution * REFACTOR: Moving or removing code without change in functionality * TEST: Test updates or improvements * PERF: Performance enhancements The ``#9999`` component of the commit message should be the issue number in the Modin GitHub issue tracker: https://github.com/modin-project/modin/issues. This is important because it links commits to their issues. The commit message should follow a colon (:) and be descriptive and succinct. A Modin CI job on GitHub will enforce that your pull request title follows the format we suggest. Note that if you update the PR title, you have to push another commit (even if it's empty) or amend your last commit for the job to pick up the new PR title. Re-running the job in Github Actions won't work. General Rules for committers ---------------------------- - Try to write a PR name as descriptive as possible. - Try to keep PRs as small as possible. One PR should be making one semantically atomic change. - Don't merge your own PRs even if you are technically able to do it. Development Dependencies ------------------------ We recommend doing development in a virtualenv or conda environment, though this decision is ultimately yours. You will want to run the following in order to install all of the required dependencies for running the tests and formatting the code: .. 
code-block:: bash conda env create --file environment-dev.yml # or pip install -r requirements-dev.txt Code Formatting and Lint ------------------------ We use black_ for code formatting. Before you submit a pull request, please make sure that you run the following from the project root: .. code-block:: bash black modin/ asv_bench/benchmarks scripts/doc_checker.py We also use flake8_ to check linting errors. Running the following from the project root will ensure that it passes the lint checks on Github Actions: .. code-block:: bash flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py We test that this has been run on our `Github Actions`_ test suite. If you do this and find that the tests are still failing, try updating your version of black and flake8. Adding a test ------------- If you find yourself fixing a bug or adding a new feature, don't forget to add a test to the test suite to verify its correctness! More on testing and the layout of the tests can be found in our testing documentation. We ask that you follow the existing structure of the tests for ease of maintenance. Running the tests ----------------- To run the entire test suite, run the following from the project root: .. code-block:: bash pytest modin/pandas/test The test suite is very large, and may take a long time if you run every test. If you've only modified a small amount of code, it may be sufficient to run a single test or some subset of the test suite. In order to run a specific test run: .. code-block:: bash pytest modin/pandas/test::test_new_functionality The entire test suite is automatically run for each pull request. Performance measurement ----------------------- We use Asv_ tool for performance tracking of various Modin functionality. The results can be viewed here: `Asv dashboard`_. More information can be found in the `Asv readme`_. Building documentation ---------------------- To build the documentation, please follow the steps below from the project root: .. 
code-block:: bash pip install -r docs/requirements-doc.txt sphinx-build -b html docs docs/build To visualize the documentation locally, run the following from the `build` folder: .. code-block:: bash python -m http.server <port> # python -m http.server 1234 then open the browser at `0.0.0.0:<port>` (e.g. `0.0.0.0:1234`). Contributing a new execution framework or in-memory format ---------------------------------------------------------- If you are interested in contributing support for a new execution framework or in-memory format, please make sure you understand the :doc:`architecture </development/architecture>` of Modin. The best place to start the discussion for adding a new execution framework or in-memory format is the `developer mailing list`_. More docs on this coming soon... .. _Good first issue: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue+%3Abeginner%3A%22 .. _Documentation: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22documentation+%3Abookmark_tabs%3A%22 .. _black: https://github.com/ambv/black .. _flake8: http://flake8.pycqa.org/en/latest/ .. _Github Actions: https://github.com/features/actions .. _Asv: https://github.com/airspeed-velocity/asv#airspeed-velocity .. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev .. _Asv dashboard: https://modin.org/modin-bench/#/ .. _Asv readme: https://github.com/modin-project/modin/blob/main/asv_bench/README.md .. _the contributing instructions on GitHub: https://github.com/modin-project/modin/blob/main/contributing/contributing.md ================================================ FILE: docs/development/index.rst ================================================ Development =========== .. toctree:: :maxdepth: 4 contributing architecture partition_api using_pandas_on_ray using_pandas_on_dask using_pandas_on_python using_pandas_on_mpi .. meta:: :description lang=en: Development-specific documentation.
================================================ FILE: docs/development/partition_api.rst ================================================ Partition API in Modin ====================== When you are working with a :py:class:`~modin.pandas.dataframe.DataFrame`, you can unwrap its remote partitions to get the raw futures objects compatible with the execution engine (e.g. ``ray.ObjectRef`` for Ray). In addition to unwrapping of the remote partitions we also provide an API to construct a ``modin.pandas.DataFrame`` from raw futures objects. Partition IPs ------------- For finer grained placement control, Modin also provides an API to get the IP addresses of the nodes that hold each partition. You can pass the partitions having needed IPs to your function. This can help minimize data movement between nodes. Partition API implementations ----------------------------- By default, a :py:class:`~modin.pandas.dataframe.DataFrame` stores underlying partitions as ``pandas.DataFrame`` objects. You can find the specific implementation of Modin's Partition Interface in :doc:`pandas Partition API </flow/modin/distributed/dataframe/pandas>`. .. toctree:: :hidden: /flow/modin/distributed/dataframe/pandas Ray engine ---------- However, it is worth noting that for Modin on ``Ray`` engine with ``pandas`` in-memory format IPs of the remote partitions may not match actual locations if the partitions are smaller than 100 kB. Ray saves such objects (<= 100 kB, by default) in in-process store of the calling process (please, refer to `Ray documentation`_ for more information). We can't get IPs for such objects while maintaining good performance. So, you should keep this in mind when unwrapping the remote partitions with their IPs. Several options are provided to handle the case in ``How to handle Ray objects that are lower than 100 kB`` section.
Dask engine ----------- The issue mentioned above does not apply to Modin on ``Dask`` engine with ``pandas`` in-memory format because ``Dask`` saves any objects in the worker process that processes a function (please, refer to `Dask documentation`_ for more information). Unidist engine -------------- Currently, Modin only supports MPI through unidist. The issue mentioned above does not apply to Modin on ``Unidist`` engine using ``MPI`` backend with ``pandas`` in-memory format because ``Unidist`` saves any objects in the MPI worker process that processes a function (please, refer to `Unidist documentation`_ for more information). How to handle Ray objects that are lower than 100 kB ---------------------------------------------------- * If you are sure that each of the remote partitions being unwrapped is larger than 100 kB, you can just import Modin or perform ``ray.init()`` manually. * If you don't know partition sizes you can pass the option ``_system_config={"max_direct_call_object_size": <nbytes>,}``, where ``nbytes`` is the threshold for objects that will be stored in in-process store, to ``ray.init()``. * You can also start Ray as follows: ``ray start --head --system-config='{"max_direct_call_object_size":<nbytes>}'``. Note that when specifying the threshold the performance of some Modin operations may change. .. _`Ray documentation`: https://docs.ray.io/en/master/index.html# .. _`Dask documentation`: https://distributed.dask.org/en/latest/index.html .. _`Unidist documentation`: https://unidist.readthedocs.io/en/latest/index.html ================================================ FILE: docs/development/using_pandas_on_dask.rst ================================================ pandas on Dask ============== This section describes usage related documents for the pandas on Dask component of Modin. Modin uses pandas as a primary memory format of the underlying partitions and optimizes queries ingested from the API layer in a specific way to this format.
Thus, there is no need to care of choosing it but you can explicitly specify it anyway as shown below. One of the execution engines that Modin uses is Dask. To enable the pandas on Dask execution you should set the following environment variables: .. code-block:: bash export MODIN_ENGINE=dask export MODIN_STORAGE_FORMAT=pandas or turn them on in source code: .. code-block:: python import modin.config as cfg cfg.Engine.put('dask') cfg.StorageFormat.put('pandas') Using Modin on Dask locally --------------------------- If you want to run Modin on Dask locally using a single node, just set Modin engine to ``Dask`` and continue working with a Modin DataFrame as if it was a pandas DataFrame. You can either initialize a Dask client on your own and Modin connects to the existing Dask cluster or allow Modin itself to initialize a Dask client. .. code-block:: python import modin.pandas as pd import modin.config as modin_cfg modin_cfg.Engine.put("dask") df = pd.DataFrame(...) Using Modin on Dask in a Cluster -------------------------------- If you want to run Modin on Dask in a cluster, you should set up a Dask cluster and initialize a Dask client. Once the Dask client is initialized, Modin will be able to connect to it and use the Dask cluster. .. code-block:: python from distributed import Client import modin.pandas as pd import modin.config as modin_cfg # Define your cluster here cluster = ... client = Client(cluster) modin_cfg.Engine.put("dask") df = pd.DataFrame(...) To get more information on how to deploy and run a Dask cluster, visit the `Deploy Dask Clusters`_ page. Conversion between Modin DataFrame and Dask DataFrame ----------------------------------------------------- Modin DataFrame can be converted to/from Dask DataFrame with no-copy partition conversion. This allows you to take advantage of both Modin and Dask libraries for maximum performance. .. 
code-block:: python import modin.pandas as pd import modin.config as modin_cfg from modin.pandas.io import to_dask, from_dask modin_cfg.Engine.put("dask") df = pd.DataFrame(...) # Convert Modin to Dask DataFrame dask_df = to_dask(df) # Convert Dask to Modin DataFrame modin_df = from_dask(dask_df) .. _Deploy Dask Clusters: https://docs.dask.org/en/stable/deploying.html ================================================ FILE: docs/development/using_pandas_on_mpi.rst ================================================ pandas on MPI through unidist ============================= This section describes usage related documents for the pandas on MPI through unidist component of Modin. Modin uses pandas as a primary memory format of the underlying partitions and optimizes queries ingested from the API layer in a specific way to this format. Thus, there is no need to care of choosing it but you can explicitly specify it anyway as shown below. One of the execution engines that Modin uses is MPI through unidist. To enable the pandas on MPI through unidist execution you should set the following environment variables: .. code-block:: bash export MODIN_ENGINE=unidist export MODIN_STORAGE_FORMAT=pandas export UNIDIST_BACKEND=mpi or turn it on in source code: .. code-block:: python import modin.config as modin_cfg import unidist.config as unidist_cfg modin_cfg.Engine.put('unidist') modin_cfg.StorageFormat.put('pandas') unidist_cfg.Backend.put('mpi') To run a python application you should use ``mpiexec -n 1 python <script.py>`` command. .. code-block:: bash mpiexec -n 1 python script.py For more information on how to run a python application with unidist on MPI backend please refer to `Unidist on MPI`_ section of the unidist documentation. As of unidist 0.5.0 there is support for a shared object store for MPI backend. This feature improves performance in workloads where workers use the same data multiple times, by reducing data copies.
You can enable the feature by setting the following environment variable: .. code-block:: bash export UNIDIST_MPI_SHARED_OBJECT_STORE=True or turn it on in source code: .. code-block:: python import unidist.config as unidist_cfg unidist_cfg.MpiSharedObjectStore.put(True) .. _`Unidist on MPI`: https://unidist.readthedocs.io/en/latest/using_unidist/unidist_on_mpi.html ================================================ FILE: docs/development/using_pandas_on_python.rst ================================================ pandas on Python ================ This section describes usage related documents for the pandas on Python component of Modin. Modin uses pandas as the primary memory format of the underlying partitions and optimizes queries from the API layer in a specific way to this format. Since it is a default, you do not need to specify the pandas memory format, but we show how to explicitly set it below. One of the execution engines that Modin uses is Python. This engine is sequential and used for debugging. To enable the pandas on Python execution you should set the following environment variables: .. code-block:: bash export MODIN_ENGINE=python export MODIN_STORAGE_FORMAT=pandas or turn a debug mode on: .. code-block:: bash export MODIN_DEBUG=True export MODIN_STORAGE_FORMAT=pandas or do the same in source code: .. code-block:: python import modin.config as cfg cfg.Engine.put('python') cfg.StorageFormat.put('pandas') .. code-block:: python import modin.config as cfg cfg.IsDebug.put(True) cfg.StorageFormat.put('pandas') ================================================ FILE: docs/development/using_pandas_on_ray.rst ================================================ pandas on Ray ============= This section describes usage related documents for the pandas on Ray component of Modin. Modin uses pandas as a primary memory format of the underlying partitions and optimizes queries ingested from the API layer in a specific way to this format. 
Thus, there is no need to choose it explicitly, but you can specify it anyway as shown below. One of the execution engines that Modin uses is Ray. If you have Ray installed in your system, Modin also uses it by default to distribute computations. If you want to be explicit, you could set the following environment variables: .. code-block:: bash export MODIN_ENGINE=ray export MODIN_STORAGE_FORMAT=pandas or turn it on in source code: .. code-block:: python import modin.config as cfg cfg.Engine.put('ray') cfg.StorageFormat.put('pandas') ================================================ FILE: docs/ecosystem.rst ================================================ Ecosystem ========= There is a constantly growing number of users and packages using pandas to address their specific needs in data preparation, analysis and visualization. pandas is being used ubiquitously and is a good choice to handle small-sized data. However, pandas scales poorly and is non-interactive on moderate to large datasets. Modin provides a drop-in replacement API for pandas and scales computation across nodes and CPUs available. What you need to do to switch to Modin is just replace a single line of code. .. code-block:: python # import pandas as pd import modin.pandas as pd While most packages can consume a pandas DataFrame and operate on it efficiently, this is not the case with a Modin DataFrame due to its distributed nature. Thus, some packages may lack support for handling Modin DataFrame(s) correctly and, moreover, efficiently. Modin implements such methods as ``__array__``, ``__dataframe__``, etc. to facilitate other libraries to consume a Modin DataFrame. If you feel that a certain library can operate efficiently with a specific format of data, it is possible to convert a Modin DataFrame to the format preferred. to_pandas --------- You can refer to `pandas ecosystem`_ page to get more details on where pandas can be used and what libraries it powers. ..
code-block:: python from modin.pandas.io import to_pandas pandas_df = to_pandas(modin_df) to_numpy -------- You can refer to `NumPy ecosystem`_ section of NumPy documentation to get more details on where NumPy can be used and what libraries it powers. .. code-block:: python from modin.pandas.io import to_numpy numpy_arr = to_numpy(modin_df) to_ray ------ You can refer to `Ray Data`_ page to get more details on where Ray Dataset can be used and what libraries it powers. .. code-block:: python from modin.pandas.io import to_ray ray_dataset = to_ray(modin_df) to_dask ------- You can refer to `Dask DataFrame`_ page to get more details on where Dask DataFrame can be used and what libraries it powers. .. code-block:: python from modin.pandas.io import to_dask dask_df = to_dask(modin_df) .. _pandas ecosystem: https://pandas.pydata.org/community/ecosystem.html .. _NumPy ecosystem: https://numpy.org .. _Ray Data: https://docs.ray.io/en/latest/data/data.html .. _Dask DataFrame: https://docs.dask.org/en/stable/dataframe.html ================================================ FILE: docs/flow/modin/config.rst ================================================ :orphan: Modin Configuration Settings """""""""""""""""""""""""""" To adjust Modin's default behavior, you can set the value of Modin configs by setting an environment variable or by using the ``modin.config`` API. To list all available configs in Modin, please run ``python -m modin.config`` to print all Modin configs with descriptions. Public API '''''''''' Potentially, the source of configs can be any, but for now only environment variables are implemented. Any environment variable originate from :class:`~modin.config.envvars.EnvironmentVariable`, which contains most of the config API implementation. .. autoclass:: modin.config.envvars.EnvironmentVariable :members: get, put, get_help, get_value_source, once, subscribe Modin Configs List '''''''''''''''''' .. 
csv-table:: :file: configs_help.csv :header-rows: 1 Usage Guide ''''''''''' See example of interaction with Modin configs below, as it can be seen config value can be set either by setting the environment variable or by using config API. .. code-block:: python import os # Setting `MODIN_ENGINE` environment variable. # Also can be set outside the script. os.environ["MODIN_ENGINE"] = "Dask" import modin.config import modin.pandas as pd # Checking initially set `Engine` config, # which corresponds to `MODIN_ENGINE` environment # variable print(modin.config.Engine.get()) # prints 'Dask' # Checking default value of `NPartitions` print(modin.config.NPartitions.get()) # prints '8' # Changing value of `NPartitions` modin.config.NPartitions.put(16) print(modin.config.NPartitions.get()) # prints '16' One can also use config variables with a context manager in order to use some config only for a certain part of the code: .. code-block:: python import modin.config as cfg # Default value for this config is 'False' print(cfg.RangePartitioning.get()) # False # Set the config to 'True' inside of the context-manager with cfg.context(RangePartitioning=True): print(cfg.RangePartitioning.get()) # True df.merge(...) 
# will use range-partitioning impl # Once the context is over, the config gets back to its previous value print(cfg.RangePartitioning.get()) # False # You can also set multiple config at once when you pass a dictionary to 'cfg.context' print(cfg.AsyncReadMode.get()) # False with cfg.context(RangePartitioning=True, AsyncReadMode=True): print(cfg.RangePartitioning.get()) # True print(cfg.AsyncReadMode.get()) # True print(cfg.RangePartitioning.get()) # False print(cfg.AsyncReadMode.get()) # False ================================================ FILE: docs/flow/modin/core/dataframe/algebra.rst ================================================ :orphan: Operators Module Description """""""""""""""""""""""""""" Brief description ''''''''''''''''' Most of the functions that are evaluated by `QueryCompiler` can be categorized into one of the patterns: Map, TreeReduce, Binary, Reduce, etc., called core operators. The ``modin.core.dataframe.algebra`` module provides templates to easily build such types of functions. These templates are supposed to be used at the `QueryCompiler` level since each built function accepts and returns `QueryCompiler`. High-Level Module Overview '''''''''''''''''''''''''' Each template class implements a ``register`` method, which takes functions to apply and instantiate the related template. Functions that are passed to ``register`` will be executed against converted to pandas and preprocessed in a template-specific way partition, so the function would take one of the pandas object: ``pandas.DataFrame``, ``pandas.Series`` or ``pandas.DataFrameGroupbyObject``. .. note:: Currently, functions that are built in that way are supported only in a pandas storage format (i.e. can be used only in `PandasQueryCompiler`). Algebra module provides templates for this type of function: Map operator ------------- Uniformly apply a function argument to each partition in parallel. **Note**: map function should not change the shape of the partitions. .. 
figure:: /img/map_evaluation.svg :align: center This operator performs best when the number of partitions equals the number of CPUs so that each single partition gets processed in parallel. When the number of partitions is 1.5x greater than the number of CPUs, Modin applies a heuristic to join some partitions to get "ideal" partitioning so that each new partition gets processed in parallel. Reduce operator --------------- Applies an argument function that reduces each column or row on the specified axis into a scalar, but requires knowledge about the whole axis. Be aware that providing this knowledge may be expensive because the execution engine has to concatenate partitions along the specified axis. Also, note that the execution engine expects that the reduce function returns a one dimensional frame. .. figure:: /img/reduce_evaluation.svg :align: center This operator performs best when the number of partitions (row or column partitions depending on the specified axis) equals the number of CPUs so that each single axis partition gets processed in parallel. TreeReduce operator ------------------- Applies an argument function that reduces specified axis into a scalar. First applies map function to each partition in parallel, then concatenates resulted partitions along the specified axis and applies reduce function. In contrast with `Map function` template, here you're allowed to change partition shape in the map phase. Note that the execution engine expects that the reduce function returns a one dimensional frame. This operator performs best when the number of partitions (including the initial and intermediate stages) equals the number of CPUs so that each single axis partition gets processed in parallel. Binary operator --------------- Applies an argument function, that takes exactly two operands (first is always `QueryCompiler`). If both operands are query compilers then the execution engine broadcasts partitions of the right operand to the left. ..
figure:: /img/binary_evaluation.svg :align: center .. warning:: To be able to do frame broadcasting, partitioning along the index axis of both frames has to be equal, otherwise they need to be aligned first. The execution engine will do it automatically but note that this requires repartitioning, which is a much more expensive operation than the binary function itself. This operator performs best when both operands have identical partitioning and the number of partitions of an operand equals to the number of CPUs so that each single partition gets processed in parallel. Fold operator ------------- Applies an argument function that requires knowledge of the whole axis. Be aware that providing this knowledge may be expensive because the execution engine has to concatenate partitions along the specified axis. This operator performs best when the number of partitions (row or column partitions in depend on the specified axis) equals to the number of CPUs so that each single axis partition gets processed in parallel. GroupBy operator ---------------- Evaluates GroupBy aggregation for that type of functions that can be executed via TreeReduce approach. To be able to form groups engine broadcasts ``by`` partitions to each partition of the source frame. This operator performs best when the cardinality of ``by`` columns is low (small number of output groups). At the ``Map`` stage, the operator computes the aggregation for each row partition individually, meaning, that the ``Reduce`` stage takes a dataframe with the following number of rows: ``num_groups * n_row_parts``. If the number of groups is too high, there's a risk of getting a dataframe with even bigger than the initial shape at the ``Reduce`` stage. Default-to-pandas operator -------------------------- Do :doc:`fallback to pandas ` for passed function. This operator has a performance penalty for going from a partitioned Modin DataFrame to pandas because of the communication cost and single-threaded nature of pandas. 
How to register your own function ''''''''''''''''''''''''''''''''' Let's examine an example of how to use the algebra module to create your own new functions. Imagine you have a complex aggregation that can be implemented into a single query but doesn't have any implementation in pandas API. If you know how to implement this aggregation efficiently in a distributed frame, you may want to use one of the above described patterns (e.g. ``TreeReduce``). Let's implement a function that counts non-NA values for each column or row (``pandas.DataFrame.count``). First, we need to determine the function type. TreeReduce approach would be great: in a map phase, we'll count non-NA cells in each partition in parallel and then just sum its results in the reduce phase. To define the TreeReduce function that does `count` + `sum` we just need to register the appropriate functions and then assign the result to the picked `QueryCompiler` (`PandasQueryCompiler` in our case): .. code-block:: python from modin.core.storage_formats import PandasQueryCompiler from modin.core.dataframe.algebra import TreeReduce PandasQueryCompiler.custom_count = TreeReduce.register(pandas.DataFrame.count, pandas.DataFrame.sum) Then, we want to handle it from the :py:class:`~modin.pandas.dataframe.DataFrame`, so we need to create a way to do that: .. code-block:: python import modin.pandas as pd def count_func(self, **kwargs): # The constructor allows you to pass in a query compiler as a keyword argument return self.__constructor__(query_compiler=self._query_compiler.custom_count(**kwargs)) pd.DataFrame.count_custom = count_func And then you can use it like you usually would: .. code-block:: python df.count_custom(axis=1) Many of the `pandas` API functions can be easily implemented this way, so if you find out that one of your favorite function is still defaulted to pandas and decide to contribute to Modin to add its implementation, you may use this example as a reference. 
================================================ FILE: docs/flow/modin/core/dataframe/base/dataframe.rst ================================================ ModinDataframe """""""""""""" The :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` is the parent class for all dataframes - regardless of what storage format they are backed by. Its purpose is to define the algebra operators that must be exposed by a dataframe. This class exposes the dataframe algebra and is meant to be subclassed by all dataframe implementations. Descendants of this class implement the algebra, and act as the intermediate level between the query compiler and the underlying execution details (e.g. the conforming partition manager). The class provides a significantly reduced set of operations that can be composed to form any pandas query. The :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` is an example of a descendant of this class. It currently has implementations for some of the operators exposed in this class, and is currently being refactored to include implementations for all of the algebra operators. Please refer to the :doc:`PandasDataframe documentation ` for more information. The :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` is independent of implementation specific details such as partitioning, storage format, or execution engine. Public API ---------- .. autoclass:: modin.core.dataframe.base.dataframe.dataframe.ModinDataframe :members: ================================================ FILE: docs/flow/modin/core/dataframe/base/index.rst ================================================ Purpose ======= The :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` serves the purpose of describing and defining the :doc:`Core Dataframe Algebra `. It is the core construction element and serves as the client for the :doc:`Modin Query Compiler`. 
Descendants that offer implementations execute the queries from the compiler by invoking functions over partitions via a partition manager. The partitions and partition manager interfaces are currently implementation-specific, but may be standardized in the future. The :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` and axis partitions are the interfaces that must be implemented by any :doc:`execution backend` in order for it to be plugged in to Modin. These classes are mostly abstract, however very simple and generic enough methods like :py:meth:`~modin.core.dataframe.base.partitioning.BaseDataframeAxisPartition.force_materialization` can be implemented at the base level because for now we do not expect them to differ in any implementation. ModinDataframe Interface ======================== * :doc:`ModinDataframe ` is an abstract class which represents the algebra operators a dataframe must expose. * :doc:`BaseDataframeAxisPartition ` is an abstract class, representing a joined group of partitions along some axis (either rows or labels). .. toctree:: :hidden: dataframe partitioning/axis_partition ================================================ FILE: docs/flow/modin/core/dataframe/base/partitioning/axis_partition.rst ================================================ BaseDataframeAxisPartition """""""""""""""""""""""""" The class is base for any axis partition class and serves as the last level on which operations that were conveyed from the partition manager are being performed on an entire column or row. **Note**: ``modin.core.dataframe.base`` intentionally does not describe any particular partition interface, as it is the partition manager responsibility (if said partition manager is implemented), i.e. it is too low-level to be present on the base, abstract level. The class provides an API that has to be overridden by the child classes in order to manipulate on a list of block partitions (making up column or row partition) they store. 
The procedures that use this class and its methods assume that they have some global knowledge about the entire axis. This may require the implementation to use concatenation or append on the list of block partitions. Public API ---------- .. autoclass:: modin.core.dataframe.base.partitioning.axis_partition.BaseDataframeAxisPartition :members: ================================================ FILE: docs/flow/modin/core/dataframe/index.rst ================================================ :orphan: Core Modin Dataframe Objects ============================ Modin partitions data to scale efficiently. To keep track of everything a few key classes are introduced: ``Dataframe``, ``Partition``, ``AxisPartition`` and ``PartitionManager``. * ``Dataframe`` is the class conforming to Dataframe Algebra. * ``Partition`` is an element of a NxM grid which, when combined, represents the ``Dataframe`` * ``AxisPartition`` is a joined group of ``Partition``-s along some axis (either rows or columns) * ``PartitionManager`` is the manager that implements the primitives used for Dataframe Algebra operations over ``Partition``-s Each :doc:`storage format `, execution engine, and each execution system (storage format + execution engine) may have its own implementations of these Core Dataframe's entities. Current stable implementations are the following: * :doc:`Base ModinDataframe ` defines a common interface and algebra operators for `Dataframe` implementations. Storage format specific: * :doc:`Modin PandasDataframe ` is an implementation for any frame class of :doc:`pandas storage format `. Engine specific: * :doc:`Modin GenericRayDataframe ` is an implementation for any frame class that works on Ray execution engine. * :doc:`Modin GenericUnidistDataframe ` is an implementation for any frame class that works on Unidist execution engine. Execution system specific: * :doc:`Modin PandasOnRayDataframe ` is a specialization of the Core Modin Dataframe for ``PandasOnRay`` execution.
* :doc:`Modin PandasOnDaskDataframe ` is a specialization of the Core Modin Dataframe for ``PandasOnDask`` execution. * :doc:`Modin PandasOnPythonDataframe ` is a specialization of the Core Modin Dataframe for ``PandasOnPython`` execution. * :doc:`Modin PandasOnUnidistDataframe ` is a specialization of the Core Modin Dataframe for ``PandasOnUnidist`` execution. .. note:: At the current stage of Modin development, the base interfaces of the Dataframe objects are not defined yet. So for now the origin of all changes in the Dataframe interfaces is the :doc:`Dataframe for pandas storage format`. .. toctree:: :hidden: base/index pandas/index ================================================ FILE: docs/flow/modin/core/dataframe/pandas/dataframe.rst ================================================ PandasDataframe """"""""""""""" :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` is a direct descendant of :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe`. Its purpose is to implement the abstract interfaces for usage with all ``pandas``-based :doc:`storage formats`. :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` could be inherited and augmented further by any specific implementation which needs it to take special care of some behavior or to improve performance for a certain execution engine. The class serves as the intermediate level between ``pandas`` query compiler and conforming partition manager. All queries formed at the query compiler layer are ingested by this class and then conveyed jointly with the stored partitions into the partition manager for processing. Direct partitions manipulation by this class is prohibited except in cases where an operation is strictly private or protected and called inside of the class only. The class provides a significantly reduced set of operations that fit plenty of pandas operations.
The main tasks of :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` are storing partitions, manipulating the labels of axes and providing a set of methods to perform operations on the internal data. As mentioned above, ``PandasDataframe`` shouldn't work with stored partitions directly and the responsibility for modifying the partitions array lies with :doc:`partitioning/partition_manager`. For example, the method :meth:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe.broadcast_apply_full_axis` redirects function application to the :meth:`~PandasDataframePartitionManager.broadcast_axis_partitions` method. ``Modin PandasDataframe`` can be created from ``pandas.DataFrame``, ``pyarrow.Table`` (methods :meth:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe.from_pandas`, :meth:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe.from_arrow` are used respectively). Also, ``PandasDataframe`` can be converted to ``np.array``, ``pandas.DataFrame`` (methods :meth:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe.to_numpy`, :meth:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe.to_pandas` are used respectively). Manipulation of the labels of axes is done by internal methods for replacing labels with new ones, adding prefixes/suffixes, etc. Public API ---------- .. autoclass:: modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe :members: ================================================ FILE: docs/flow/modin/core/dataframe/pandas/index.rst ================================================ Modin PandasDataframe Objects ============================= ``modin.core.dataframe.pandas`` is the package which houses common implementations of different Modin internal classes used by most `pandas`-based :doc:`storage formats`.
It also double-serves as the full example of how to implement Modin execution backend pieces (sans the :doc:`execution part` which is absent here), as it implements everything an execution backend needs to be fully conformant to Modin expectations. * :doc:`PandasDataframe ` is the class conforming to Dataframe Algebra. * :doc:`PandasDataframePartition ` implements ``Partition`` interface holding ``pandas.DataFrame``. * :doc:`PandasDataframeAxisPartition ` is a joined group of ``PandasDataframePartition``-s along some axis (either rows or labels) * :doc:`PandasDataframePartitionManager ` is the manager that implements the primitives used for Dataframe Algebra operations over ``PandasDataframePartition``-s * :doc:`ModinDtypes ` * :doc:`ModinIndex ` .. toctree:: :hidden: dataframe partitioning/partition partitioning/axis_partition partitioning/partition_manager metadata/dtypes metadata/index ================================================ FILE: docs/flow/modin/core/dataframe/pandas/metadata/dtypes.rst ================================================ ModinDtypes """"""""""" Public API ---------- .. autoclass:: modin.core.dataframe.pandas.metadata.dtypes.ModinDtypes :members: ================================================ FILE: docs/flow/modin/core/dataframe/pandas/metadata/index.rst ================================================ ModinIndex """""""""" Public API ---------- .. autoclass:: modin.core.dataframe.pandas.metadata.index.ModinIndex :members: ================================================ FILE: docs/flow/modin/core/dataframe/pandas/partitioning/axis_partition.rst ================================================ PandasDataframeAxisPartition """""""""""""""""""""""""""" The class implements abstract interface methods from :py:class:`~modin.core.dataframe.base.partitioning.axis_partition.BaseDataframeAxisPartition` giving the means for a sibling :doc:`partition manager` to actually work with the axis-wide partitions. 
The class is the base for any axis partition class of the ``pandas`` storage format. Subclasses must implement ``list_of_blocks`` which represents data wrapped by the :py:class:`~modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition` objects and creates something interpretable as a ``pandas.DataFrame``. See :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.partitioning.axis_partition.PandasOnRayDataframeAxisPartition` for an example on how to override/use this class when the implementation needs to be augmented. The :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition` object has an invariant that requires that this object is never returned from a function. It assumes that there will always be a ``PandasDataframeAxisPartition`` object stored and structures itself accordingly. Public API ---------- .. autoclass:: modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition :members: ================================================ FILE: docs/flow/modin/core/dataframe/pandas/partitioning/partition.rst ================================================ PandasDataframePartition """""""""""""""""""""""" The class is the base for any partition class of the ``pandas`` storage format and serves as the last level on which operations that were conveyed from the partition manager are being performed on an individual block partition. The class provides an API that has to be overridden by child classes in order to manipulate the data and metadata they store. The public API exposed by the children of this class is used in :py:class:`~modin.core.dataframe.pandas.partitioning.partition_manager.PandasDataframePartitionManager`. The objects wrapped by the child classes are treated as immutable by ``PandasDataframePartitionManager`` subclasses and there is no logic for updating them in place. Public API ---------- ..
autoclass:: modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition :members: ================================================ FILE: docs/flow/modin/core/dataframe/pandas/partitioning/partition_manager.rst ================================================ PandasDataframePartitionManager """"""""""""""""""""""""""""""" The class is base for any partition manager class of ``pandas`` storage format and serves as intermediate level between :doc:`Modin PandasDataframe <../dataframe>` and conforming :doc:`partition ` class. The class is responsible for partitions manipulation and applying a function to individual partitions: block partitions, row partitions or column partitions, i.e. the class can form axis partitions from block partitions to apply a function if an operation requires access to an entire column or row. The class translates frame API into partition API and also can have some preprocessing operations depending on the partition type for improving performance (for example, :meth:`~modin.core.dataframe.pandas.partitioning.partition_manager.PandasDataframePartitionManager.preprocess_func`). Main task of partition manager is to keep knowledge of how partitions are stored and managed internal to itself, so surrounding code could use it via lean enough API without worrying about implementation details. Partition manager can apply user-passed (arbitrary) function in different modes: * block-wise (apply a function to individual block partitions): * optionally accepting partition indices along each axis * optionally accepting an item to be split so parts of it would be sent to each partition * along a full axis (apply a function to an entire column or row made up of block partitions when user function needs information about the whole axis) It can also broadcast partitions from `right` to `left` when executing certain operations making `right` partitions available for functions executed where `left` live. .. 
TODO: insert more text explaining "broadcast" term Partition manager also is used to create "logical" partitions, or :doc:`axis partitions ` by joining existing partitions along specified axis (either rows or labels), and to concatenate different partition sets along given axis. It also maintains mapping from "external" (end user-visible) indices along all axes to internal indices which are actually pairs of indices of partitions and indices inside the partitions, as well as manages conversion to numpy and pandas representations. Public API ---------- .. autoclass:: modin.core.dataframe.pandas.partitioning.partition_manager.PandasDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/dataframe.rst ================================================ PandasOnDaskDataframe """"""""""""""""""""" The class is the specific implementation of the dataframe algebra for the `Dask` execution engine. It serves as an intermediate level between ``pandas`` query compiler and :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframePartitionManager`. Public API ---------- .. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.dataframe.PandasOnDaskDataframe :members: ================================================ FILE: docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/index.rst ================================================ :orphan: PandasOnDask Execution ====================== Queries that perform data transformation, data ingress or data egress using the `pandas on Dask` execution pass through the Modin components detailed below. To enable `pandas on Dask` execution, please refer to the usage section in :doc:`pandas on Dask `. Data Transformation ''''''''''''''''''' .. 
image:: /img/pandas_on_dask_data_transform.svg :align: center When a user calls any :py:class:`~modin.pandas.dataframe.DataFrame` API, a query starts forming at the `API` layer to be executed at the `Execution` layer. The `API` layer is responsible for processing the query appropriately, for example, determining whether the final result should be a ``DataFrame`` or ``Series`` object. This layer is also responsible for sanitizing the input to the :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler`, e.g. validating a parameter from the query and defining specific intermediate values to provide more context to the query compiler. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` is responsible for processing the query, received from the :py:class:`~modin.pandas.dataframe.DataFrame` `API` layer, to determine how to apply it to a subset of the data - either cell-wise or along an axis-wise partition backed by the `pandas` storage format. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` maps the query to one of the :doc:`Core Algebra Operators ` of the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.dataframe.PandasOnDaskDataframe` which inherits generic functionality from the :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe`. PandasOnDask Dataframe implementation ------------------------------------- Modin implements ``Dataframe``, ``PartitionManager``, ``AxisPartition`` and ``Partition`` classes specifically for the `PandasOnDask` execution. * :doc:`PandasOnDaskDataframe ` * :doc:`PandasOnDaskDataframePartition ` * :doc:`PandasOnDaskDataframeVirtualPartition ` * :doc:`PandasOnDaskDataframePartitionManager ` .. toctree:: :hidden: dataframe partitioning/partition partitioning/virtual_partition partitioning/partition_manager Data Ingress '''''''''''' .. 
image:: /img/pandas_on_dask_data_ingress.svg :align: center Data Egress ''''''''''' .. image:: /img/pandas_on_dask_data_egress.svg :align: center When a user calls any IO function from the ``modin.pandas.io`` module, the `API` layer queries the :py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` which defines a factory specific for the execution, namely, the :py:class:`~modin.core.execution.dispatching.factories.factories.PandasOnDaskFactory`. The factory, in turn, exposes the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.io.PandasOnDaskIO` class whose responsibility is to perform a parallel read/write from/to a file. When reading data from a CSV file, for example, the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.io.PandasOnDaskIO` class forwards the user query to the :meth:`~modin.core.io.text.CSVDispatcher._read` method of :py:class:`~modin.core.io.text.CSVDispatcher`, where the query's parameters are preprocessed to check if they are supported by the execution (defaulting to pandas if they are not) and computes some metadata common for all partitions to be read. Then, the file is split into row chunks, and this data is used to launch remote tasks on the Dask workers via the :meth:`~modin.core.execution.dask.common.engine_wrapper.DaskWrapper.deploy` method of :py:class:`~modin.core.execution.dask.common.engine_wrapper.DaskWrapper`. On each Dask worker, the :py:class:`~modin.core.storage_formats.pandas.parsers.PandasCSVParser` parses data. After the remote tasks are finished, additional result postprocessing is performed, and a new query compiler with the data read is returned. When writing data to a CSV file, for example, the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.io.PandasOnDaskIO` processes the user query to execute it on Dask workers. 
Then, the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.io.PandasOnDaskIO` asks the :py:class:`~modin.core.execution.dask.implementations.pandas_on_dask.dataframe.PandasOnDaskDataframe` to decompose the data into row-wise partitions that will be written into the file in parallel by Dask workers. .. note:: Currently, data egress uses the default `pandas` implementation for `pandas on Dask` execution. ================================================ FILE: docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.rst ================================================ PandasOnDaskDataframePartition """""""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition`, providing the API to perform operations on a block partition, namely, ``pandas.DataFrame``, using Dask as the execution engine. In addition to wrapping a ``pandas.DataFrame``, the class also holds the following metadata: * ``length`` - length of ``pandas.DataFrame`` wrapped * ``width`` - width of ``pandas.DataFrame`` wrapped * ``ip`` - node IP address that holds ``pandas.DataFrame`` wrapped An operation on a block partition can be performed in two modes: * asynchronously_ - via :meth:`~modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframePartition.apply` * lazily_ - via :meth:`~modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframePartition.add_to_apply_calls` Public API ---------- .. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframePartition :members: .. _asynchronously: https://en.wikipedia.org/wiki/Asynchrony_(computer_programming) ..
_lazily: https://en.wikipedia.org/wiki/Lazy_evaluation ================================================ FILE: docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition_manager.rst ================================================ PandasOnDaskDataframePartitionManager """"""""""""""""""""""""""""""""""""" This class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition_manager.PandasDataframePartitionManager` using Dask as the execution engine. This class is responsible for partition manipulation and applying a function to block/row/column partitions. Public API ---------- .. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.rst ================================================ PandasOnDaskDataframeVirtualPartition """"""""""""""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition`, providing the API to perform operations on an axis (column or row) partition using Dask as the execution engine. The axis partition is a wrapper over a list of block partitions that are stored in this class. Public API ---------- .. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframeVirtualPartition :members: PandasOnDaskDataframeColumnPartition """""""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframeColumnPartition :members: PandasOnDaskDataframeRowPartition """"""""""""""""""""""""""""""""" Public API ---------- ..
autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.PandasOnDaskDataframeRowPartition :members: ================================================ FILE: docs/flow/modin/core/execution/dispatching.rst ================================================ :orphan: .. TODO: add links to documentation for mentioned modules. Factories Module Description """""""""""""""""""""""""""" Brief description ''''''''''''''''' Modin has several execution engines and storage formats, combining them together forms certain executions.  Calling any :py:class:`~modin.pandas.dataframe.DataFrame` API function will end up in some execution-specific method. The responsibility of dispatching high-level API calls to execution-specific function belongs to the :ref:`QueryCompiler `, which is determined at the time of the dataframe's creation by the factory of the corresponding execution. The mission of this module is to route IO function calls from the API level to its actual execution-specific implementations, which builds the `QueryCompiler` of the appropriate execution. Execution representation via Factories '''''''''''''''''''''''''''''''''''''' Execution is a combination of the :doc:`storage format ` and an actual execution engine. For example, ``PandasOnRay`` execution means the combination of the `pandas storage format` and `Ray` engine. Each storage format has its own :ref:`Query Compiler ` which compiles the most efficient queries for the corresponding :doc:`Core Modin Dataframe ` implementation. Speaking about ``PandasOnRay`` execution, its Query Compiler is :doc:`PandasQueryCompiler ` and the Dataframe implementation is :doc:`PandasDataframe `, which is general implementation for every execution of the pandas storage format. The actual implementation of ``PandasOnRay`` dataframe is defined by the :doc:`PandasOnRayDataframe ` class that extends ``PandasDataframe``. 
In the scope of this module, each execution is represented with a factory class located in ``modin/core/execution/dispatching/factories/factories.py``. Each factory contains a field that identifies the IO module of the corresponding execution. This IO module is responsible for dispatching calls of IO functions to their actual implementations in the underlying IO module. For more information about IO module visit :doc:`IO ` page. Factory Dispatcher '''''''''''''''''' The :py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` class provides public methods whose interface corresponds to pandas IO functions, the only difference is that they return `QueryCompiler` of the selected storage format instead of high-level :py:class:`~modin.pandas.dataframe.DataFrame`. ``FactoryDispatcher`` is responsible for routing these IO calls to the factory which represents the selected execution. So when you call ``read_csv()`` function and your execution is ``PandasOnRay`` then the trace would be the following: .. figure:: /img/factory_dispatching.svg :align: center ``modin.pandas.read_csv`` calls ``FactoryDispatcher.read_csv``, which calls ``._read_csv`` function of the factory of the selected execution, in our case it's ``PandasOnRayFactory._read_csv``, which in turn forwards this call to the actual implementation of ``read_csv`` — to the ``PandasOnRayIO.read_csv``. The result of ``modin.pandas.read_csv`` will return a high-level Modin DataFrame with the appropriate `QueryCompiler` bound to it, which is responsible for dispatching all of the further function calls. Public API '''''''''' .. 
automodule:: modin.core.execution.dispatching.factories.factories :members: ================================================ FILE: docs/flow/modin/core/execution/python/implementations/pandas_on_python/dataframe.rst ================================================ PandasOnPythonDataframe """"""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` for the `Python` execution engine. It serves as an intermediate level between :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` and :py:class:`~modin.core.execution.python.implementations.pandas_on_python.partitioning.partition_manager.PandasOnPythonDataframePartitionManager`. Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.dataframe.dataframe.PandasOnPythonDataframe :members: ================================================ FILE: docs/flow/modin/core/execution/python/implementations/pandas_on_python/index.rst ================================================ :orphan: PandasOnPython Execution ======================== Queries that perform data transformation, data ingress or data egress using the `pandas on Python` execution pass through the Modin components detailed below. `pandas on Python` execution is sequential and is used for debugging purposes. To enable `pandas on Python` execution, please refer to the usage section in :doc:`pandas on Python `. Data Transformation ''''''''''''''''''' .. image:: /img/pandas_on_python_data_transform.svg :align: center When a user calls any :py:class:`~modin.pandas.dataframe.DataFrame` API, a query starts forming at the `API` layer to be executed at the `Execution` layer. The `API` layer is responsible for processing the query appropriately, for example, determining whether the final result should be a ``DataFrame`` or ``Series`` object.
This layer is also responsible for sanitizing the input to the :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler`, e.g. validating a parameter from the query and defining specific intermediate values to provide more context to the query compiler. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` is responsible for processing the query, received from the :py:class:`~modin.pandas.dataframe.DataFrame` `API` layer, to determine how to apply it to a subset of the data - either cell-wise or along an axis-wise partition backed by the `pandas` storage format. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` maps the query to one of the :doc:`Core Algebra Operators ` of the :py:class:`~modin.core.execution.python.implementations.pandas_on_python.dataframe.dataframe.PandasOnPythonDataframe` which inherits generic functionality from the :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe`. PandasOnPython Dataframe implementation --------------------------------------- This page describes implementation of :doc:`Modin PandasDataframe Objects ` specific for `PandasOnPython` execution. Since Python engine doesn't allow computation parallelization, operations on partitions are performed sequentially. The absence of parallelization doesn't give any performance speed-up, so ``PandasOnPython`` is used for testing purposes only. * :doc:`PandasOnPythonDataframe ` * :doc:`PandasOnPythonDataframePartition ` * :doc:`PandasOnPythonDataframeAxisPartition ` * :doc:`PandasOnPythonDataframePartitionManager ` .. toctree:: :hidden: dataframe partitioning/partition partitioning/axis_partition partitioning/partition_manager Data Ingress '''''''''''' .. image:: /img/pandas_on_python_data_ingress.svg :align: center Data Egress ''''''''''' .. 
image:: /img/pandas_on_python_data_egress.svg :align: center When a user calls any IO function from the ``modin.pandas.io`` module, the `API` layer queries the :py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` which defines a factory specific for the execution, namely, the :py:class:`~modin.core.execution.dispatching.factories.factories.PandasOnPythonFactory`. The factory, in turn, exposes the :py:class:`~modin.core.execution.python.implementations.pandas_on_python.io.PandasOnPythonIO` class whose responsibility is a read/write from/to a file. When reading data from a CSV file, for example, the :py:class:`~modin.core.execution.python.implementations.pandas_on_python.io.io.PandasOnPythonIO` class reads the data using corresponding `pandas` function (``pandas.read_csv()`` in this case). After the reading is complete, a new query compiler is created from `pandas` object using :py:meth:`~modin.core.execution.python.implementations.pandas_on_python.io.io.PandasOnPythonIO.from_pandas` and returned. When writing data to a CSV file, for example, the :py:class:`~modin.core.execution.python.implementations.pandas_on_python.io.PandasOnPythonIO` converts a query compiler to `pandas` object using :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.to_pandas`. After that, `pandas` writes the data to the file using corresponding function (``pandas.to_csv()`` in this case). ================================================ FILE: docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/axis_partition.rst ================================================ PandasOnPythonDataframeAxisPartition """""""""""""""""""""""""""""""""""" The class is specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition`, providing the API to perform operations on an axis partition, using Python as the execution engine. 
The axis partition is made up of a list of block partitions that are stored in this class. Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.partitioning.virtual_partition.PandasOnPythonDataframeAxisPartition PandasOnPythonDataframeColumnPartition """""""""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.partitioning.virtual_partition.PandasOnPythonDataframeColumnPartition :members: PandasOnPythonDataframeRowPartition """"""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.partitioning.virtual_partition.PandasOnPythonDataframeRowPartition :members: ================================================ FILE: docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.rst ================================================ PandasOnPythonDataframePartition """""""""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition`, providing the API to perform operations on a block partition using Python as the execution engine. In addition to wrapping a ``pandas.DataFrame``, the class also holds the following metadata: * ``length`` - length of ``pandas.DataFrame`` wrapped * ``width`` - width of ``pandas.DataFrame`` wrapped An operation on a block partition can be performed in two modes: * immediately via :meth:`~modin.core.execution.python.implementations.pandas_on_python.partitioning.partition.PandasOnPythonDataframePartition.apply` - in this case the accumulated call queue and the new function will be executed immediately.
* lazily_ via :meth:`~modin.core.execution.python.implementations.pandas_on_python.partitioning.partition.PandasOnPythonDataframePartition.add_to_apply_calls` - in this case the function will be added to the call queue and no computations will be done at the moment. Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.partitioning.partition.PandasOnPythonDataframePartition :members: .. _lazily: https://en.wikipedia.org/wiki/Lazy_evaluation ================================================ FILE: docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition_manager.rst ================================================ PandasOnPythonDataframePartitionManager """"""""""""""""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition_manager.PandasDataframePartitionManager` using Python as the execution engine. This class is responsible for partition manipulation and applying a function to block/row/column partitions. Public API ---------- .. autoclass:: modin.core.execution.python.implementations.pandas_on_python.partitioning.partition_manager.PandasOnPythonDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/ray/generic.rst ================================================ :orphan: Generic Ray-based members ========================= Objects which are storage format agnostic but require specific Ray implementation are placed in ``modin.core.execution.ray.generic``. Their purpose is to implement certain parallel I/O operations and to serve as a foundation for building storage format specific objects: .. autoclass:: modin.core.execution.ray.generic.io.RayIO :members: ..
autoclass:: modin.core.execution.ray.generic.partitioning.GenericRayDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/dataframe.rst ================================================ PandasOnRayDataframe """""""""""""""""""" The class is specific implementation of :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class using Ray distributed engine. It serves as an intermediate level between :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` and :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframePartitionManager`. Public API ---------- .. autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.dataframe.PandasOnRayDataframe :members: ================================================ FILE: docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/index.rst ================================================ :orphan: PandasOnRay Execution ===================== Queries that perform data transformation, data ingress or data egress using the `pandas on Ray` execution pass through the Modin components detailed below. To enable `pandas on Ray` execution, please refer to the usage section in :doc:`pandas on Ray `. Data Transformation ''''''''''''''''''' .. image:: /img/pandas_on_ray_data_transform.svg :align: center When a user calls any :py:class:`~modin.pandas.dataframe.DataFrame` API, a query starts forming at the `API` layer to be executed at the `Execution` layer. The `API` layer is responsible for processing the query appropriately, for example, determining whether the final result should be a ``DataFrame`` or ``Series`` object. This layer is also responsible for sanitizing the input to the :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler`, e.g. 
validating a parameter from the query and defining specific intermediate values to provide more context to the query compiler. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` is responsible for processing the query, received from the :py:class:`~modin.pandas.dataframe.DataFrame` `API` layer, to determine how to apply it to a subset of the data - either cell-wise or along an axis-wise partition backed by the `pandas` storage format. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` maps the query to one of the :doc:`Core Algebra Operators </flow/modin/core/dataframe/algebra>` of the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.dataframe.PandasOnRayDataframe` which inherits generic functionality from the ``GenericRayDataframe`` and the :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe`. .. TODO: insert a link to ``GenericRayDataframe`` once we add an implementation of the class PandasOnRay Dataframe implementation ------------------------------------ Modin implements ``Dataframe``, ``PartitionManager``, ``VirtualPartition`` (a specific kind of ``AxisPartition`` with the capability to combine smaller partitions into a single "virtual" one) and ``Partition`` classes specifically for the ``PandasOnRay`` execution: * :doc:`PandasOnRayDataframe <dataframe>` * :doc:`PandasOnRayDataframePartition <partitioning/partition>` * :doc:`PandasOnRayDataframeVirtualPartition <partitioning/axis_partition>` * :doc:`PandasOnRayDataframePartitionManager <partitioning/partition_manager>` .. toctree:: :hidden: dataframe partitioning/partition partitioning/axis_partition partitioning/partition_manager Data Ingress '''''''''''' .. image:: /img/pandas_on_ray_data_ingress.svg :align: center Data Egress ''''''''''' ..
image:: /img/pandas_on_ray_data_egress.svg :align: center When a user calls any IO function from the ``modin.pandas.io`` module, the `API` layer queries the :py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` which defines a factory specific for the execution, namely, the :py:class:`~modin.core.execution.dispatching.factories.factories.PandasOnRayFactory`. The factory, in turn, exposes the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.io.PandasOnRayIO` class whose responsibility is to perform a parallel read/write from/to a file. When reading data from a CSV file, for example, the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.io.PandasOnRayIO` class forwards the user query to the :meth:`~modin.core.io.text.CSVDispatcher._read` method of :py:class:`~modin.core.io.text.CSVDispatcher`, where the query's parameters are preprocessed to check if they are supported by the execution (defaulting to pandas if they are not) and some metadata common to all partitions to be read is computed. Then, the file is split into row chunks, and this data is used to launch remote tasks on the Ray workers via the :meth:`~modin.core.execution.ray.common.RayWrapper.deploy` method of :py:class:`~modin.core.execution.ray.common.RayWrapper`. On each Ray worker, the :py:class:`~modin.core.storage_formats.pandas.parsers.PandasCSVParser` parses the data. After the remote tasks are finished, additional result postprocessing is performed, and a new query compiler with the data read is returned. When writing data to a CSV file, for example, the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.io.PandasOnRayIO` processes the user query to execute it on Ray workers.
Then, the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.io.PandasOnRayIO` asks the :py:class:`~modin.core.execution.ray.implementations.pandas_on_ray.dataframe.PandasOnRayDataframe` to decompose the data into row-wise partitions that will be written into the file in parallel in Ray workers. ================================================ FILE: docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/axis_partition.rst ================================================ PandasOnRayDataframeVirtualPartition """""""""""""""""""""""""""""""""""" This class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition`, providing the API to perform operations on an axis partition, using Ray as an execution engine. The virtual partition is a wrapper over a list of block partitions, which are stored in this class, with the capability to combine the smaller partitions into the one "virtual". Public API ---------- .. autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframeVirtualPartition :members: PandasOnRayDataframeColumnPartition """"""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframeColumnPartition :members: PandasOnRayDataframeRowPartition """""""""""""""""""""""""""""""" Public API ---------- .. 
autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframeRowPartition :members: ================================================ FILE: docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.rst ================================================ PandasOnRayDataframePartition """"""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition`, providing the API to perform operations on a block partition, namely, ``pandas.DataFrame``, using Ray as an execution engine. In addition to wrapping a ``pandas.DataFrame``, the class also holds the following metadata: * ``length`` - length of ``pandas.DataFrame`` wrapped * ``width`` - width of ``pandas.DataFrame`` wrapped * ``ip`` - node IP address that holds ``pandas.DataFrame`` wrapped An operation on a block partition can be performed in two modes: * asynchronously_ - via :meth:`~modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframePartition.apply` * lazily_ - via :meth:`~modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframePartition.add_to_apply_calls` Public API ---------- .. autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframePartition :members: .. _asynchronously: https://en.wikipedia.org/wiki/Asynchrony_(computer_programming) .. _lazily: https://en.wikipedia.org/wiki/Lazy_evaluation ================================================ FILE: docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.rst ================================================ PandasOnRayDataframePartitionManager """""""""""""""""""""""""""""""""""" This class is the specific implementation of :py:class:`~modin.core.execution.ray.generic.partitioning.GenericRayDataframePartitionManager` using Ray distributed engine. 
This class is responsible for partition manipulation and applying a function to block/row/column partitions. Public API ---------- .. autoclass:: modin.core.execution.ray.implementations.pandas_on_ray.partitioning.PandasOnRayDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/unidist/generic.rst ================================================ :orphan: Generic Unidist-based members ============================= Objects which are storage format agnostic but require specific Unidist implementation are placed in ``modin.core.execution.unidist.generic``. Their purpose is to implement certain parallel I/O operations and to serve as a foundation for building storage format specific objects: .. autoclass:: modin.core.execution.unidist.generic.io.UnidistIO :members: .. autoclass:: modin.core.execution.unidist.generic.partitioning.GenericUnidistDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe.rst ================================================ PandasOnUnidistDataframe """""""""""""""""""""""" The class is specific implementation of :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class using Unidist distributed engine. It serves as an intermediate level between :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` and :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframePartitionManager`. Public API ---------- .. 
autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.dataframe.PandasOnUnidistDataframe :members: ================================================ FILE: docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/index.rst ================================================ :orphan: PandasOnUnidist Execution ========================= Queries that perform data transformation, data ingress or data egress using the `pandas on Unidist` execution pass through the Modin components detailed below. To enable `pandas on MPI through unidist` execution, please refer to the usage section in :doc:`pandas on MPI through unidist </development/using_pandas_on_mpi>`. Data Transformation ''''''''''''''''''' .. image:: /img/pandas_on_unidist_data_transform.svg :align: center When a user calls any :py:class:`~modin.pandas.dataframe.DataFrame` API, a query starts forming at the `API` layer to be executed at the `Execution` layer. The `API` layer is responsible for processing the query appropriately, for example, determining whether the final result should be a ``DataFrame`` or ``Series`` object. This layer is also responsible for sanitizing the input to the :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler`, e.g. validating a parameter from the query and defining specific intermediate values to provide more context to the query compiler. The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` is responsible for processing the query, received from the :py:class:`~modin.pandas.dataframe.DataFrame` `API` layer, to determine how to apply it to a subset of the data - either cell-wise or along an axis-wise partition backed by the `pandas` storage format.
The :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` maps the query to one of the :doc:`Core Algebra Operators </flow/modin/core/dataframe/algebra>` of the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.dataframe.PandasOnUnidistDataframe` which inherits generic functionality from the ``GenericUnidistDataframe`` and the :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe`. .. TODO: insert a link to ``GenericUnidistDataframe`` once we add an implementation of the class PandasOnUnidist Dataframe implementation ---------------------------------------- Modin implements ``Dataframe``, ``PartitionManager``, ``VirtualPartition`` (a specific kind of ``AxisPartition`` with the capability to combine smaller partitions into a single "virtual" one) and ``Partition`` classes specifically for the ``PandasOnUnidist`` execution: * :doc:`PandasOnUnidistDataframe <dataframe>` * :doc:`PandasOnUnidistDataframePartition <partitioning/partition>` * :doc:`PandasOnUnidistDataframeVirtualPartition <partitioning/axis_partition>` * :doc:`PandasOnUnidistDataframePartitionManager <partitioning/partition_manager>` .. toctree:: :hidden: dataframe partitioning/partition partitioning/axis_partition partitioning/partition_manager Data Ingress '''''''''''' .. image:: /img/pandas_on_unidist_data_ingress.svg :align: center Data Egress ''''''''''' .. image:: /img/pandas_on_unidist_data_egress.svg :align: center When a user calls any IO function from the ``modin.pandas.io`` module, the `API` layer queries the :py:class:`~modin.core.execution.dispatching.factories.dispatcher.FactoryDispatcher` which defines a factory specific for the execution, namely, the :py:class:`~modin.core.execution.dispatching.factories.factories.PandasOnUnidistFactory`. The factory, in turn, exposes the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.io.PandasOnUnidistIO` class whose responsibility is to perform a parallel read/write from/to a file.
When reading data from a CSV file, for example, the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.io.PandasOnUnidistIO` class forwards the user query to the :meth:`~modin.core.io.text.CSVDispatcher._read` method of :py:class:`~modin.core.io.text.CSVDispatcher`, where the query's parameters are preprocessed to check if they are supported by the execution (defaulting to pandas if they are not) and some metadata common to all partitions to be read is computed. Then, the file is split into row chunks, and this data is used to launch remote tasks on the Unidist workers via the :meth:`~modin.core.execution.unidist.common.UnidistWrapper.deploy` method of :py:class:`~modin.core.execution.unidist.common.UnidistWrapper`. On each Unidist worker, the :py:class:`~modin.core.storage_formats.pandas.parsers.PandasCSVParser` parses the data. After the remote tasks are finished, additional result postprocessing is performed, and a new query compiler with the data read is returned. When writing data to a CSV file, for example, the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.io.PandasOnUnidistIO` processes the user query to execute it on Unidist workers. Then, the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.io.PandasOnUnidistIO` asks the :py:class:`~modin.core.execution.unidist.implementations.pandas_on_unidist.dataframe.PandasOnUnidistDataframe` to decompose the data into row-wise partitions that will be written into the file in parallel in Unidist workers.
================================================ FILE: docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/axis_partition.rst ================================================ PandasOnUnidistDataframeVirtualPartition """""""""""""""""""""""""""""""""""""""" This class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition`, providing the API to perform operations on an axis partition, using Unidist as an execution engine. The virtual partition is a wrapper over a list of block partitions, which are stored in this class, with the capability to combine the smaller partitions into the one "virtual". Public API ---------- .. autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframeVirtualPartition :members: PandasOnUnidistDataframeColumnPartition """"""""""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframeColumnPartition :members: PandasOnUnidistDataframeRowPartition """""""""""""""""""""""""""""""""""" Public API ---------- .. autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframeRowPartition :members: ================================================ FILE: docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.rst ================================================ PandasOnUnidistDataframePartition """"""""""""""""""""""""""""""""" The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.partition.PandasDataframePartition`, providing the API to perform operations on a block partition, namely, ``pandas.DataFrame``, using Unidist as an execution engine. 
In addition to wrapping a ``pandas.DataFrame``, the class also holds the following metadata: * ``length`` - length of ``pandas.DataFrame`` wrapped * ``width`` - width of ``pandas.DataFrame`` wrapped * ``ip`` - node IP address that holds ``pandas.DataFrame`` wrapped An operation on a block partition can be performed in two modes: * asynchronously_ - via :meth:`~modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframePartition.apply` * lazily_ - via :meth:`~modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframePartition.add_to_apply_calls` Public API ---------- .. autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframePartition :members: .. _asynchronously: https://en.wikipedia.org/wiki/Asynchrony_(computer_programming) .. _lazily: https://en.wikipedia.org/wiki/Lazy_evaluation ================================================ FILE: docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition_manager.rst ================================================ PandasOnUnidistDataframePartitionManager """""""""""""""""""""""""""""""""""""""" This class is the specific implementation of :py:class:`~modin.core.execution.unidist.generic.partitioning.GenericUnidistDataframePartitionManager` using Unidist distributed engine. This class is responsible for partition manipulation and applying a function to block/row/column partitions. Public API ---------- .. 
autoclass:: modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning.PandasOnUnidistDataframePartitionManager :members: ================================================ FILE: docs/flow/modin/core/io/index.rst ================================================ :orphan: IO Module Description """"""""""""""""""""" Dispatcher Classes Workflow Overview '''''''''''''''''''''''''''''''''''' Calls from ``read_*`` functions of execution-specific IO classes (for example, ``PandasOnRayIO`` for Ray engine and pandas storage format) are forwarded to the ``_read`` function of the file format-specific class (for example ``CSVDispatcher`` for CSV files), where function parameters are preprocessed to check if they are supported (defaulting to pandas if not) and common metadata is computed for all partitions. The file is then split into chunks (splitting mechanism described below) and the data is used to launch tasks on the remote workers. After the remote tasks finish, additional postprocessing is performed on the results, and a new query compiler with the imported data will be returned. Data File Splitting Mechanism ''''''''''''''''''''''''''''' Modin's file splitting mechanism differs depending on the data format type: * text format type - the file is split into bytes according to user-specified arguments. In the simplest case, when no row related parameters (such as ``nrows`` or ``skiprows``) are passed, data chunk limits (start and end bytes) are derived by dividing the file size by the number of partitions (chunks can differ slightly from each other because the end byte usually falls inside a line, in which case the last byte of that line is used instead of the initial value). In other cases the same splitting mechanism is used, but chunk sizes are defined according to the number of lines that each partition should contain. * columnar store type - the file is split so that each chunk contains approximately the same number of columns.
* SQL type - chunking is obtained by wrapping the initial SQL query with a query that specifies the initial row offset and the number of rows in the chunk. After file splitting is complete, the chunk data is passed to the parser functions (``PandasCSVParser.parse`` for ``read_csv`` function with pandas storage format) for further processing on each worker. Submodules Description '''''''''''''''''''''' ``modin.core.io`` module is used mostly for storing utils and dispatcher classes for reading files of different formats. * ``io.py`` - class containing basic utils and default implementation of IO functions. * ``file_dispatcher.py`` - class reading data from different kinds of files and handling some util functions common for all formats. Also this class contains the ``read`` function, which is the entry point for the ``_read`` functions of all dispatchers. * text - directory for storing all text file format dispatcher classes * ``text_file_dispatcher.py`` - class for reading text format files. This class holds ``partitioned_file`` function for splitting text format files into chunks, ``offset`` function for moving the file offset by the specified number of bytes, ``_read_rows`` function for moving the file offset by the specified number of rows and many other functions. * format/feature specific dispatchers: ``csv_dispatcher.py``, ``excel_dispatcher.py``, ``fwf_dispatcher.py`` and ``json_dispatcher.py``. * column_stores - directory for storing all columnar store file format dispatcher classes * ``column_store_dispatcher.py`` - class for reading columnar type files. This class holds ``build_query_compiler`` function that performs file splitting, deploying remote tasks and results postprocessing and many other functions. * format/feature specific dispatchers: ``feather_dispatcher.py``, ``hdf_dispatcher.py`` and ``parquet_dispatcher.py``. * sql - directory for storing SQL dispatcher class * ``sql_dispatcher.py`` - class for reading SQL queries or database tables. Public API '''''''''' ..
automodule:: modin.core.io :members: Handling ``skiprows`` Parameter ''''''''''''''''''''''''''''''' Handling ``skiprows`` parameter by pandas import functions can be very tricky, especially for ``read_csv`` function because of interconnection with ``header`` parameter. In this section the techniques of ``skiprows`` processing by both pandas and Modin are covered. Processing ``skiprows`` by pandas ================================= Let's consider a simple snippet with ``pandas.read_csv`` in order to understand interconnection of ``header`` and ``skiprows`` parameters: .. code-block:: python import pandas from io import StringIO data = """0 1 2 3 4 5 6 7 8 """ # `header` parameter absence is equivalent to `header="infer"` or `header=0` # rows 1, 5, 6, 7, 8 are read with header "0" df = pandas.read_csv(StringIO(data), skiprows=[2, 3, 4]) # rows 5, 6, 7, 8 are read with header "1", row 0 is skipped additionally df = pandas.read_csv(StringIO(data), skiprows=[2, 3, 4], header=1) # rows 6, 7, 8 are read with header "5", rows 0, 1 are skipped additionally df = pandas.read_csv(StringIO(data), skiprows=[2, 3, 4], header=2) In the examples above list-like ``skiprows`` values are fixed and ``header`` is varied. In the first example with no ``header`` provided, rows 2, 3, 4 are skipped and row 0 is considered as the header. In the second example ``header == 1``, so the zeroth row is skipped and the next available row is considered the header. The third example illustrates when the ``header`` and ``skiprows`` parameters values are both present - in this case ``skiprows`` rows are dropped first and then the ``header`` is derived from the remaining rows (rows before header are skipped too). In the examples above only list-like ``skiprows`` and integer ``header`` parameters are considered, but the same logic is applicable for other types of the parameters. 
Processing ``skiprows`` by Modin ================================ As can be seen, skipping rows in the pandas import functions is complicated and distributing this logic across multiple workers can complicate it even more. Thus in some rare corner cases the default pandas implementation is used in Modin to avoid excessive Modin code complication. Modin uses two techniques for skipping rows: 1) During file partitioning (setting file limits that should be read by each partition) exact rows can be excluded from partitioning scope, thus they won't be read at all and can be considered as skipped. This is the most effective way of skipping rows since it doesn't require any actual data reading and postprocessing, but in this case the ``skiprows`` parameter can be an integer only. When it is possible Modin always uses this approach. 2) Rows for skipping can be dropped after full dataset import. This is a more expensive way since it requires extra IO work and postprocessing afterwards, but the ``skiprows`` parameter can be of any non-integer type supported by ``pandas.read_csv``. In some cases, if ``skiprows`` is a uniformly distributed array (e.g. [1, 2, 3]), ``skiprows`` can be "squashed" and represented as an integer to make a fastpath by skipping these rows during file partitioning (using the first option). But if there is a gap between the first row for skipping and the last line of the header (that will be skipped too since header is read by each partition to ensure metadata is defined properly), then this gap should be assigned for reading first by assigning the first partition to read these rows by setting the ``pre_reading`` parameter. Let's consider an example of skipping rows during partitioning when ``header="infer"`` and ``skiprows=[3, 4, 5]``. In this specific case fastpath can be done since ``skiprows`` is a uniformly distributed array, so we can "squash" it to an integer and set "partitioning" skiprows to 3.
But if no additional action is done, these three rows will be skipped right after the header line, which corresponds to ``skiprows=[1, 2, 3]``. To avoid this discrepancy, we need to assign the first partition to read data between the header line and the first row for skipping by setting the special ``pre_reading`` parameter to 2. Then, after the skipping of rows considered to be skipped during partitioning, the rest of the data will be divided between the rest of partitions, see rows assignment below: .. code-block:: 0 - header line (skip during partitioning) 1 - pre reading (assign to read by the first partition) 2 - pre reading (assign to read by the first partition) 3 - "partitioning" skiprows (skip during partitioning) 4 - "partitioning" skiprows (skip during partitioning) 5 - "partitioning" skiprows (skip during partitioning) 6 - data to partition (divide between the rest of partitions) 7 - data to partition (divide between the rest of partitions) ================================================ FILE: docs/flow/modin/core/storage_formats/base/query_compiler.rst ================================================ BaseQueryCompiler """"""""""""""""" Brief description ''''''''''''''''' :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler` is an abstract class of query compiler, and sets a common interface that every other query compiler implementation in Modin must follow. The Base class contains basic implementations for most of the interface methods, all of which :doc:`fallback to pandas `.
Subclassing :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler` ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' If you want to add new type of query compiler to Modin the new class needs to inherit from :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler` and implement the abstract methods: - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.from_pandas` build query compiler from pandas DataFrame. - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.from_arrow` build query compiler from Arrow Table. - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.to_pandas` get query compiler representation as pandas DataFrame. - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.default_to_pandas` do :doc:`fallback to pandas ` for the passed function. - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.finalize` finalize object constructing. - :py:meth:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler.free` trigger memory cleaning. (Please refer to the code documentation to see the full documentation for these functions). This is a minimum set of operations to ensure a new query compiler will function in the Modin architecture, and the rest of the API can safely default to the pandas implementation via the base class implementation. To add a storage format specific implementation for some of the query compiler operations, just override the corresponding method in your query compiler class. Example ''''''' As an exercise let's define a new query compiler in `Modin`, just to see how easy it is. Usually, the query compiler routes formed queries to the underlying :doc:`frame ` class, which submits operators to an execution engine. For the sake of simplicity and independence of this example, our execution engine will be the pandas itself. 
We need to inherit a new class from :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler` and implement all of the abstract methods. In this case, with `pandas` as an execution engine, it's trivial: .. code-block:: python from modin.core.storage_formats import BaseQueryCompiler class DefaultToPandasQueryCompiler(BaseQueryCompiler): def __init__(self, pandas_df): self._pandas_df = pandas_df @classmethod def from_pandas(cls, df, *args, **kwargs): return cls(df) @classmethod def from_arrow(cls, at, *args, **kwargs): return cls(at.to_pandas()) def to_pandas(self): return self._pandas_df.copy() def default_to_pandas(self, pandas_op, *args, **kwargs): return type(self)(pandas_op(self.to_pandas(), *args, **kwargs)) def finalize(self): pass def free(self): pass All done! Now you've got a fully functional query compiler, which is ready for extensions and already can be used in Modin DataFrame: .. code-block:: python import pandas pandas_df = pandas.DataFrame({"col1": [1, 2, 2, 1], "col2": [10, 2, 3, 40]}) # Building our query compiler from pandas object qc = DefaultToPandasQueryCompiler.from_pandas(pandas_df) import modin.pandas as pd # Building Modin DataFrame from newly created query compiler modin_df = pd.DataFrame(query_compiler=qc) # Got fully functional Modin DataFrame >>> print(modin_df.groupby("col1").sum().reset_index()) col1 col2 0 1 50 1 2 5 To be able to select this query compiler as default via ``modin.config`` you also need to define the combination of your query compiler and pandas engine as an execution by adding the corresponding factory. To find more information about factories, visit :doc:`dispatching ` page. Query Compiler API '''''''''''''''''' .. 
autoclass:: modin.core.storage_formats.base.query_compiler.BaseQueryCompiler :members: ================================================ FILE: docs/flow/modin/core/storage_formats/index.rst ================================================ :orphan: Storage Formats =============== Storage format is one of the components that form Modin's execution, it describes the type(s) of objects that are stored in the partitions of the selected Core Modin Dataframe implementation. The base storage format in Modin is pandas. In that format, Modin Dataframe operates with partitions that hold ``pandas.DataFrame`` objects. Pandas is the most natural storage format since high-level DataFrame objects mirror its API. The storage format + execution engine (Ray, Dask, etc.) form the execution backend. The Query Compiler (QC) converts high-level pandas API calls to queries that are understood by the execution backend. .. _query_compiler_def: Query Compiler ============== .. toctree:: :hidden: base/query_compiler pandas/index Modin supports several execution backends (storage format + execution engine). Calling any DataFrame API function will end up in some execution-specific method. The query compiler is a bridge between pandas DataFrame API and the actual Core Modin Dataframe implementation for the corresponding execution. .. image:: /img/simplified_query_flow.svg :align: right :width: 300px Each storage format has its own Query Compiler class that implements the most optimal query routing for the selected format. Query compilers of all storage formats implement a common API, which is used by the high-level Modin DataFrame to support dataframe queries. The role of the query compiler is to translate its API into a pairing of known user-defined functions and dataframe algebra operators. Each query compiler instance contains a :doc:`Core Modin Dataframe ` of the selected execution implementation and queries it with the compiled queries to get the result. 
The query compiler object is immutable, so the result of every method is a new query compiler. The query compilers API is defined by the :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler` class and may resemble the pandas API, however, they're not equal. The query compilers API is significantly reduced in comparison with pandas, since many corner cases or even the whole methods can be handled at the API layer with the existing API. The query compiler is the level where Modin stops distinguishing DataFrame and Series (or column) objects. A Series is represented by a `1xN` query compiler, where the Series name is the column label. If Series is unnamed, then the label is ``MODIN_UNNAMED_SERIES_LABEL``, which is equal to ``"__reduced__"``. The high-level DataFrame API layer interprets a one-column query compiler as Series or DataFrame depending on the operation context. .. note:: Although we're declaring that there is no difference between DataFrame and Series at the query compiler, you still may find methods like ``method_ser`` and ``method_df`` which are implemented differently because they're emulating either Series or DataFrame logic, or you may find parameters, which indicates whether this one-column query compiler is representing Series or not. All of these are hacks, and we're working on getting rid of them. High-level module overview '''''''''''''''''''''''''' This module houses submodules of all of the stable storage formats: - :doc:`Base module ` contains an abstract query compiler class which defines common API. - :doc:`Pandas module ` contains query compiler and text parsers for pandas storage format. ================================================ FILE: docs/flow/modin/core/storage_formats/pandas/index.rst ================================================ :orphan: Pandas storage format """"""""""""""""""""" .. 
toctree:: :hidden: query_compiler parsers High-Level Module Overview '''''''''''''''''''''''''' This module houses submodules which are responsible for communication between the query compiler level and execution implementation level for pandas storage format: - :doc:`Query compiler ` is responsible for compiling efficient queries for :doc:`PandasDataframe `. - :doc:`Parsers ` are responsible for parsing data on workers during IO operations. ================================================ FILE: docs/flow/modin/core/storage_formats/pandas/parsers.rst ================================================ Pandas Parsers Module Description """"""""""""""""""""""""""""""""" High-Level Module Overview '''''''''''''''''''''''''' This module houses parser classes (classes that are used for data parsing on the workers) and util functions for handling parsing results. ``PandasParser`` is the base class for parser classes with the pandas storage format; it contains methods common to all child classes. Other module classes implement the ``parse`` function, which parses data of a specific format based on the chunk information computed in the ``modin.core.io`` module. After the chunk is parsed, the resulting ``DataFrame``-s will be split into smaller ``DataFrame``-s according to the ``num_splits`` parameter, data type, or number of rows/columns in the parsed chunk. These frames, along with some additional metadata, are then returned. .. note:: If you are interested in the data parsing mechanism implementation details, please refer to the source code documentation. Public API '''''''''' ..
automodule:: modin.core.storage_formats.pandas.parsers :members: ================================================ FILE: docs/flow/modin/core/storage_formats/pandas/query_compiler.rst ================================================ PandasQueryCompiler """"""""""""""""""" :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` is responsible for compiling a set of known predefined functions and pairing those with dataframe algebra operators in the :doc:`PandasDataframe `, specifically for dataframes backed by ``pandas.DataFrame`` objects. Each :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` contains an instance of :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` which it queries to get the result. :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` supports methods built by the :doc:`algebra module `. If you want to add an implementation for a query compiler method, visit the algebra module documentation to see whether the new operation fits one of the existing function templates and can be easily implemented with them. Public API '''''''''' :py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler` implements common query compilers API defined by the :py:class:`~modin.core.storage_formats.base.query_compiler.BaseQueryCompiler`. Some functionalities are inherited from the base class, in the following section only overridden methods are presented. .. autoclass:: modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler :members: ================================================ FILE: docs/flow/modin/distributed/dataframe/pandas.rst ================================================ Pandas partitioning API ======================= This page contains a description of the API to extract partitions from and build Modin Dataframes. unwrap_partitions ----------------- .. 
autofunction:: modin.distributed.dataframe.pandas.unwrap_partitions from_partitions --------------- .. autofunction:: modin.distributed.dataframe.pandas.from_partitions Example ------- .. code-block:: python import modin.pandas as pd from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions import numpy as np data = np.random.randint(0, 100, size=(2 ** 10, 2 ** 8)) df = pd.DataFrame(data) partitions = unwrap_partitions(df, axis=0, get_ip=True) print(partitions) new_df = from_partitions(partitions, axis=0) print(new_df) ================================================ FILE: docs/flow/modin/experimental/batch.rst ================================================ Batch Pipeline API """""""""""""""""" This API exposes the ability to pipeline row-parallel batch queries on a Modin DataFrame. Currently, this feature is only supported for the ``PandasOnRay`` execution. API ''' .. automodule:: modin.experimental.batch.pipeline :members: ================================================ FILE: docs/flow/modin/experimental/core/io/index.rst ================================================ :orphan: Experimental IO Module Description """""""""""""""""""""""""""""""""" The module is used mostly for storing experimental utils and dispatcher classes for reading/writing files of different formats. Submodules Description '''''''''''''''''''''' * text - directory for storing all text file format dispatcher classes * format/feature specific dispatchers: ``csv_glob_dispatcher.py``, ``custom_text_dispatcher.py``. * sql - directory for storing SQL dispatcher class * format/feature specific dispatchers: ``sql_dispatcher.py`` * pickle - directory for storing Pickle dispatcher class * format/feature specific dispatchers: ``pickle_dispatcher.py`` Public API '''''''''' .. 
automodule:: modin.experimental.core.io :members: ================================================ FILE: docs/flow/modin/experimental/index.rst ================================================ :orphan: Experimental Modules Overview """"""""""""""""""""""""""""" In some cases Modin can give the user the opportunity to extend (not modify) typical pandas API or to try new functionality in order to get more flexibility. Depending on the exact experimental feature user may need to install additional packages, change configurations or replace the standard Modin import statement ``import modin.pandas as pd`` with modified version ``import modin.experimental.pandas as pd``. ``modin.experimental`` holds experimental functionality that is under development right now and provides a limited set of functionality: * :doc:`xgboost ` * :doc:`sklearn ` * :doc:`batch ` .. toctree:: :hidden: sklearn xgboost batch ================================================ FILE: docs/flow/modin/experimental/pandas.rst ================================================ :orphan: Experimental Pandas API """"""""""""""""""""""" .. automodule:: modin.experimental.pandas :noindex: Experimental API Reference '''''''''''''''''''''''''' .. autofunction:: read_sql .. autofunction:: read_csv_glob .. autofunction:: read_custom_text .. autofunction:: read_pickle_glob .. autofunction:: read_parquet_glob .. autofunction:: read_json_glob .. autofunction:: read_xml_glob .. automethod:: modin.pandas.DataFrame.modin::to_pandas .. automethod:: modin.pandas.DataFrame.modin::to_ray .. automethod:: modin.pandas.DataFrame.modin::to_pickle_glob .. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob .. automethod:: modin.pandas.DataFrame.modin::to_json_glob .. automethod:: modin.pandas.DataFrame.modin::to_xml_glob ================================================ FILE: docs/flow/modin/experimental/range_partitioning_groupby.rst ================================================ :orphan: .. redirect to the new page .. 
raw:: html ================================================ FILE: docs/flow/modin/experimental/reshuffling_groupby.rst ================================================ :orphan: .. redirect to the new page .. raw:: html ================================================ FILE: docs/flow/modin/experimental/sklearn.rst ================================================ Scikit-learn module description """"""""""""""""""""""""""""""" This module holds experimental scikit-learn-specific functionality for Modin. API ''' .. automodule:: modin.experimental.sklearn.model_selection :members: ================================================ FILE: docs/flow/modin/experimental/xgboost.rst ================================================ Modin XGBoost module description """""""""""""""""""""""""""""""" High-level Module Overview '''''''''''''''''''''''''' This module holds classes, public interface and internal functions for distributed XGBoost in Modin. Public classes :py:class:`~modin.experimental.xgboost.Booster`, :py:class:`~modin.experimental.xgboost.DMatrix` and function :py:func:`~modin.experimental.xgboost.train` provide the user with familiar XGBoost interfaces. They are located in the ``modin.experimental.xgboost.xgboost`` module. The internal module ``modin.experimental.xgboost.xgboost.xgboost_ray`` contains the implementation of Modin XGBoost for the Ray execution engine. This module mainly consists of the Ray actor-class :py:class:`~modin.experimental.xgboost.xgboost_ray.ModinXGBoostActor`, a function to distribute Modin's partitions between actors :py:func:`~modin.experimental.xgboost.xgboost_ray._assign_row_partitions_to_actors`, an internal :py:func:`~modin.experimental.xgboost.xgboost_ray._train`/:py:func:`~modin.experimental.xgboost.xgboost_ray._predict` function used from the public interfaces and additional util functions for computing cluster resources, actor creations etc. 
Public interfaces ''''''''''''''''' :py:class:`~modin.experimental.xgboost.DMatrix` inherits original class ``xgboost.DMatrix`` and overrides its constructor, which currently supports only `data` and `label` parameters. Both of the parameters must be ``modin.pandas.DataFrame``, which will be internally unwrapped to lists of delayed objects of Modin's row partitions using the function :py:func:`~modin.distributed.dataframe.pandas.unwrap_partitions`. .. autoclass:: modin.experimental.xgboost.DMatrix :members: :py:class:`~modin.experimental.xgboost.Booster` inherits original class ``xgboost.Booster`` and overrides method ``predict``. The difference from original class interface for ``predict`` method is changing the type of the `data` parameter to :py:class:`~modin.experimental.xgboost.DMatrix`. .. autoclass:: modin.experimental.xgboost.Booster :members: :py:func:`~modin.experimental.xgboost.train` function has 2 differences from the original ``train`` function - (1) the data type of `dtrain` parameter is :py:class:`~modin.experimental.xgboost.DMatrix` and (2) a new parameter `num_actors`. .. autofunction:: modin.experimental.xgboost.train Internal execution flow on Ray engine ''''''''''''''''''''''''''''''''''''' Internal functions :py:func:`~modin.experimental.xgboost.xgboost_ray._train` and :py:func:`~modin.experimental.xgboost.xgboost_ray._predict` work similar to xgboost. Training ******** 1. The data is passed to the :py:func:`~modin.experimental.xgboost.xgboost_ray._train` function as a :py:class:`~modin.experimental.xgboost.DMatrix` object. Lists of ``ray.ObjectRef`` corresponding to row partitions of Modin DataFrames are extracted by iterating over the :py:class:`~modin.experimental.xgboost.DMatrix`. Example: .. code-block:: python # Extract lists of row partitions from dtrain (DMatrix object) X_row_parts, y_row_parts = dtrain .. 2. On this step, the parameter `num_actors` is processed. 
The internal function :py:func:`~modin.experimental.xgboost.xgboost_ray._get_num_actors` examines the value provided by the user. If the value isn't provided, `num_actors` will be computed using the condition that one actor should use at most 2 CPUs. This condition was chosen to maximize the number of parallel workers while still using multithreaded XGBoost training (2 threads per worker will be used in this case). .. note:: The `num_actors` parameter is made available in the public function :py:func:`~modin.experimental.xgboost.train` to allow fine-tuning for obtaining the best performance in specific use cases. 3. :py:class:`~modin.experimental.xgboost.xgboost_ray.ModinXGBoostActor` objects are created. 4. Data `dtrain` is split between actors evenly. The internal function :py:func:`~modin.experimental.xgboost.xgboost_ray._split_data_across_actors` assigns row partitions to actors using the internal function :py:func:`~modin.experimental.xgboost.xgboost_ray._assign_row_partitions_to_actors`. This function creates a dictionary in the form: `{actor_rank: ([part_i0, part_i3, ..], [0, 3, ..]), ..}`. .. note:: :py:func:`~modin.experimental.xgboost.xgboost_ray._assign_row_partitions_to_actors` takes into account IP addresses of row partitions of `dtrain` data to minimize excess data transfer. 5. For each :py:class:`~modin.experimental.xgboost.xgboost_ray.ModinXGBoostActor` object, the ``set_train_data`` method is called remotely. This method loads row partitions into the actor according to the dictionary with the partition distribution from the previous step. When data is passed to the actor, the row partitions are automatically materialized (``ray.ObjectRef`` -> ``pandas.DataFrame``). 6. The ``train`` method of each :py:class:`~modin.experimental.xgboost.xgboost_ray.ModinXGBoostActor` class object is called remotely. This method runs XGBoost training on the actor's local data, connects to ``Rabit Tracker`` for sharing training state between actors and returns a dictionary with `booster` and `evaluation results`.
7. At the final stage results from actors are returned. `booster` and `evals_result` are returned using ``ray.get`` function from remote actor. Prediction ********** 1. The data is passed to :py:func:`~modin.experimental.xgboost.xgboost_ray._predict` function as a :py:class:`~modin.experimental.xgboost.DMatrix` object. 2. :py:func:`~modin.experimental.xgboost.xgboost_ray._map_predict` function is applied remotely for each partition of the data to make a partial prediction. 3. Result ``modin.pandas.DataFrame`` is created from ``ray.ObjectRef`` objects, obtained in the previous step. Internal API '''''''''''' .. autoclass:: modin.experimental.xgboost.xgboost_ray.ModinXGBoostActor :members: :private-members: .. autofunction:: modin.experimental.xgboost.xgboost_ray._assign_row_partitions_to_actors .. autofunction:: modin.experimental.xgboost.xgboost_ray._train .. autofunction:: modin.experimental.xgboost.xgboost_ray._predict .. autofunction:: modin.experimental.xgboost.xgboost_ray._get_num_actors .. autofunction:: modin.experimental.xgboost.xgboost_ray._split_data_across_actors .. autofunction:: modin.experimental.xgboost.xgboost_ray._map_predict ================================================ FILE: docs/flow/modin/pandas/base.rst ================================================ Base pandas Dataset API """"""""""""""""""""""" The class implements functionality that is common to Modin's pandas API for both ``DataFrame`` and ``Series`` classes. Public API ---------- .. autoclass:: modin.pandas.base.BasePandasDataset :noindex: :members: ================================================ FILE: docs/flow/modin/pandas/dataframe.rst ================================================ :orphan: DataFrame Module Overview """"""""""""""""""""""""" Modin's ``pandas.DataFrame`` API '''''''''''''''''''''''''''''''' Modin's ``pandas.DataFrame`` API is backed by a distributed object providing an identical API to pandas. 
After the user calls some ``DataFrame`` function, this call is internally rewritten into a representation that can be processed in parallel by the partitions. These results can then be, e.g., reduced to a single output, identical to the output of the single-threaded pandas ``DataFrame`` method. .. TODO: add link to the docs with detailed description of queries compilation and execution after DOCS-#2996 is merged. Public API ---------- .. autoclass:: modin.pandas.dataframe.DataFrame Usage Guide ''''''''''' The most efficient way to create a Modin ``DataFrame`` is to import data from external storage using the highly efficient Modin IO methods (for example using ``pd.read_csv``, see details for Modin IO methods in the :doc:`IO ` page), but even if the data does not originate from a file, any pandas supported data type or ``pandas.DataFrame`` can be used. Internally, the ``DataFrame`` data is divided into partitions, whose number along an axis usually corresponds to the number of the user's hardware CPUs. If needed, the number of partitions can be changed by setting ``modin.config.NPartitions``. Let's consider a simple example of creating and interacting with a Modin ``DataFrame``: .. code-block:: python import modin.config # This explicitly sets the number of partitions modin.config.NPartitions.put(4) import modin.pandas as pd import pandas # Create Modin DataFrame from the external file pd_dataframe = pd.read_csv("test_data.csv") # Create Modin DataFrame from the python object # data = {f'col{x}': [f'col{x}_{y}' for y in range(100, 356)] for x in range(4)} # pd_dataframe = pd.DataFrame(data) # Create Modin DataFrame from the pandas object # pd_dataframe = pd.DataFrame(pandas.DataFrame(data)) # Show created DataFrame print(pd_dataframe) # List DataFrame partitions. Note, that internal API is intended for # developers needs and was used here for presentation purposes # only.
partitions = pd_dataframe._query_compiler._modin_frame._partitions print(partitions) # Show the first DataFrame partition print(partitions[0][0].get()) Output: # created DataFrame col0 col1 col2 col3 0 col0_100 col1_100 col2_100 col3_100 1 col0_101 col1_101 col2_101 col3_101 2 col0_102 col1_102 col2_102 col3_102 3 col0_103 col1_103 col2_103 col3_103 4 col0_104 col1_104 col2_104 col3_104 .. ... ... ... ... 251 col0_351 col1_351 col2_351 col3_351 252 col0_352 col1_352 col2_352 col3_352 253 col0_353 col1_353 col2_353 col3_353 254 col0_354 col1_354 col2_354 col3_354 255 col0_355 col1_355 col2_355 col3_355 [256 rows x 4 columns] # List of DataFrame partitions [[] [] [] []] # The first DataFrame partition col0 col1 col2 col3 0 col0_100 col1_100 col2_100 col3_100 1 col0_101 col1_101 col2_101 col3_101 2 col0_102 col1_102 col2_102 col3_102 3 col0_103 col1_103 col2_103 col3_103 4 col0_104 col1_104 col2_104 col3_104 .. ... ... ... ... 60 col0_160 col1_160 col2_160 col3_160 61 col0_161 col1_161 col2_161 col3_161 62 col0_162 col1_162 col2_162 col3_162 63 col0_163 col1_163 col2_163 col3_163 64 col0_164 col1_164 col2_164 col3_164 [65 rows x 4 columns] As we show in the example above, Modin ``DataFrame`` can be easily created, and supports any input that pandas ``DataFrame`` supports. Also note that tuning of the ``DataFrame`` partitioning can be done by just setting a single config. ================================================ FILE: docs/flow/modin/pandas/series.rst ================================================ :orphan: Series Module Overview """""""""""""""""""""" Modin's ``pandas.Series`` API ''''''''''''''''''''''''''''' Modin's ``pandas.Series`` API is backed by a distributed object providing an identical API to pandas. After the user calls some ``Series`` function, this call is internally rewritten into a representation that can be processed in parallel by the partitions. 
These results can then be, e.g., reduced to a single output, identical to the output of the single-threaded pandas ``Series`` method. .. TODO: add link to the docs with detailed description of queries compilation and execution after DOCS-#2996 is merged. Public API ---------- .. autoclass:: modin.pandas.series.Series Usage Guide ''''''''''' The most efficient way to create a Modin ``Series`` is to import data from external storage using the highly efficient Modin IO methods (for example using ``pd.read_csv``, see details for Modin IO methods in the :doc:`IO ` page), but even if the data does not originate from a file, any pandas supported data type or ``pandas.Series`` can be used. Internally, the ``Series`` data is divided into partitions, whose number along an axis usually corresponds to the number of the user's hardware CPUs. If needed, the number of partitions can be changed by setting ``modin.config.NPartitions``. Let's consider a simple example of creating and interacting with a Modin ``Series``: .. code-block:: python import modin.config # This explicitly sets the number of partitions modin.config.NPartitions.put(4) import modin.pandas as pd import pandas # Create Modin Series from the external file pd_series = pd.read_csv("test_data.csv", header=None).squeeze() # Create Modin Series from the python object # pd_series = pd.Series([x for x in range(256)]) # Create Modin Series from the pandas object # pd_series = pd.Series(pandas.Series([x for x in range(256)])) # Show created `Series` print(pd_series) # List `Series` partitions. Note, that internal API is intended for # developers needs and was used here for presentation purposes # only. partitions = pd_series._query_compiler._modin_frame._partitions print(partitions) # Show the first `Series` partition print(partitions[0][0].get()) Output: # created `Series` 0 100 1 101 2 102 3 103 4 104 ...
251 351 252 352 253 353 254 354 255 355 Name: 0, Length: 256, dtype: int64 # List of `Series` partitions [[] [] [] []] # The first `Series` partition 0 0 100 1 101 2 102 3 103 4 104 .. ... 60 160 61 161 62 162 63 163 64 164 [65 rows x 1 columns] As we show in the example above, Modin ``Series`` can be easily created, and supports any input that pandas ``Series`` supports. Also note that tuning of the ``Series`` partitioning can be done by just setting a single config. ================================================ FILE: docs/flow/modin/utils.rst ================================================ :orphan: Modin Utils """"""""""" Here are utilities that can be useful when working with Modin. Public API '''''''''' .. autofunction:: modin.utils.try_cast_to_pandas .. autofunction:: modin.utils.execute ================================================ FILE: docs/getting_started/examples.rst ================================================ Examples and Resources ====================== Here you can find additional resources to learn about Modin. To learn more about advanced usage for Modin, please refer to the :doc:`Usage Guide ` section. Usage Examples '''''''''''''' The following notebooks demonstrate how Modin can be used for scalable data science: - Quickstart Guide to Modin [`Source `__] - Using Modin with the NYC Taxi Dataset [`Source `__] - Modin for Machine Learning with scikit-learn [`Source `__] Tutorials ''''''''' The following tutorials cover the basic usage of Modin. `Here `__ is a one hour video tutorial that walks through these basic exercises.
- Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] The following tutorials cover more advanced features in Modin: - Exercise 4: Experimental Features in Modin (Spreadsheet, Progress Bar) [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__] - Exercise 6: Running Modin in a Cluster Environment [`Source PandasOnRay `__] To learn how to get the required dependencies for the tutorial notebooks and how to run them, please refer to the respective `README.md `__ file. Talks & Podcasts '''''''''''''''' - `Scaling Interactive Data Science with Modin and Ray `_ (20 minute, Ray Summit 2021) - `Unleash The Power Of Dataframes At Any Scale With Modin `_ (40 minute, Python Podcast 2021) - `[Russian] Distributed Data Processing and XGBoost Training and Prediction with Modin `_ (30 minute, PyCon Russia 2021) - `[Russian] Efficient Data Science with Modin `_ (30 minute, ISP RAS Open 2021) - `Modin: Scaling the Capabilities of the Data Scientist, not the Machine `_ (1 hour, RISE Camp 2020) - `Modin: Pandas Scalability with Devin Petersohn `_ (1 hour, Software Engineering Daily Podcast 2020) - `Introduction to the DataFrame and Modin `_ (20 minute, RISECamp 2019) - `Scaling Interactive Pandas Workflows with Modin `_ (40 minute, PyData NYC 2018) Community contributions ''''''''''''''''''''''' Here are some blogposts and articles about Modin: - `Anaconda Blog: Scale your pandas workflow with Modin by Vasilij Litvinov `_ - `The Modin view of Scaling Pandas by Devin Petersohn `_ - `Data Science at Scale with Modin by Areg Melik-Adamyan `_ - `Speed up Pandas using Modin by Eric D. Brown, D.Sc.
`_ - `Explore Python Libraries: Make Your DataFrames Parallel With Modin by Zachary Bennett `_ - `Get faster pandas with Modin, even on your laptops by Parul Pandey `_ - `How to speedup pandas by changing one line of code by Shrivarsheni `_ - `How To Accelerate Pandas With Just One Line Of Code by Analytics India `_ - `An Easy Introduction to Modin: A Step-by-Step Guide to Accelerating Pandas by Intel `_ Here are some articles contributed by the international community: - `[Chinese] 用 Modin 来提速 pandas 工作流程 by Python Chinese Community `_ - `[German] Was ist Modin? by Dipl.-Ing. (FH) Stefan Luber `_ - `[Russian] Ускоряем Pandas при помощи модуля modin by Разработка `_ - `[Korean] modin 으로 pandas 더 빠르게 사용하기 by 분석뉴비 `_ If you would like your articles to be featured here, please `submit a pull request `_ to let us know! ================================================ FILE: docs/getting_started/faq.rst ================================================ Frequently Asked Questions (FAQs) ================================= Below, you will find answers to the most commonly asked questions about Modin. If you still cannot find the answer you are looking for, please post your question on the #support channel on our Slack_ community or open a Github issue_. FAQs: Why choose Modin? ----------------------- What’s wrong with pandas and why should I use Modin? """""""""""""""""""""""""""""""""""""""""""""""""""" While pandas works extremely well on small datasets, as soon as you start working with medium to large datasets that are more than a few GBs, pandas can become painfully slow or run out of memory. This is because pandas is single-threaded. In other words, you can only process your data with one core at a time. This approach does not scale to larger data sets and adding more hardware does not lead to more performance gain. The :py:class:`~modin.pandas.dataframe.DataFrame` is a highly scalable, parallel DataFrame. 
Modin transparently distributes the data and computation so that you can continue using the same pandas API while being able to work with more data faster. Modin lets you use all the CPU cores on your machine, and because it is lightweight, it often has less memory overhead than pandas. See :doc:` Why Modin? ` page to learn more about how Modin is different from pandas. Why not just improve pandas? """""""""""""""""""""""""""" pandas is a massive community and well established codebase. Many of the issues we have identified and resolved with pandas are fundamental to its current implementation. While we would be happy to donate parts of Modin that make sense in pandas, many of these components would require significant (or total) redesign of the pandas architecture. Modin's architecture goes beyond pandas, which is why the pandas API is just a thin layer at the user level. To learn more about Modin's architecture, see the :doc:`architecture ` documentation. How much faster can I go with Modin compared to pandas? """"""""""""""""""""""""""""""""""""""""""""""""""""""" Modin is designed to scale with the amount of hardware available. Even in a traditionally serial task like ``read_csv``, we see large gains by efficiently distributing the work across your entire machine. Because it is so light-weight, Modin provides speed-ups of up to 4x on a laptop with 4 physical cores. This speedup scales efficiently to larger machines with more cores. We have several published papers_ that include performance results and comparisons against pandas. How much more data would I be able to process with Modin? """"""""""""""""""""""""""""""""""""""""""""""""""""""""" Often data scientists have to use different tools for operating on datasets of different sizes. This is not only because processing large dataframes is slow, but also pandas does not support working with dataframes that don't fit into the available memory. 
As a result, pandas workflows that work well for prototyping on a few MBs of data do not scale to tens or hundreds of GBs (depending on the size of your machine). Modin supports operating on data that does not fit in memory, so that you can comfortably work with hundreds of GBs without worrying about substantial slowdown or memory errors. For more information, see :doc:`out-of-memory support ` for Modin. How does Modin compare to Dask DataFrame and Koalas? """""""""""""""""""""""""""""""""""""""""""""""""""" TLDR: Modin has better coverage of the pandas API, has a flexible backend, better ordering semantics, and supports both row and column-parallel operations. Check out :doc:`Modin vs Dask vs Koalas ` page detailing the differences! How does Modin work under the hood? """"""""""""""""""""""""""""""""""" Modin is logically separated into different layers that represent the hierarchy of a typical Database Management System. User queries which perform data transformation, data ingress or data egress pass through the Modin Query Compiler which translates queries from the top-level pandas API Layer that users interact with to the Modin Core Dataframe layer. The Modin Core DataFrame is our efficient DataFrame implementation that utilizes a partitioning schema which allows for distributing tasks and queries. From here, the Modin DataFrame works with engines like Ray, Dask or Unidist to execute computation, and then return the results to the user. For more details, take a look at our system :doc:`architecture `. FAQs: How to use Modin? ----------------------- If I’m only using my laptop, can I still get the benefits of Modin? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Absolutely! Unlike other parallel DataFrame systems, Modin is an extremely light-weight, robust DataFrame. Because it is so light-weight, Modin provides speed-ups of up to 4x on a laptop with 4 physical cores and allows you to work on data that doesn't fit in your laptop's RAM. 
How do I use Jupyter or Colab notebooks with Modin? """"""""""""""""""""""""""""""""""""""""""""""""""" You can take a look at this Google Colab installation guide_ and this notebook tutorial_. Once Modin is installed, simply replace your pandas import with Modin import: .. code-block:: python # import pandas as pd import modin.pandas as pd Which execution engine (Ray, Dask or Unidist) should I use for Modin? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Modin lets you effortlessly speed up your pandas workflows with either Ray_'s, Dask_'s or Unidist_'s execution engine. You don't need to know anything about either engine in order to use it with Modin. If you only have one engine installed, Modin will automatically detect which engine you have installed and use that for scheduling computation. If you don't have a preference, we recommend starting with Modin's default Ray engine. If you want to use a specific compute engine, you can set the environment variable ``MODIN_ENGINE`` and Modin will do computation with that engine: .. code-block:: bash pip install "modin[ray]" # Install Modin dependencies and Ray to run on Ray export MODIN_ENGINE=ray # Modin will use Ray pip install "modin[dask]" # Install Modin dependencies and Dask to run on Dask export MODIN_ENGINE=dask # Modin will use Dask pip install "modin[mpi]" # Install Modin dependencies and MPI to run on MPI through unidist. export MODIN_ENGINE=unidist # Modin will use Unidist export UNIDIST_BACKEND=mpi # Unidist will use MPI backend. This can also be done with: .. code-block:: python import modin.config as modin_cfg import unidist.config as unidist_cfg modin_cfg.Engine.put("ray") # Modin will use Ray modin_cfg.Engine.put("dask") # Modin will use Dask modin_cfg.Engine.put('unidist') # Modin will use Unidist unidist_cfg.Backend.put('mpi') # Unidist will use MPI backend We plan to support more execution engines in future. 
If you have a specific request, please post on the #feature-requests channel on our Slack_ community. How do I connect Modin to a database via `read_sql`? """""""""""""""""""""""""""""""""""""""""""""""""""" To read from a SQL database, you have two options: 1) Pass a connection string, e.g. ``postgresql://reader:NWDMCE5xdipIjRrp@hh-pgsql-public.ebi.ac.uk:5432/pfmegrnargs`` 2) Pass an open database connection, e.g. for psycopg2, ``psycopg2.connect("dbname=pfmegrnargs user=reader password=NWDMCE5xdipIjRrp host=hh-pgsql-public.ebi.ac.uk")`` The first option works with both Modin and pandas. If you try the second option in Modin, Modin will default to pandas because open database connections cannot be pickled. Pickling is required to send connection details to remote workers. To handle the unique requirements of distributed database access, Modin has a distributed database connection called ``ModinDatabaseConnection``: .. code-block:: python import modin.pandas as pd from modin.db_conn import ModinDatabaseConnection con = ModinDatabaseConnection( 'psycopg2', host='hh-pgsql-public.ebi.ac.uk', dbname='pfmegrnargs', user='reader', password='NWDMCE5xdipIjRrp') df = pd.read_sql("SELECT * FROM rnc_database", con, index_col=None, coerce_float=True, params=None, parse_dates=None, chunksize=None) The ``ModinDatabaseConnection`` will save any arguments you supply it and forward them to the workers to make their own connections. How can I contribute to Modin? """""""""""""""""""""""""""""" **Modin is currently under active development. Requests and contributions are welcome!** If you are interested in contributing please check out the :doc:`Contributing Guide` and then refer to the :doc:`Development Documentation`, where you can find system architecture, internal implementation details, and other useful information. Also check out the `Github`_ to view open issues and make contributions. .. _issue: https://github.com/modin-project/modin/issues .. 
_Slack: https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA .. _Github: https://github.com/modin-project/modin .. _Ray: https://github.com/ray-project/ray/ .. _Dask: https://github.com/dask/dask .. _Unidist: https://github.com/modin-project/unidist .. _papers: https://people.eecs.berkeley.edu/~totemtang/paper/Modin.pdf .. _guide: https://modin.readthedocs.io/en/latest/getting_started/installation.html#installing-on-google-colab .. _tutorial: https://github.com/modin-project/modin/tree/main/examples/tutorial ================================================ FILE: docs/getting_started/installation.rst ================================================ ============= Installation ============= .. note:: | *Estimated Reading Time: 15 minutes* | If you already installed Modin on your machine, you can skip this section. There are several ways to install Modin. Most users will want to install with ``pip`` or using ``conda`` tool, but some users may want to build from the main branch on the `GitHub repo`_. The main branch has the most recent patches, but may be less stable than a release installed from ``pip`` or ``conda``. Installing with pip ------------------- Stable version """""""""""""" Modin can be installed with ``pip`` on Linux, Windows and MacOS. To install the most recent stable release run the following: .. code-block:: bash pip install -U modin # -U for upgrade in case you have an older version Modin can be used with :doc:`Ray`, :doc:`Dask`, :doc:`Unidist` engines. If you don't have Ray_, Dask_ or Unidist_ installed, you will need to install Modin with one of the targets: .. 
code-block:: bash pip install "modin[ray]" # Install Modin dependencies and Ray to run on Ray pip install "modin[dask]" # Install Modin dependencies and Dask to run on Dask pip install "modin[mpi]" # Install Modin dependencies and MPI to run on MPI through unidist pip install "modin[all]" # Install Ray and Dask To get Modin on MPI through unidist (as of unidist 0.5.0) fully working it is required to have a working MPI implementation installed beforehand. Otherwise, installation of ``modin[mpi]`` may fail. Refer to `Installing with pip`_ section of the unidist documentation for more details about installation. **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: ``ray`` instead of ``ray[default]``. This means that the dashboard and cluster launcher are no longer installed by default. If you need those, consider installing ``ray[default]`` along with ``modin[ray]``. Modin will automatically detect which engine you have installed and use that for scheduling computation! Release candidates """""""""""""""""" Before most major releases, we will upload a release candidate to test and check if there are any problems. If you would like to install a pre-release of Modin, run the following: .. code-block:: bash pip install --pre modin These pre-releases are uploaded for dependencies and users to test their existing code to ensure that it still works. If you find something wrong, please raise an issue_ or email the bug reporter: bug_reports@modin.org. Installing specific dependency sets """"""""""""""""""""""""""""""""""" Modin has a number of specific dependency sets for running Modin on different execution engines and storage formats or for different functionalities of Modin. Here is a list of dependency sets for Modin: .. code-block:: bash pip install "modin[ray]" # If you want to use the Ray execution engine .. code-block:: bash pip install "modin[dask]" # If you want to use the Dask execution engine .. 
code-block:: bash pip install "modin[mpi]" # If you want to use MPI through unidist execution engine Consortium Standard-compatible implementation based on Modin """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" .. code-block:: bash pip install "modin[consortium-standard]" Installing on Google Colab """"""""""""""""""""""""""" Modin can be used with Google Colab_ via the ``pip`` command, by running the following code in a new cell: .. code-block:: bash !pip install "modin[all]" Since Colab preloads several of Modin's dependencies by default, we need to restart the Colab environment once Modin is installed by either clicking on the :code:`"RESTART RUNTIME"` button in the installation output or by running the following code: .. code-block:: python # Post-install automatically kill and restart Colab environment import os os.kill(os.getpid(), 9) Once you have restarted the Colab environment, you can use Modin in Colab in subsequent sessions. Note that on the free version of Colab, there is a `limit on the compute resource `_. To leverage the full power of Modin, you may have to upgrade to Colab Pro to get access to more compute resources. Installing with conda --------------------- Using conda-forge channel """"""""""""""""""""""""" Modin releases can be installed using ``conda`` from conda-forge channel. Starting from 0.10.1 it is possible to install modin with chosen engine(s) alongside. 
Current options are: +---------------------------------+---------------------------+-----------------------------+ | **Package name in conda-forge** | **Engine(s)** | **Supported OSs** | +---------------------------------+---------------------------+-----------------------------+ | modin | Dask_ | Linux, Windows, MacOS | +---------------------------------+---------------------------+-----------------------------+ | modin-dask | Dask | Linux, Windows, MacOS | +---------------------------------+---------------------------+-----------------------------+ | modin-ray | Ray_ | Linux, Windows | +---------------------------------+---------------------------+-----------------------------+ | modin-mpi | MPI_ through unidist_ | Linux, Windows, MacOS | +---------------------------------+---------------------------+-----------------------------+ | modin-all | Dask, Ray, Unidist | Linux | +---------------------------------+---------------------------+-----------------------------+ **Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: ``ray-core`` instead of ``ray-default``. This means that the dashboard and cluster launcher are no longer installed by default. If you need those, consider installing ``ray-default`` along with ``modin-ray``. To install the Dask, Ray and MPI through unidist engines into a conda environment, the following command should be used: .. code-block:: bash conda install -c conda-forge modin-ray modin-dask modin-mpi The full set of engines can be made available in a conda environment by specifying: .. code-block:: bash conda install -c conda-forge modin-all or explicitly: .. code-block:: bash conda install -c conda-forge modin-ray modin-dask modin-mpi Refer to `Installing with conda`_ section of the unidist documentation for more details on how to install a specific MPI implementation to run on. ``conda`` may be slow installing ``modin-all`` or combinations of execution engines so we currently recommend using libmamba solver for the installation process. 
To do this, install it in a base environment: .. code-block:: bash conda install -n base conda-libmamba-solver Then it can be used during installation either like .. code-block:: bash conda install -c conda-forge modin-ray modin- --experimental-solver=libmamba or starting from conda 22.11 and libmamba solver 22.12 versions .. code-block:: bash conda install -c conda-forge modin-ray --solver=libmamba Installing from the GitHub main branch -------------------------------------- If you'd like to try Modin using the most recent updates from the main branch, you can also use ``pip``. .. code-block:: bash pip install "modin[all] @ git+https://github.com/modin-project/modin" This will install directly from the repo without you having to manually clone it! Please be aware that these changes have not made it into a release and may not be completely stable. If you would like to install Modin with a specific engine, you can use ``modin[ray]`` or ``modin[dask]`` or ``modin[mpi]`` instead of ``modin[all]`` in the command above. Windows ------- All Modin engines are available both on Windows and Linux as mentioned above. Default engine on Windows is :doc:`Ray`. It is also possible to use Windows Subsystem For Linux (WSL_), but this is generally not recommended due to the limitations and poor performance of Ray on WSL, which is roughly 2-3x worse than on native Windows. Building Modin from Source -------------------------- If you're planning on :doc:`contributing ` to Modin, you will need to ensure that you are building Modin from the local repository that you are working off of. Occasionally, there are issues in overlapping Modin installs from pypi and from source. To avoid these issues, we recommend uninstalling Modin before you install from source: .. code-block:: bash pip uninstall modin To build from source, you first must clone the repo. We recommend forking the repository first through the GitHub interface, then cloning as follows: .. 
code-block:: bash git clone https://github.com//modin.git Once cloned, ``cd`` into the ``modin`` directory and use ``pip`` to install: .. code-block:: bash cd modin pip install -e . pip install -e ".[all]" # will install dependencies for all engines .. _`GitHub repo`: https://github.com/modin-project/modin/tree/main .. _issue: https://github.com/modin-project/modin/issues .. _WSL: https://docs.microsoft.com/en-us/windows/wsl/install-win10 .. _Ray: http://ray.readthedocs.io .. _Dask: https://github.com/dask/dask .. _MPI: https://www.mpi-forum.org/ .. _Unidist: https://github.com/modin-project/unidist .. _`Installing with pip`: https://unidist.readthedocs.io/en/latest/installation.html#installing-with-pip .. _`Installing with conda`: https://unidist.readthedocs.io/en/latest/installation.html#installing-with-conda .. _`Intel Distribution of Modin`: https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-of-modin.html#gs.86stqv .. _`Intel Distribution of Modin Getting Started`: https://www.intel.com/content/www/us/en/developer/articles/technical/intel-distribution-of-modin-getting-started-guide.html .. |reg| unicode:: U+000AE .. REGISTERED SIGN .. _Colab: https://colab.research.google.com/ ================================================ FILE: docs/getting_started/quickstart.rst ================================================ Getting Started =============== .. note:: | *Estimated Reading Time: 10 minutes* | You can follow along this tutorial in a Jupyter notebook `here `_. .. toctree:: :hidden: :maxdepth: 4 10-min Quickstart Guide installation using_modin/using_modin why_modin/why_modin examples faq troubleshooting .. meta:: :description lang=en: Introduction to Modin. Quick Start Guide ----------------- To install the most recent stable release for Modin run the following: .. 
code-block:: bash pip install "modin[all]" For further instructions on how to install Modin with conda or for specific platforms or engines, see our detailed `installation guide <../getting_started/installation.html>`_. Modin acts as a drop-in replacement for pandas so you simply have to replace the import of pandas with the import of Modin as follows to speed up your pandas workflows: .. code-block:: bash # import pandas as pd import modin.pandas as pd Example: Instant Scalability with No Extra Effort ------------------------------------------------- When working on large datasets, pandas becomes painfully slow or :doc:`runs out of memory`. Modin automatically scales up your pandas workflows by parallelizing the dataframe operations, so that you can more effectively leverage the compute resources available. For the purpose of demonstration, we will load in modin as ``pd`` and pandas as ``pandas``. .. code-block:: python import modin.pandas as pd import pandas ############################################# ### For the purpose of timing comparisons ### ############################################# import time import ray # Look at the Ray documentation with respect to the Ray configuration suited to you most. ray.init() ############################################# In this toy example, we look at the NYC taxi dataset, which is around 200MB in size. You can download `this dataset `_ to run the example locally. .. code-block:: python # This may take a few minutes to download import urllib.request dataset_url = "https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv" urllib.request.urlretrieve(dataset_url, "taxi.csv") Faster Data Loading with ``read_csv`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
code-block:: python start = time.time() pandas_df = pandas.read_csv(dataset_url, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"], quoting=3) end = time.time() pandas_duration = end - start print("Time to read with pandas: {} seconds".format(round(pandas_duration, 3))) By running the same command ``read_csv`` with Modin, we generally get around 4X speedup for loading in the data in parallel. .. code-block:: python start = time.time() modin_df = pd.read_csv(dataset_url, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"], quoting=3) end = time.time() modin_duration = end - start print("Time to read with Modin: {} seconds".format(round(modin_duration, 3))) print("Modin is {}x faster than pandas at `read_csv`!".format(round(pandas_duration / modin_duration, 2))) Faster ``concat`` across multiple dataframes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Our previous ``read_csv`` example operated on a relatively small dataframe. In the following example, we duplicate the same taxi dataset 100 times and then concatenate them together, resulting in a dataset around 19GB in size. .. code-block:: python start = time.time() big_pandas_df = pandas.concat([pandas_df for _ in range(25)]) end = time.time() pandas_duration = end - start print("Time to concat with pandas: {} seconds".format(round(pandas_duration, 3))) .. code-block:: python start = time.time() big_modin_df = pd.concat([modin_df for _ in range(25)]) end = time.time() modin_duration = end - start print("Time to concat with Modin: {} seconds".format(round(modin_duration, 3))) print("Modin is {}x faster than pandas at `concat`!".format(round(pandas_duration / modin_duration, 2))) Modin speeds up the ``concat`` operation by more than 60X, taking less than a second to create the large dataframe, while pandas took close to a minute. 
Faster ``apply`` over a single column ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The performance benefits of Modin become apparent when we operate on large gigabyte-scale datasets. Let's say we want to round up values across a single column via the ``apply`` operation. .. code-block:: python start = time.time() rounded_trip_distance_pandas = big_pandas_df["trip_distance"].apply(round) end = time.time() pandas_duration = end - start print("Time to apply with pandas: {} seconds".format(round(pandas_duration, 3))) .. code-block:: python start = time.time() rounded_trip_distance_modin = big_modin_df["trip_distance"].apply(round) end = time.time() modin_duration = end - start print("Time to apply with Modin: {} seconds".format(round(modin_duration, 3))) print("Modin is {}x faster than pandas at `apply` on one column!".format(round(pandas_duration / modin_duration, 2))) Modin is more than 30X faster at applying a single column of data, operating on 130+ million rows in a second. In short, Modin provides orders of magnitude speed up over pandas for a variety of operations out of the box. .. figure:: ../img/quickstart_speedup.svg :align: center Summary ------- Hopefully, this tutorial demonstrated how Modin delivers significant speedup on pandas operations without the need for any extra effort. Throughout the example, we moved from working with 100MBs of data to 20GBs of data all without having to change anything or manually optimize our code to achieve the level of scalable performance that Modin provides. Note that in this quickstart example, we've only shown ``read_csv``, ``concat``, ``apply``, but these are not the only pandas operations that Modin optimizes for. In fact, Modin covers `more than 90\% of the pandas API `_, yielding considerable speedups for many common operations. 
================================================ FILE: docs/getting_started/troubleshooting.rst ================================================ Troubleshooting =============== We hope your experience with Modin is bug-free, but there are some quirks about Modin that may require troubleshooting. If you are still having issues, please post on the #support channel on our Slack_ community or open a Github issue_. Frequently encountered issues ----------------------------- This is a list of the most frequently encountered issues when using Modin. Some of these are working as intended, while others are known bugs that are being actively worked on. Warning during execution: ``defaulting to pandas`` """""""""""""""""""""""""""""""""""""""""""""""""" Please note, that while Modin covers a large portion of the pandas API, not all functionality is implemented. For methods that are not yet implemented, such as ``asfreq``, you may see the following: .. code-block:: text UserWarning: `DataFrame.asfreq` defaulting to pandas implementation. To understand which functions will lead to this warning, we have compiled a list of :doc:`currently supported methods `. When you see this warning, Modin defaults to pandas by converting the Modin dataframe to pandas to perform the operation. Once the operation is complete in pandas, it is converted back to a Modin dataframe. These operations will have a high overhead due to the communication involved and will take longer than pandas. When this is happening, a warning will be given to the user to inform them that this operation will take longer than usual. You can learn more about this :doc:`here `. If you would like to request a particular method be implemented, feel free to open an `issue`_. Before you open an issue please make sure that someone else has not already requested that functionality. Hanging on ``import modin.pandas as pd`` """""""""""""""""""""""""""""""""""""""" This can happen when Ray fails to start. 
It will keep retrying, but often it is faster to just restart the notebook or interpreter. Generally, this should not happen. Most commonly this is encountered when starting multiple notebooks or interpreters in quick succession. **Solution** Restart your interpreter or notebook kernel. **Avoiding this Error** Avoid starting many Modin notebooks or interpreters in quick succession. Wait 2-3 seconds before starting the next one. Importing heterogeneous data using ``read_csv`` """"""""""""""""""""""""""""""""""""""""""""""" Since Modin's ``read_csv`` imports data in parallel, it is possible for data across partitions to be heterogeneously typed (this can happen when columns contain heterogeneous data, i.e. values in the same column are of different types). An example of how this is handled is shown below. .. code-block:: python import os import pandas import modin.pandas as pd from modin.config import NPartitions NPartitions.put(2) test_filename = "test.csv" # data with heterogeneous values in the first column data = """one,2 3,4 5,6 7,8 9.0,10 """ kwargs = { # names of the columns to set, if `names` parameter is set, # header inffering from the first data row/rows will be disabled "names": ["col1", "col2"], # explicit setting of data type of column/columns with heterogeneous # data will force partitions to read data with correct dtype # "dtype": {"col1": str}, } try : with open(test_filename, "w") as f: f.write(data) pandas_df = pandas.read_csv(test_filename, **kwargs) pd_df = pd.read_csv(test_filename, **kwargs) print(pandas_df) print(pd_df) finally: os.remove(test_filename) Output: pandas_df: col1 col2 0 one 2 1 3 4 2 5 6 3 7 8 4 9.0 10 pd_df: col1 col2 0 one 2 1 3 4 2 5 6 3 7.0 8 4 9.0 10 In this case, ``col1`` of the `DataFrame` read by pandas contains only ``str`` data because the first value ("one") is inferred to have type ``str``, which forces pandas to handle the rest of the values in the column as strings. 
The first Modin partition (the first three rows) handles the data as pandas does, but the second partition (the last two rows) reads the data as floats. This is because ``col1`` in the second partition contains an int and a float, and thus the column type is inferred to be float. As a result, `7` is interpreted as `7.0`, which differs from the pandas output. The above example demonstrates heterogeneous data import with str, int, and float types, but heterogeneous data consisting of other data/parameter combinations can also result in data type mismatches with pandas. **Solution** When heterogeneous data is detected, a warning will be raised. Currently, these discrepancies aren't properly handled by Modin, so to avoid this issue, you need to set the ``dtype`` parameter of ``read_csv`` manually to force the correct data type coercion during data import. Note that to avoid excessive performance degradation, the ``dtype`` parameter should be specified only for columns that may contain heterogeneous data. Specifying the ``dtype`` parameter will work well in most cases. If the file contains a column that should be interpreted as the index (the ``index_col`` parameter is specified) there may still be type discrepancies in the index, since the ``dtype`` parameter is only responsible for data fields. If in the above example, ``kwargs`` was set like so: .. code-block:: python kwargs = { "names": ["col1", "col2"], "dtype": {"col1": str}, "index_col": "col1", } The resulting Modin DataFrame will contain incorrect values - just as if ``dtype`` had not been specified: .. code-block:: python col1 one 2 3 4 5 6 7.0 8 9.0 10 One workaround is to import the data without setting the ``index_col`` parameter, and then set the index column using the ``DataFrame.set_index`` function as shown in the example below: .. 
code-block:: python pd_df = pd.read_csv(filename, dtype=data_dtype, index_col=None) pd_df = pd_df.set_index(index_col_name) pd_df.index.name = None Using Modin with python multiprocessing """"""""""""""""""""""""""""""""""""""" We strongly recommend against using a distributed execution engine (e.g. Ray or Dask) in conjunction with Python multiprocessing because that can lead to undefined behavior. One such example is shown below: .. code-block:: python import modin.pandas as pd # Ray engine is used by default df = pd.DataFrame([1, 2, 3]) def f(arg): return df + arg if __name__ == '__main__': from multiprocessing import Pool with Pool(5) as p: print(p.map(f, [1])) Although this example may work on your machine, we do not recommend it, because the Python multiprocessing library will duplicate Ray clusters, causing both excessive resource usage and conflict over the available resources. Poor performance of the first operation with Modin on Ray engine """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" There might be cases when the first operation with Modin on Ray engine is much slower than the subsequent calls of the operation. That happens because Ray workers may not be fully set up yet to perform computation after initialization of the engine with ``ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})``, which is the default behavior of Modin on Ray engine if Ray has not been initialised yet. Modin intentionally initializes Ray this way to import ``pandas`` in workers once the Python interpreter is started in them, so as to avoid a race condition in Ray between the import thread and the thread executing the code. .. See more details on why we started using ``ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})`` in https://github.com/modin-project/modin/pull/4603. .. 
code-block:: python import time import pandas import numpy as np import ray import modin.pandas as pd import modin.config as cfg # Look at the Ray documentation with respect to the Ray configuration suited to you most. ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}}) pandas_df = pandas.DataFrame( np.random.randint(0, 100, size=(1000000, 13)) ) pandas_df.to_csv("foo.csv", index=False) def read_csv_with_pandas(): start_time = time.time() pandas_df = pandas.read_csv("foo.csv", index_col=0) end_time = time.time() pandas_duration = end_time - start_time print("Time to read_csv with pandas: {} seconds".format(round(pandas_duration, 3))) return pandas_df def read_csv_with_modin(): start_time = time.time() modin_df = pd.read_csv("foo.csv", index_col=0) end_time = time.time() modin_duration = end_time - start_time print("Time to read_csv with Modin: {} seconds".format(round(modin_duration, 3))) return modin_df for i in range(5): pandas_df = read_csv_with_pandas() modin_df = read_csv_with_modin() Time to read_csv with pandas: 0.708 seconds Time to read_csv with Modin: 4.132 seconds Time to read_csv with pandas: 0.735 seconds Time to read_csv with Modin: 0.37 seconds Time to read_csv with pandas: 0.646 seconds Time to read_csv with Modin: 0.377 seconds Time to read_csv with pandas: 0.673 seconds Time to read_csv with Modin: 0.371 seconds Time to read_csv with pandas: 0.672 seconds Time to read_csv with Modin: 0.379 seconds **Solution** So far there is no solution to fix or work around the problem other than not passing a non-empty runtime_env to ``ray.init()``. However, this may lead to another problem regarding a race condition in Ray between the import thread and the thread executing the code. So for now we just highlight the problem in hope of a future fix in Ray itself. Also, it is worth noting that every distributed engine by its nature has a little overhead for the first operation being called, which may be important for microbenchmarks. 
What you likely want to do is warm up worker processes either by excluding the time of the first iteration from your measurements or by executing a simple function in workers to fully set them up. Common errors ------------- Error when using Dask engine: ``RuntimeError: if __name__ == '__main__':`` """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" The following `script.py` uses Modin with Dask as an execution engine and produces errors: .. code-block:: python # script.py import modin.pandas as pd import modin.config as cfg cfg.Engine.put("dask") df = pd.DataFrame([0,1,2,3]) print(df) Part of the errors produced by the script above would be the following: .. code-block:: File "/path/python3.9/multiprocessing/spawn.py", line 134, in _check_not_importing_main raise RuntimeError(''' RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase. This probably means that you are not using fork to start your child processes and you have forgotten to use the proper idiom in the main module: if __name__ == '__main__': freeze_support() ... The "freeze_support()" line can be omitted if the program is not going to be frozen to produce an executable. This happens because Dask Client uses `fork `_ to start processes. **Solution** To avoid the problem the Dask Client creation code needs to be moved into the ``__main__`` scope of the module. The corrected `script.py` would look like: .. code-block:: python # script.py import modin.pandas as pd import modin.config as cfg cfg.Engine.put("dask") if __name__ == "__main__": df = pd.DataFrame([0, 1, 2, 3]) # Dask Client creation is hidden in the first call of Modin functionality. print(df) or .. code-block:: python # script.py from distributed import Client import modin.pandas as pd import modin.config as cfg cfg.Engine.put("dask") if __name__ == "__main__": # Explicit Dask Client creation. 
# Look at the Dask Distributed documentation with respect to the Client configuration suited to you most. client = Client() df = pd.DataFrame([0, 1, 2, 3]) print(df) Spurious error "cannot import partially initialised pandas module" on custom Ray cluster """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" If you're using some pre-configured Ray cluster to run Modin, it's possible you would be seeing spurious errors like .. code-block:: ray.exceptions.RaySystemError: System error: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import) traceback: Traceback (most recent call last): File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/ray/serialization.py", line 340, in deserialize_objects obj = self._deserialize_object(data, metadata, object_ref) File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/ray/serialization.py", line 237, in _deserialize_object return self._deserialize_msgpack_data(data, metadata_fields) File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/ray/serialization.py", line 192, in _deserialize_msgpack_data python_objects = self._deserialize_pickle5_data(pickle5_data) File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/ray/serialization.py", line 180, in _deserialize_pickle5_data obj = pickle.loads(in_band, buffers=buffers) File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/pandas/__init__.py", line 135, in from pandas import api, arrays, errors, io, plotting, testing, tseries File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/pandas/testing.py", line 6, in from pandas._testing import ( File "/usr/share/miniconda/envs/modin/lib/python3.8/site-packages/pandas/_testing/__init__.py", line 979, in cython_table = pd.core.common._cython_table.items() AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import) **Solution** Modin contains 
a workaround that should automatically do ``import pandas`` when a worker process starts. It is triggered by the presence of a non-empty ``__MODIN_AUTOIMPORT_PANDAS__`` environment variable which Modin sets up automatically on the Ray clusters it spawns, but it might be missing on pre-configured clusters. So if you're seeing an issue like the one shown above, please make sure you set this environment variable on all worker nodes of your cluster before actually spawning the workers. .. _issue: https://github.com/modin-project/modin/issues .. _Slack: https://modin.org/slack.html ================================================ FILE: docs/getting_started/using_modin/using_modin.rst ================================================ Using Modin =========== In this section, we show how Modin can be used to accelerate your pandas workflows on a single machine up to multiple machines in a cluster setting. .. toctree:: :maxdepth: 4 using_modin_locally using_modin_cluster ================================================ FILE: docs/getting_started/using_modin/using_modin_cluster.rst ================================================ Using Modin in a Cluster ======================== .. note:: | *Estimated Reading Time: 15 minutes* Often in practice we have a need to exceed the capabilities of a single machine. Modin works and performs well in both local mode and in a cluster environment. The key advantage of Modin is that your python code does not change between local development and cluster execution. Users are not required to think about how many workers exist or how to distribute and partition their data; Modin handles all of this seamlessly and transparently. .. note:: It is possible to use a Jupyter notebook, but you will have to deploy a Jupyter server on the remote cluster head node and connect to it. .. 
image:: ../../img/modin_cluster.png :alt: Modin cluster :align: center Extra requirements for AWS authentication ----------------------------------------- First of all, install the necessary dependencies in your environment: .. code-block:: bash pip install boto3 The next step is to set up your AWS credentials. One can set ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and ``AWS_SESSION_TOKEN`` (Optional) (refer to `AWS CLI environment variables`_ to get more insight on this) or just run the following command: .. code-block:: bash aws configure Starting and connecting to the cluster -------------------------------------- This example starts 1 head node (m5.24xlarge) and 5 worker nodes (m5.24xlarge), 576 total CPUs. You can check the `Amazon EC2 pricing`_ page. It is possible to manually create AWS EC2 instances and configure them or just use the `Ray CLI`_ to create and initialize a Ray cluster on AWS using `Modin's Ray cluster setup config`_, which we are going to utilize in this example. Refer to `Ray's autoscaler options`_ page on how to modify the file. More details on how to launch a Ray cluster can be found on `Ray's cluster docs`_. To start up the Ray cluster, run the following command in your terminal: .. code-block:: bash ray up modin-cluster.yaml Once the head node has completed initialization, you can optionally connect to it by running the following command. .. code-block:: bash ray attach modin-cluster.yaml To exit the ssh session and return back into your local shell session, type: .. code-block:: bash exit Executing in a cluster environment ---------------------------------- .. note:: Be careful when using the `Ray client`_ to connect to a remote cluster. We don't recommend this connection mode, because it may not work. Known bugs: - https://github.com/ray-project/ray/issues/38713, - https://github.com/modin-project/modin/issues/6641. Modin lets you instantly speed up your workflows with large data by scaling pandas on a cluster.
In this tutorial, we will use a 12.5 GB ``big_yellow.csv`` file that was created by concatenating a 200MB `NYC Taxi dataset`_ file 64 times. Preparing this file was provided as part of our `Modin's Ray cluster setup config`_. If you want to use another dataset, you should provide it to each of the cluster nodes with the same path. We recommend doing this by customizing the ``setup_commands`` section of the `Modin's Ray cluster setup config`_. To run any script in a remote cluster, you need to submit it to Ray. In this way, the script file is sent to the remote cluster head node and executed there. In this tutorial, we provide the `exercise_5.py`_ script, which reads the data from the CSV file and executes such pandas operations as count, groupby and map. As a result, you will see the size of the file being read and the execution time of the entire script. You can submit this script to the existing remote cluster by running the following command. .. code-block:: bash ray submit modin-cluster.yaml exercise_5.py To download or upload files to the cluster head node, use ``ray rsync_down`` or ``ray rsync_up``. It may help if you want to use some other Python modules that should be available to execute your own script or download a result file after executing the script. .. code-block:: bash # download a file from the cluster to the local machine: ray rsync_down modin-cluster.yaml '/path/on/cluster' '/local/path' # upload a file from the local machine to the cluster: ray rsync_up modin-cluster.yaml '/local/path' '/path/on/cluster' Shutting down the cluster -------------------------- Now that we have finished the computation, we need to shut down the cluster with the `ray down` command. .. code-block:: bash ray down modin-cluster.yaml .. _`Ray's autoscaler options`: https://docs.ray.io/en/latest/cluster/vms/references/ray-cluster-configuration.html#cluster-config .. _`Ray's cluster docs`: https://docs.ray.io/en/latest/cluster/getting-started.html ..
_`NYC Taxi dataset`: https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv .. _`Modin's Ray cluster setup config`: https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/execution/pandas_on_ray/cluster/modin-cluster.yaml .. _`Amazon EC2 pricing`: https://aws.amazon.com/ec2/pricing/on-demand/ .. _`exercise_5.py`: https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/execution/pandas_on_ray/cluster/exercise_5.py .. _`Ray client`: https://docs.ray.io/en/latest/cluster/running-applications/job-submission/ray-client.html .. _`Ray CLI`: https://docs.ray.io/en/latest/cluster/vms/getting-started.html#running-applications-on-a-ray-cluster .. _`AWS CLI environment variables`: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html ================================================ FILE: docs/getting_started/using_modin/using_modin_locally.rst ================================================ =================== Using Modin Locally =================== .. note:: | *Estimated Reading Time: 5 minutes* | You can follow along this tutorial in the `Jupyter notebook`_. In our quickstart example, we have already seen how you can achieve considerable speedup from Modin, even on a single machine. Users do not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can **continue using their existing pandas code** while experiencing a considerable speedup from Modin, even on a single machine. To use Modin on a single machine, only a modification of the import statement is needed. Once you've changed your import statement, you're ready to use Modin just like you would pandas, since the API is identical to pandas. .. code-block:: python # import pandas as pd import modin.pandas as pd **That's it. 
You're ready to use Modin on your previous pandas workflows!** Advanced: Configuring the resources Modin uses ---------------------------------------------- Modin automatically checks the number of CPUs available on your machine and sets the number of partitions to be equal to the number of CPUs. You can verify this by running the following code: .. code-block:: python import modin print(modin.config.NPartitions.get()) #prints 16 on a laptop with 16 physical cores Modin fully utilizes the resources on your machine. To read more about how this works, see the :doc:`Why Modin? ` page. Since Modin will use all of the resources available on your machine by default, at times, it is possible that you may like to limit the amount of resources Modin uses to free resources for another task or user. Here is how you would limit the number of CPUs Modin uses through a bash environment variable: .. code-block:: bash export MODIN_CPUS=4 You can also specify this in your python script with ``os.environ``: .. code-block:: python import os os.environ["MODIN_CPUS"] = "4" import modin.pandas as pd If you're using a specific engine and want more control over the environment Modin uses, you can start Ray or Dask in your environment and Modin will connect to it. .. code-block:: python import ray ray.init(num_cpus=4) import modin.pandas as pd Specifying ``num_cpus`` limits the number of processors that Modin uses. You may also specify more processors than you have available on your machine; however this will not improve the performance (and might end up hurting the performance of the system). .. note:: Make sure to update the ``MODIN_CPUS`` configuration and initialize your preferred engine before you start working with the first operation using Modin! Otherwise, Modin will opt for the default setting. ..
_`Jupyter notebook`: https://github.com/modin-project/modin/tree/main/examples/quickstart.ipynb ================================================ FILE: docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst ================================================ Modin vs. Dask DataFrame vs. Koalas =================================== Libraries such as `Dask DataFrame `_ (DaskDF for short) and `Koalas `_ aim to support the pandas API on top of distributed computing frameworks, Dask and Spark respectively. Instead, Modin aims to preserve the pandas API and behavior as is, while abstracting away the details of the distributed computing framework underneath. Thus, the aims of these libraries are fundamentally different. Specifically, Modin enables pandas-like * row and column-parallel operations, unlike DaskDF and Koalas that only support row-parallel operations * indexing & ordering semantics, unlike DaskDF and Koalas that deviate from these semantics * eager execution, unlike DaskDF and Koalas that provide lazy execution As a result, Modin's coverage is `more than 90% `_ of the pandas API, while DaskDF and Koalas' coverage is about 55%. .. figure:: ../../img/api_coverage_comparison.svg :align: center :alt: Percentage coverage of the pandas API after deduplication For more technical details please see our VLDB 2022 research paper, referenced `here `_. Brief Overview of DaskDF and Koalas ----------------------------------- Dask's `DataFrame `_ (DaskDF) is effectively a meta-DataFrame, partitioning and scheduling many smaller ``pandas.DataFrame`` objects. Users construct a task graph of dataframe computation step by step and then trigger computation using the ``compute`` function. Spark's `Koalas `_ provides the pandas API on Spark, leveraging the preexisting Spark SQL optimizer to execute select pandas commands. Like DaskDF, Koalas also employs lazy computation, only triggering computation when the user requests to see the results. 
Partitioning and Parallelization -------------------------------- Modin, DaskDF, Koalas are all examples of parallel dataframe systems. Parallelism is achieved by partitioning a large dataframe into smaller ones that can be operated on in parallel. As a result, the partitioning scheme chosen by the system dictates the pandas functions that can or can not be supported. **DaskDF and Koalas only support row-oriented partitioning and parallelism.** This approach is analogous to relational databases. The dataframe is conceptually broken down into horizontal partitions along rows, where each partition is independently processed if possible. When DaskDF or Koalas are required to perform column-parallel operations that need to be done on columns independently (e.g., dropping columns with null values via ``dropna`` on the column ``axis``), they either perform very poorly with no parallelism or do not support that operation. **Modin supports row, column, and cell-oriented partitioning and parallelism**. That is, the dataframe can be conceptually broken down as groups of rows, groups of columns, or both groups of rows and groups of columns (effectively a block or sub-matrix). Modin will transparently reshape the partitioning as necessary for the corresponding operation, based on whether the operation is row-parallel, column-parallel, or cell-parallel (independently applied to each unit cell). This allows Modin to support more of the pandas API and do so efficiently. Due to the finer-grained control over the partitioning, Modin can support a number of operations that are very challenging to parallelize in row-oriented systems (e.g., ``transpose``, ``median``, ``quantile``). This flexibility in partitioning also gives Modin tremendous power to implement efficient straggler mitigation and improve utilization over the entire cluster.
API Coverage ------------ One of the key benefits of pandas is its versatility, due to the wide array of operations, with more than 600 API operations for data cleaning, feature engineering, data transformation, data summarization, data exploration, and machine learning. However, it is not trivial to develop scalable implementations of each of these operations in a dataframe system. **DaskDF and Koalas only implement about** `55% `_ **of the pandas API**; they do not implement certain APIs that would deviate from the row-wise partitioning approach, or would be inefficient with the row-wise parallelization. For example, Dask does not implement ``iloc``, ``MultiIndex``, ``apply(axis=0)``, ``quantile`` (only approximate quantile is available), ``median``, and more. Given DaskDF's row-oriented architecture, ``iloc``, for example, can technically be implemented, but it would be inefficient, and column-wise operations such as ``apply(axis=0)`` would be impossible to implement. Similarly, Koalas does not implement ``apply(axis=0)`` (it only applies the function per row partition, giving a different result), ``quantile``, ``median`` (only approximate quantile/median is available), ``MultiIndex``, ``combine``, ``compare`` and more. **Modin supports all of the above pandas API functions, as well as others, with** `more than 90% `_ **coverage of the pandas API.** Modin additionally acts as a drop-in replacement for pandas, such that even if the API is not yet supported, it still works by falling back to running vanilla pandas. One of the key features of being a drop-in replacement is that not only will it work for existing code, if a user wishes to go back to running pandas directly, they are not locked in to using Modin and can switch between Modin and pandas at no cost. In other words, scripts and notebooks written in Modin can be converted to and from pandas as the user desires by simply replacing the import statement.
Execution Semantics --------------------- **DaskDF and Koalas make use of lazy evaluation, which means that the computation is delayed until users explicitly evaluate the results.** This mode of evaluation places a lot of optimization responsibility on the user, forcing them to think about when it would be useful to inspect the intermediate results or delay doing so. Specifically, DaskDF's API differs from pandas in that it requires users to explicitly call ``.compute()`` to materialize the result of the computation. Often if that computation corresponds to a long chain of operators, this call can take a very long time to execute. Overall, the need to explicitly trigger computation makes the API less convenient to work with, but gives DaskDF and Koalas the opportunity to perform holistic optimizations over the entire dataflow graph. However, to the best of our knowledge, neither DaskDF nor Koalas actually leverage holistic optimizations. **Modin employs eager evaluation, like pandas.** Eager evaluation is the default mode of operation for data scientists when working with pandas in an interactive environment, such as Jupyter Notebooks. Modin reproduces this familiar behavior by performing all computations eagerly as soon as it is issued, so that users can inspect intermediate results and quickly see the results of their computations without having to wait or explicitly trigger computation. This is especially useful during interactive data analysis, where users often iterate on their dataframe workflows or build up their dataframe queries in an incremental fashion. We also have developed techniques for `opportunistic evaluation `_ that bridges the gap between lazy and eager evaluation that will be incorporated in Modin in the future. Ordering Semantics ------------------ By default, pandas preserves the order of the dataframe, so that users can expect a consistent, ordered view as they are operating on their dataframe. 
**Both DaskDF and Koalas make no guarantees about the order of rows in the DataFrame.** This is because DaskDF sorts the ``index`` for optimization purposes to speed up computations that involve the row index; and as a result, it does not support user-specified order. Likewise, Koalas `does not support ordering `_ by default because it will lead to a performance overhead when operating on distributed datasets. **DaskDF additionally does not support multi-indexing or sorting.** DaskDF sorts the data based on a single set of row labels for fast row lookups, and builds an indexing structure based on these row labels. Data is both logically and physically stored in the same order. As a result, DaskDF does not support a `sort` function. **Modin reproduces the intuitive behavior in pandas where the order of the DataFrame is preserved, and supports multi-indexing.** Enforcing ordering on a parallel dataframe system like Modin requires non-trivial effort that involves decoupling of the logical and physical representation of the data, enabling the order to be lazily kept up-to-date, but eagerly computed based on user needs (See Section 4.2 in `our recent paper `_). Modin abstracts away the physical representation of the data and provides an ordered view that is consistent with user's expectations. Compatibility with Computational Frameworks ------------------------------------------- **DaskDF and Koalas are meant to be run on Dask and Spark respectively.** They are highly tuned to the corresponding frameworks, and cannot be ported to other computational frameworks. **Modin's highly modular design is architected to run on a variety of systems, and support a variety of APIs.** The goal for the extensible design is that users can take the same notebook or script and seamlessly move between different clusters and environments, with Modin being able to support the pandas API on your preexisting infrastructure. 
Currently, Modin supports running on Dask's compute engine in addition to Ray. The modular design makes it easier for developers to add different execution engines or compile to different memory formats. Modin can run on a Dask cluster in the same way that DaskDF can, but they differ in the ways described above. In addition, Modin is continually expanding to support popular data processing APIs (SQL in addition to pandas, among other DSLs for data processing) while leveraging the same underlying execution framework. Modin's flexible architecture also means that as the `pandas API continues to evolve `_, Modin can quickly move towards supporting new versions of the pandas API. .. figure:: ../../img/performance-all-supported.svg :align: center :alt: Scalability of operators supported by Modin and other systems :width: 95% Performance Comparison ---------------------- **On operations supported by all systems, Modin provides substantial speedups.** Thanks to its optimized design, Modin is able to take advantage of multiple cores relative to both Koalas and DaskDF to efficiently execute pandas operations. It is notable that Koalas is often slower than pandas, due to the overhead of Spark. .. figure:: ../../img/performance-not-all-supported.svg :align: center :alt: Scalability of operators supported by Modin but not by other systems **Modin provides substantial speedups even on operators not supported by other systems.** Thanks to its flexible partitioning schemes that enable it to support the vast majority of pandas operations — be it row, column, or cell-oriented — Modin provides benefits on operations such as ``join``, ``median``, and ``infer_types``. While Koalas performs ``join`` slower than pandas, Dask failed to support ``join`` on more than 20M rows, likely due to poor support for `shuffles `_. Details of the benchmark and additional join experiments can be found in `our paper `_. .. _documentation: http://docs.dask.org/en/latest/DataFrame.html#design. ..
_Modin's documentation: https://modin.readthedocs.io/en/latest/development/architecture.html ================================================ FILE: docs/getting_started/why_modin/out_of_core.rst ================================================ Out-of-memory data with Modin ============================= .. note:: | *Estimated Reading Time: 10 minutes* When using pandas, you might run into a memory error if you are working with large datasets that cannot fit in memory or perform certain memory-intensive operations (e.g., joins). Modin solves this problem by spilling over to disk, in other words, it uses your disk as an overflow for memory so that you can work with datasets that are too large to fit in memory. By default, Modin leverages out-of-core methods to handle datasets that don't fit in memory for both Ray and Dask engines. .. note:: Object spilling is disabled in a multi-node Ray cluster by default. To enable object spilling use `Ray instruction `_. Motivating Example: Memory error with pandas -------------------------------------------- pandas makes use of in-memory data structures to store and operate on data, which means that if you have a dataset that is too large to fit in memory, it will cause an error on pandas. As an example, let's create an 80GB DataFrame by appending together 40 different 2GB DataFrames. .. code-block:: python import pandas import numpy as np df = pandas.concat([pandas.DataFrame(np.random.randint(0, 100, size=(2**20, 2**8))) for _ in range(40)]) # Memory Error! When we run this on a laptop with 32GB of RAM, pandas will run out of memory and throw an error (e.g., :code:`MemoryError` , :code:`Killed: 9`). The `pandas documentation `_ has a great section on recommendations for scaling your analysis to these larger datasets. However, this generally involves loading in less data or rewriting your pandas code to process the data in smaller chunks.
Operating on out-of-memory data with Modin ------------------------------------------ In order to work with data that exceeds memory constraints, you can use Modin to handle these large datasets. .. code-block:: python import modin.pandas as pd import numpy as np df = pd.concat([pd.DataFrame(np.random.randint(0, 100, size=(2**20, 2**8))) for _ in range(40)]) # 40x2GB frames -- Working! df.info() Not only does Modin let you work with datasets that are too large to fit in memory, we can perform various operations on them without worrying about memory constraints. Advanced: Configuring out-of-core settings ------------------------------------------ .. why would you want to disable out of core? By default, out-of-core functionality is enabled by the compute engine selected. To disable it, start your preferred compute engine with the appropriate arguments. For example: .. code-block:: python import modin.pandas as pd import ray ray.init(_plasma_directory="/tmp") # setting to disable out of core in Ray df = pd.read_csv("some.csv") If you are using Dask, you have to modify local configuration files. Visit the Dask documentation_ on object spilling for more details. .. _documentation: https://distributed.dask.org/en/latest/worker.html#memory-management ================================================ FILE: docs/getting_started/why_modin/pandas.rst ================================================ How does Modin differ from pandas? ================================== .. note:: | *Estimated Reading Time: 10 minutes* In the earlier tutorials, we have seen how Modin can be used to speed up pandas workflows. Here, we discuss at a high level how Modin works, in particular, how Modin's dataframe implementation differs from pandas. Scalability of implementation ----------------------------- Modin exposes the pandas API through ``modin.pandas``, but it does not inherit the same pitfalls and design decisions that make it difficult to scale.
The pandas implementation is inherently single-threaded. This means that only one of your CPU cores can be utilized at any given time. In a laptop, it would look something like this with pandas: .. image:: /img/pandas_multicore.png :alt: pandas is single threaded! :align: center :scale: 80% However, Modin's implementation enables you to use all of the cores on your machine, or all of the cores in an entire cluster. On a laptop, it will look something like this: .. image:: /img/modin_multicore.png :alt: modin uses all of the cores! :align: center :scale: 80% The additional utilization leads to improved performance, however if you want to scale to an entire cluster, Modin suddenly looks something like this: .. image:: /img/modin_cluster.png :alt: modin works on a cluster too! :align: center :scale: 30% Modin is able to efficiently make use of all of the hardware available to it! Memory usage and immutability ----------------------------- The pandas API contains many cases of "inplace" updates, which are known to be controversial. This is due in part to the way pandas manages memory: the user may think they are saving memory, but pandas is usually copying the data whether an operation was inplace or not. Modin allows for inplace semantics, but the underlying data structures within Modin's implementation are immutable, unlike pandas. This immutability gives Modin the ability to internally chain operators and better manage memory layouts, because they will not be changed. This leads to improvements over pandas in memory usage in many common cases, due to the ability to share common memory blocks among all dataframes. Modin provides the inplace semantics by having a mutable pointer to the immutable internal Modin dataframe. This pointer can change, but the underlying data cannot, so when an inplace update is triggered, Modin will treat it as if it were not inplace and just update the pointer to the resulting Modin dataframe. 
API vs implementation --------------------- It is well known that the pandas API contains many duplicate ways of performing the same operation. Modin instead enforces that any one behavior have one and only one implementation internally. This guarantee enables Modin to focus on and optimize a smaller code footprint while still guaranteeing that it covers the entire pandas API. Modin has an internal algebra, which is roughly 15 operators, narrowed down from the original >200 that exist in pandas. The algebra is grounded in both practical and theoretical work. Learn more in our `VLDB 2020 paper`_. More information about this algebra can be found in the :doc:`architecture ` documentation. .. _VLDB 2020 paper: https://arxiv.org/abs/2001.00888 ================================================ FILE: docs/getting_started/why_modin/why_modin.rst ================================================ Why Modin? ========== In this section, we explain the design and motivation behind Modin and why you should use Modin to scale up your pandas workflows. We first describe the architectural differences between pandas and Modin. Then we describe how Modin can also help resolve out-of-memory issues common to pandas. Finally, we look at the key differences between Modin and other distributed dataframe libraries. .. toctree:: :maxdepth: 4 pandas out_of_core modin_vs_dask_vs_koalas Modin is built on many years of research and development at UC Berkeley. 
For more information on how this works under the hood, check out our publications in this space: - `Flexible Rule-Based Decomposition and Metadata Independence in Modin `_ (VLDB 2021) - `Enhancing the Interactivity of Dataframe Queries by Leveraging Think Time `_ (IEEE Data Eng 2021) - `Dataframe Systems: Theory, Architecture, and Implementation `_ (PhD Dissertation 2021) - `Scaling Data Science does not mean Scaling Machines `_ (CIDR 2021) - `Towards Scalable Dataframe Systems `_ (VLDB 2020) ================================================ FILE: docs/index.rst ================================================ .. image:: img/MODIN_ver2_hrz.png :width: 400px :alt: modin logo :align: center ==== .. toctree:: :hidden: getting_started/quickstart usage_guide/index supported_apis/index development/index ecosystem contact .. raw:: html

To use Modin, replace the pandas import:

.. figure:: img/Modin_Pandas_Import.gif :align: center Scale your pandas workflow by changing a single line of code """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Modin uses Ray_, Dask_ or Unidist_ to provide an effortless way to speed up your pandas notebooks, scripts, and libraries. Unlike other distributed DataFrame libraries, Modin provides seamless integration and compatibility with existing pandas code. Even using the DataFrame constructor is identical. .. code-block:: python import modin.pandas as pd import numpy as np frame_data = np.random.randint(0, 100, size=(2**10, 2**8)) df = pd.DataFrame(frame_data) It is not necessary to know in advance the available hardware resources in order to use Modin. Additionally, it is not necessary to specify how to distribute or place data. Modin acts as a drop-in replacement for pandas, which means that you can continue using your previous pandas notebooks, *unchanged*, while experiencing a considerable speedup thanks to Modin, even on a single machine. Once you've changed your import statement, you’re ready to use Modin just like you would pandas. Installation and choosing your compute engine """"""""""""""""""""""""""""""""""""""""""""" Modin can be installed from PyPI: .. code-block:: bash pip install modin If you don't have Ray_, Dask_ or Unidist_ installed, you will need to install Modin with one of the targets: .. code-block:: bash pip install "modin[ray]" # Install Modin dependencies and Ray to run on Ray pip install "modin[dask]" # Install Modin dependencies and Dask to run on Dask pip install "modin[mpi]" # Install Modin dependencies and MPI to run on MPI through unidist pip install "modin[all]" # Install all of the above Modin will automatically detect which engine you have installed and use that for scheduling computation! If you want to choose a specific compute engine to run on, you can set the environment variable ``MODIN_ENGINE`` and Modin will do computation with that engine: .. 
code-block:: bash export MODIN_ENGINE=ray # Modin will use Ray export MODIN_ENGINE=dask # Modin will use Dask export MODIN_ENGINE=unidist # Modin will use Unidist If you want to choose the Unidist engine, you should set the additional environment variable ``UNIDIST_BACKEND``, because currently Modin only supports MPI through unidist: .. code-block:: bash export UNIDIST_BACKEND=mpi # Unidist will use MPI backend This can also be done within a notebook/interpreter before you import Modin: .. code-block:: python import os os.environ["MODIN_ENGINE"] = "ray" # Modin will use Ray os.environ["MODIN_ENGINE"] = "dask" # Modin will use Dask os.environ["MODIN_ENGINE"] = "unidist" # Modin will use Unidist os.environ["UNIDIST_BACKEND"] = "mpi" # Unidist will use MPI backend import modin.pandas as pd Faster pandas, even on your laptop """""""""""""""""""""""""""""""""" .. image:: img/read_csv_benchmark.png :height: 350px :width: 300px :alt: Plot of read_csv :align: right The ``modin.pandas`` `DataFrame`_ is an extremely light-weight parallel DataFrame. Modin transparently distributes the data and computation so that all you need to do is continue using the pandas API as you were before installing Modin. Unlike other parallel DataFrame systems, Modin is an extremely light-weight, robust DataFrame. Because it is so light-weight, Modin provides speed-ups of up to 4x on a laptop with 4 physical cores. In pandas, you are only able to use one core at a time when you are doing computation of any kind. With Modin, you are able to use all of the CPU cores on your machine. Even in ``read_csv``, we see large gains by efficiently distributing the work across your entire machine. .. code-block:: python import modin.pandas as pd df = pd.read_csv("my_dataset.csv") Modin is a DataFrame for datasets from 1MB to 1TB+ """""""""""""""""""""""""""""""""""""""""""""""""" We have focused heavily on bridging the solutions between DataFrames for small data (e.g. pandas) and large data. 
Often data scientists require different tools for doing the same thing on different sizes of data. The DataFrame solutions that exist for 1MB do not scale to 1TB+, and the overheads of the solutions for 1TB+ are too costly for datasets in the 1KB range. With Modin, because of its light-weight, robust, and scalable nature, you get a fast DataFrame at 1MB and 1TB+. **Modin is currently under active development. Requests and contributions are welcome!** If you are interested in learning more about Modin, please check out the :doc:`Getting Started` guide then refer to the :doc:`Developer Documentation` section, where you can find system architecture, internal implementation details, and other useful information. Also check out the `Github`_ to view open issues and make contributions. .. _Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html .. _Ray: https://github.com/ray-project/ray/ .. _Dask: https://dask.org/ .. _Unidist: https://github.com/modin-project/unidist/ .. _Github: https://github.com/modin-project/modin ================================================ FILE: docs/release-procedure.md ================================================ ## Versioning ### Patch release Modin uses semantic versioning. So when doing a patch release, please make a separate branch off the previous release tag, and `git cherry-pick` **only** the commits we would like to have in our patch release (assuming previous release was versioned `X.Y.Z`): git checkout -b release-X.Y.Z+1 X.Y.Z ### Major and Minor releases A major (`xx.0.0`) or minor (`0.xx.0`) release could be done by branching from `main`: git checkout -b release-X.Y.0 main ## Preparing the release Before continuing with the release process, make sure that automated CI which runs on each commit passed successfully with the commit you deem as a "release candidate". 
Modin follows the "no push" logic, which is _only_ circumvented for cherry-picked commits, as reviewing them again would not add a lot of value but would add lots of excess work. Hence non-cherry-pick commits should happen in a separate branch in your own fork, and be delivered to the release branch by using a PR. Note that Modin uses fully signed commits, so you have to have GPG keys set up. See [onboarding instructions](https://github.com/modin-project/modin/blob/main/contributing/contributing.md) on where to get started. To update Modin version, follow the instructions below. ### Preparing the repo for a Major or Minor Version **Note**: this should be done in your fork of Modin. First, update your fork of Modin's main with the main repo's main. From your main, create a new branch called `release-X.Y.0` off of main. Create an empty commit in your new branch with the message `Release version X.Y.0`. Make sure to sign this commit with both your GPG key and with the conventional `git commit -s` (so `git commit -s -S`). Open a PR against modin-project/modin with just this commit. ### Preparing the repo for a Patch Version **Note**: this should be done in the original Modin repository (in `upstream`) . First, you must create a new branch in the upstream (main modin-project/modin) repo for the new release. This branch must be named `release-X.Y.Z`, and should be made off of the tag for the last release. To do this, use `git checkout -b release-X.Y.Z+1 X.Y.Z` to create the branch for the new release. Once this branch has been created, cherry-pick the commits that will go into this release, and push this branch to `upstream`. **Note**: now you must switch to your fork of Modin. From your fork of Modin, fetch the upstream repo, and checkout the release branch you made above. From this release branch, create a new branch. 
From your new branch, edit the `README.md` so that the PyPI badge will point to the badge for this specific version (instead of latest) and so that the docs link will point to the docs for this specific version (rather than latest). Once the badges have been edited, create a commit, the same as for a major or minor version, with the message `Release version X.Y.Z`, and make sure to sign it with both your GPG key and the traditional git sign-off. Create a PR using your branch against the `release-X.Y.Z` branch in the original Modin repo. ### Tag commit After the PR has been merged, clone a clean copy of the Modin repo from the modin-project organization. You now need to tag the commit that corresponds to the above PR with the appropriate tag for this release. **Note**: from now on you work on the `main` branch (in `upstream`) for a major or minor release, or the `release-X.Y.Z` branch (in `upstream`) for a patch release. git tag -as X.Y.Z * Use `scripts/release.py` to draft the release notes (might be as simple as `python scripts/release.py notes > draft.txt`) * If you're experiencing [rate limiting by GitHub](https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting) during username resolution, pass a token via the `--token` option to the script * Fill in the placeholder for summary of the release * Please look into PR sections and split them if necessary into smaller but better fitting ones, as the script only categorizes by prefix (`FIX-`, `TEST-`, etc.) * Make sure to correctly resolve contributors whom the script failed to transform to GitHub usernames if there are any! * Include release documentation in the annotation and make sure it is signed. * Push the tag to `main` or `release-X.Y.Z` branch: `git push upstream X.Y.Z` * If you're re-pushing a tag (beware!
you shouldn't be doing that, no, _really_!), you can remove remote tag and push a local one by `git push upstream :refs/tags/X.Y.Z` ### Build wheel: **Note**: This should be done from your clean clone of the `upstream` Modin repository from the modin-project organization, where you made the release tag. ```bash # Install/update tools pip install --upgrade build twine # Build a pure Python wheel. python3 setup.py sdist bdist_wheel ``` You may see the wheel in the `dist` folder: `ls -l dist`. Make sure the version is correct. Also make sure there is a `tar` file that contains the source. ### Upload wheels: Make sure you have an active PyPI account which has write access to Modin pypi repo, and make sure you have a pypi token set up. Use `twine` to upload wheels: ```bash twine upload dist/* ``` When asked for account, provide `__token__` (exactly as stated), when asked for password, present your token from pypi. ### Check with `pip install`: Run `pip install -U "modin[all]"` on Linux, Mac, and Windows systems in a new environment to test that the wheels were uploaded correctly. ## Make Github and conda-forge release ### Github Once the tag has been published, we need to make the release on GitHub. Go to the [Release page](https://github.com/modin-project/modin/releases), and click on `Draft a new release`. Choose the tag you made above from the dropdown menu, and copy paste the name of the release in the `Release title` box. Next, copy paste the release notes from above into the box labelled `Describe this release`. This will ensure that the release notes on GitHub are Markdown formatted. Double check that everything looks good by clicking `Preview`, and then hit the green `Publish release` button! ### Conda-forge Conda-forge has a bot which watches for new releases of software packaged through it, and in case of Modin it waits either for Github releases or for tags and then makes a new automatic PR with version increment. 
You should watch for that PR and, fixing any issues if there are some, merge it to make new Modin release appear in `conda-forge` channel. For detailed instructions on how to ensure the PR passes CI and is merge-able, check out [the how-to page in the modin-feedstock repo](https://github.com/conda-forge/modin-feedstock/blob/main/HOWTO.md)! ## Publicize Release Once the release has been finalized, make sure to post an announcement in the #general channel of the public Modin Slack! ================================================ FILE: docs/release_notes/release_notes-0.14.0.rst ================================================ :orphan: Modin 0.14.0 Key Features and Updates ------------------------ * Stability and Bugfixes * FIX-#4058: Allow pickling empty dataframes and series (#4095) * FIX-#4136: Fix exercise_3.ipynb example notebook (#4137) * FIX-#4105: Fix names of pandas options to avoid `OptionError` (#4109) * FIX-#3417: Fix read_csv with skiprows and header parameters (#3419) * FIX-#4142: Fix OmniSci enabling (#4146) * FIX-#4162: Use `skipif` instead of `skip` for compatibility with pytest 7.0 (#4163) * FIX-#4158: Do not print OmniSci logs to stdout by default (#4159) * FIX-#4177: Support read_feather from pathlike objects (#4177) * FIX-#4234: Upgrade pandas to 1.4.1 (#4235) * FIX-#3368: support unsigned integers in OmniSci backend (#4256) * FIX-#4057: Allow reading an empty parquet file (#4075) * FIX-#3884: Fix read_excel() dropping empty rows (#4161) * FIX-#4257: Fix Categorical() for scalar categories (#4258) * FIX-#4300: Fix Modin Categorical column dtype categories (#4276) * FIX-#4208: Fix lazy metadata update for `PandasDataFrame.from_labels` (#4209) * FIX-#3981, FIX-#3801, FIX-#4149: Stop broadcasting scalars to set items (#4160) * FIX-#4185: Fix rolling across column partitions (#4262) * FIX-#4303: Fix the syntax error in reading from postgres (#4304) * FIX-#4308: Add proper error handling in df.set_index (#4309) * FIX-#4056: Allow an empty parse_date 
list in `read_csv_glob` (#4074) * FIX-#4312: Fix constructing categorical frame with duplicate column names (#4313). * FIX-#4314: Allow passing a series of dtypes to astype (#4318) * FIX-#4310: Handle lists of lists of ints in read_csv_glob (#4319) * Performance enhancements * FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140) * FIX-#4183: Stop shallow copies from creating global shared state. (#4184) * Benchmarking enhancements * FIX-#4221: add `wait` method for `PandasOnRayDataframeColumnPartition` class (#4231) * Refactor Codebase * REFACTOR-#3990: remove code duplication in `PandasDataframePartition` hierarchy (#3991) * REFACTOR-#4229: remove unused `dask_client` global variable in `modin\pandas\__init__.py` (#4230) * REFACTOR-#3997: remove code duplication for `broadcast_apply` method (#3996) * REFACTOR-#3994: remove code duplication for `get_indices` function (#3995) * REFACTOR-#4331: remove code duplication for `to_pandas`, `to_numpy` functions in `QueryCompiler` hierarchy (#4332) * REFACTOR-#4213: Refactor `modin/examples/tutorial/` directory (#4214) * REFACTOR-#4206: add assert check into `__init__` method of `PandasOnDaskDataframePartition` class (#4207) * REFACTOR-#3900: add flake8-no-implicit-concat plugin and refactor flake8 error codes (#3901) * REFACTOR-#4093: Refactor base to be smaller (#4220) * REFACTOR-#4047: Rename `cluster` directory to `cloud` in examples (#4212) * REFACTOR-#3853: interacting with Dask interface through `DaskWrapper` class (#3854) * REFACTOR-#4322: Move is_reduce_fn outside of groupby_agg (#4323) * Pandas API implementations and improvements * FEAT-#3603: add experimental `read_custom_text` function that can read custom line-by-line text files (#3441) * FEAT-#979: Enable reading from SQL server (#4279) * OmniSci enhancements * * XGBoost enhancements * * Developer API enhancements * FEAT-#4245: Define base interface for dataframe exchange protocol (#4246) * FEAT-#4244: Implement dataframe 
exchange protocol for HdkOnNative execution (#4269) * FEAT-#4144: Implement dataframe exchange protocol for pandas storage format (#4150) * FEAT-#4342: Support `from_dataframe` for pandas storage format (#4343) * Update testing suite * TEST-#3628: Report coverage data for `test-internals` CI job (#4198) * TEST-#3938: Test tutorial notebooks in CI (#4145) * TEST-#4153: Fix condition of running lint-commit and set of CI triggers (#4156) * TEST-#4201: Add read_parquet, explode, tail, and various arithmetic functions to asv_bench (#4203) * Documentation improvements * DOCS-#4077: Add release notes template to docs folder (#4078) * DOCS-#4082: Add pdf/epub/htmlzip formats for doc builds (#4083) * DOCS-#4168: Fix rendering the examples on troubleshooting page (#4169) * DOCS-#4151: Add info in troubleshooting page related to Dask engine usage (#4152) * DOCS-#4172: Refresh Intel Distribution of Modin paragraph (#4175) * DOCS-#4173: Mention strict channel priority in conda install section (#4178) * DOCS-#4176: Update OmniSci usage section (#4192) * DOCS-#4027: Add GIF images and chart to Modin README demonstrating speedups (#4232) * DOCS-#3954: Add Dask example notebooks (#4139) * DOCS-#4272: Add bar chart comparisons to quick start guide (#4277) * DOCS-#3953: Add docs and notebook examples on running Modin with OmniSci (#4001) * DOCS-#4280: Change links in jupyter notebooks (#4281) * DOCS-#4290: Add changes for OmniSci notebooks (#4291) * DOCS-#4241: Update warnings and docs regarding defaulting to pandas (#4242) * DOCS-#3099: Fix `BasePandasDataSet` docstrings warnings (#4333) * DOCS-#4339: Reformat I/O functions docstrings (#4341) * DOCS-#4336: Reformat general utilities docstrings (#4338) * Dependencies * FIX-#4113, FIX-#4116, FIX-#4115: Apply new `black` formatting, fix pydocstyle check and readthedocs build (#4114) * TEST-#3227: Use codecov github action instead of bash form in GA workflows (#3226) * FIX-#4115: Unpin `pip` in readthedocs deps list (#4170) *
TEST-#4217: Pin `Dask<2022.2.0` as a temporary fix of CI (#4218) Contributors ------------ @prutskov @amyskov @paulovn @anmyachev @YarShev @RehanSD @devin-petersohn @dchigarev @Garra1980 @mvashishtha @naren-ponder @jeffreykennethli @dorisjlee @Rubtsowa ================================================ FILE: docs/release_notes/release_notes-0.15.0.rst ================================================ :orphan: Modin 0.15.0 Key Features and Updates ------------------------ * Stability and Bugfixes * FIX-#4376: Upgrade pandas to 1.4.2 (#4377) * FIX-#3615: Relax some deps in development env (#4365) * FIX-#4370: Fix broken docstring links (#4375) * FIX-#4392: Align Modin XGBoost with xgb>=1.6 (#4393) * FIX-#4385: Get rid of `use-deprecated` option in `pip` (#4386) * FIX-#3527: Fix parquet partitioning issue causing negative row length partitions (#4368) * FIX-#4330: Override the memory limit to start ray 1.11.0 on Macs (#4335) * FIX-#4407: Align `insert` function with pandas in case of numpy array with several columns (#4408) * FIX-#4373: Fix invalid file path when trying `read_csv_glob` with `usecols` parameter (#4405) * FIX-#4394: Fix issue with multiindex metadata desync (#4395) * FIX-#4438: Fix `reindex` function that doesn't preserve initial index metadata (#4442) * FIX-#4425: Add parameters to groupby pct_change (#4429) * FIX-#4457: Fix `loc` in case when need reindex item (#4457) * FIX-#4414: Add missing f prefix on f-strings found at https://codereview.doctor (#4415) * FIX-#4461: Fix S3 CSV data path (#4462) * FIX-#4467: `drop_duplicates` no longer removes items based on index values (#4468) * FIX-#4449: Drain the call queue before waiting on result in benchmark mode (#4472) * FIX-#4518: Fix Modin Logging to report specific Modin warnings/errors (#4519) * FIX-#4481: Allow clipping with a Modin Series of bounds (#4486) * FIX-#4504: Support na_action in applymap (#4505) * FIX-#4503: Stop the memory logging thread after session exit (#4515) * FIX-#4531: Fix a makedirs 
race condition in to_parquet (#4533) * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490) * FIX-#4436: Fix to_pydatetime dtype for timezone None (#4437) * FIX-#4541: Fix merge_asof with non-unique right index (#4542) * Performance enhancements * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346) * PERF-#4493: Use partition size caches more in Modin dataframe (#4495) * Benchmarking enhancements * FEAT-#4371: Add logging to Modin (#4372) * FEAT-#4501: Add RSS Memory Profiling to Modin Logging (#4502) * FEAT-#4524: Split Modin API and Memory log files (#4526) * Refactor Codebase * REFACTOR-#4284: use variable length unpacking when getting results from `deploy` function (#4285) * REFACTOR-#3642: Move PyArrow storage format usage from main feature to experimental ones (#4374) * REFACTOR-#4003: Delete the deprecated cloud mortgage example (#4406) * REFACTOR-#4513: Fix spelling mistakes in docs and docstrings (#4514) * REFACTOR-#4510: Align experimental and regular IO modules initializations (#4511) * Pandas API implementations and improvements * * OmniSci enhancements * * XGBoost enhancements * * Developer API enhancements * FEAT-#4359: Add __dataframe__ method to the protocol dataframe (#4360) * Update testing suite * TEST-#4363: Use Ray from pypi in CI (#4364) * FIX-#4422: get rid of case sensitivity for `warns_that_defaulting_to_pandas` (#4423) * TEST-#4426: Stop passing is_default kwarg to Modin and pandas (#4428) * FIX-#4439: Fix flake8 CI fail (#4440) * FIX-#4409: Fix `eval_insert` utility that doesn't actually check results of `insert` function (#4410) * TEST-#4482: Fix getitem and loc with series of bools (#4483). * Documentation improvements * DOCS-#4296: Fix docs warnings (#4297) * DOCS-#4388: Turn off fail_on_warning option for docs build (#4389) * DOCS-#4469: Say that commit messages can start with PERF (#4470). * DOCS-#4466: Recommend GitHub issues over bug_reports@modin.org (#4474). 
* DOCS-#4487: Recommend GitHub issues over feature_requests@modin.org (#4489). * DOCS-#4545: Add socials to README (#4555). * Dependencies * FIX-#4327: Update min pin for xgboost version (#4328) * FIX-#4383: Remove `pathlib` from deps (#4384) * FIX-#4390: Add `redis` to Modin dependencies (#4396) * FIX-#3689: Add black and flake8 into development environment files (#4480) * TEST-#4516: Add numpydoc to developer requirements (#4517) * New Features * FEAT-#4412: Add Batch Pipeline API to Modin (#4452) Contributors ------------ @YarShev @Garra1980 @prutskov @alexander3774 @amyskov @wangxiaoying @jeffreykennethli @mvashishtha @anmyachev @dchigarev @devin-petersohn @jrsacher @orcahmlee @naren-ponder @RehanSD ================================================ FILE: docs/release_notes/release_notes-0.16.0.rst ================================================ :orphan: Modin 0.16.0 Key Features and Updates ------------------------ * Stability and Bugfixes * FIX-#4570: Replace ``np.bool`` -> ``np.bool_`` (#4571) * FIX-#4543: Fix `read_csv` in case skiprows=<0, []> (#4544) * FIX-#4059: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391) * FIX-#4589: Pin protobuf<4.0.0 to fix ray (#4590) * FIX-#4577: Set attribute of Modin dataframe to updated value (#4588) * FIX-#4411: Fix binary_op between datetime64 Series and pandas timedelta (#4592) * FIX-#4604: Fix `groupby` + `agg` in case when multicolumn can arise (#4642) * FIX-#4582: Inherit custom log layer (#4583) * FIX-#4639: Fix `storage_options` usage for `read_csv` and `read_csv_glob` (#4644) * FIX-#4593: Ensure Modin warns when setting columns via attributes (#4621) * FIX-#4584: Enable pdb debug when running cloud tests (#4585) * FIX-#4564: Workaround import issues in Ray: auto-import pandas on python start if env var is set (#4603) * FIX-#4641: Reindex pandas partitions in `df.describe()` (#4651) * FIX-#2064: Fix `iloc`/`loc` assignment when dataframe is empty (#4677) * FIX-#4634: Check for 
FrozenList as `by` in `df.groupby()` (#4667) * FIX-#4680: Fix `read_csv` that started defaulting to pandas again in case of reading from a buffer and when a buffer has a non-zero starting position (#4681) * FIX-#4491: Wait for all partitions in parallel in benchmark mode (#4656) * FIX-#4358: MultiIndex `loc` shouldn't drop levels for full-key lookups (#4608) * FIX-#4658: Expand exception handling for `read_*` functions from s3 storages (#4659) * FIX-#4672: Fix incorrect warning when setting `frame.index` or `frame.columns` (#4721) * FIX-#4686: Propagate metadata and drain call queue in unwrap_partitions (#4697) * FIX-#4652: Support categorical data in `from_dataframe` (#4737) * FIX-#4756: Correctly propagate `storage_options` in `read_parquet` (#4764) * FIX-#4657: Use `fsspec` for handling s3/http-like paths instead of `s3fs` (#4710) * FIX-#4676: drain sub-virtual-partition call queues (#4695) * FIX-#4782: Exclude certain non-parquet files in `read_parquet` (#4783) * FIX-#4808: Set dtypes correctly after column rename (#4809) * FIX-#4811: Apply dataframe -> not_dataframe functions to virtual partitions (#4812) * FIX-#4099: Use mangled column names but keep the original when building frames from arrow (#4767) * FIX-#4838: Bump up modin-spreadsheet to latest master (#4839) * FIX-#4840: Change modin-spreadsheet version for notebook requirements (#4841) * FIX-#4835: Handle Pathlike paths in `read_parquet` (#4837) * FIX-#4872: Stop checking the private ray mac memory limit (#4873) * FIX-#4914: `base_lengths` should be computed from `base_frame` instead of `self` in `copartition` (#4915) * FIX-#4848: Fix rebalancing partitions when NPartitions == 1 (#4874) * FIX-#4927: Fix `dtypes` computation in `dataframe.filter` (#4928) * FIX-#4907: Implement `radd` for Series and DataFrame (#4908) * FIX-#4945: Fix `_take_2d_positional` that loses indexes due to filtering empty dataframes (#4951) * FIX-#4818, PERF-#4825: Fix where by using the new n-ary operator (#4820) * FIX-#3983:
FIX-#4107: Materialize 'rowid' columns when selecting rows by position (#4834) * FIX-#4845: Fix KeyError from `__getitem_bool` for single row dataframes (#4845) * FIX-#4734: Handle Series.apply when return type is a DataFrame (#4830) * FIX-#4983: Set `frac` to `None` in _sample when `n=0` (#4984) * FIX-#4993: Return `_default_to_pandas` in `df.attrs` (#4995) * FIX-#5043: Fix `execute` function in ASV utils failed if `len(partitions) == 0` (#5044) * FIX-#4597: Refactor Partition handling of func, args, kwargs (#4715) * FIX-#4996: Evaluate BenchmarkMode at each function call (#4997) * FIX-#4022: Fixed empty data frame with index (#4910) * FIX-#4090: Fixed check if the index is trivial (#4936) * FIX-#4966: Fix `to_timedelta` to return Series instead of TimedeltaIndex (#5028) * FIX-#5042: Fix series __getitem__ with invalid strings (#5048) * FIX-#4691: Fix binary operations between virtual partitions (#5049) * FIX-#5045: Fix ray virtual_partition.wait with duplicate object refs (#5058) * Performance enhancements * PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391) * PERF-#4288: Improve perf of `groupby.mean` for narrow data (#4591) * PERF-#4772: Remove `df.copy` call from `from_pandas` since it is not needed for Ray and Dask (#4781) * PERF-#4325: Improve perf of multi-column assignment in `__setitem__` when no new column names are assigning (#4455) * PERF-#3844: Improve perf of `drop` operation (#4694) * PERF-#4727: Improve perf of `concat` operation (#4728) * PERF-#4705: Improve perf of arithmetic operations between `Series` objects with shared `.index` (#4689) * PERF-#4703: Improve performance in accessing `ser.cat.categories`, `ser.cat.ordered`, and `ser.__array_priority__` (#4704) * PERF-#4305: Parallelize `read_parquet` over row groups (#4700) * PERF-#4773: Compute `lengths` and `widths` in `put` method of Dask partition like Ray do (#4780) * PERF-#4732: Avoid overwriting already-evaluated 
`PandasOnRayDataframePartition._length_cache` and `PandasOnRayDataframePartition._width_cache` (#4754) * PERF-#4862: Don't call `compute_sliced_len.remote` when `row_labels/col_labels == slice(None)` (#4863) * PERF-#4713: Stop overriding the ray MacOS object store size limit (#4792) * PERF-#4851: Compute `dtypes` for binary operations that can only return bool type and the right operand is not a Modin object (#4852) * PERF-#4842: `copy` should not trigger any previous computations (#4843) * PERF-#4849: Compute `dtypes` in `concat` also for ROW_WISE case when possible (#4850) * PERF-#4929: Compute `dtype` when using `Series.dt` accessor (#4930) * PERF-#4892: Compute `lengths` in `rebalance_partitions` when possible (#4893) * PERF-#4794: Compute caches in `_propagate_index_objs` (#4888) * PERF-#4860: `PandasDataframeAxisPartition.deploy_axis_func` should be serialized only once (#4861) * PERF-#4890: `PandasDataframeAxisPartition.drain` should be serialized only once (#4891) * PERF-#4870: Avoid index materialization in `__getattribute__` and `__getitem__` (#4911) * PERF-#4886: Use lazy index and columns evaluation in `query` method (#4887) * PERF-#4866: `iloc` function that is used in `partition.mask` should be serialized only once (#4901) * PERF-#4920: Avoid index and cache computations in `take_2d_labels_or_positional` unless they are needed (#4921) * PERF-#4999: don't call `apply` in virtual partition's `drain_call_queue` if `call_queue` is empty (#4975) * PERF-#4268: Implement partition-parallel __getitem__ for bool Series masks (#4753) * PERF-#5017: `reset_index` shouldn't trigger index materialization if possible (#5018) * PERF-#4963: Use partition `width/length` methods instead of `_compute_axis_labels_and_lengths` if index is already known (#4964) * PERF-#4940: Optimize categorical dtype check in `concatenate` (#4953) * Benchmarking enhancements * TEST-#5066: Add outer join case for `TimeConcat` benchmark (#5067) * TEST-#5083: Add `merge` op with categorical data
(#5084) * FEAT-#4706: Add Modin ClassLogger to PandasDataframePartitionManager (#4707) * TEST-#5014: Simplify adding new ASV benchmarks (#5015) * TEST-#5064: Update `TimeConcat` benchmark with new parameter `ignore_index` (#5065) * PERF-#4944: Avoid default_to_pandas in ``Series.cat.codes``, ``Series.dt.tz``, and ``Series.dt.to_pytimedelta`` (#4833) * TEST-#5068: Add binary op benchmark for Series (#5069) * Refactor Codebase * REFACTOR-#4530: Standardize access to physical data in partitions (#4563) * REFACTOR-#4534: Replace logging meta class with class decorator (#4535) * REFACTOR-#4708: Delete combine dtypes (#4709) * REFACTOR-#4629: Add type annotations to modin/config (#4685) * REFACTOR-#4717: Improve PartitionMgr.get_indices() usage (#4718) * REFACTOR-#4730: make Indexer immutable (#4731) * REFACTOR-#4774: remove `_build_treereduce_func` call from `_compute_dtypes` (#4775) * REFACTOR-#4750: Delete BaseDataframeAxisPartition.shuffle (#4751) * REFACTOR-#4722: Stop suppressing undefined name lint (#4723) * REFACTOR-#4832: unify `split_result_of_axis_func_pandas` (#4831) * REFACTOR-#4796: Introduce constant for __reduced__ column name (#4799) * REFACTOR-#4000: Remove code duplication for `PandasOnRayDataframePartitionManager` (#4895) * REFACTOR-#3780: Remove code duplication for `PandasOnDaskDataframe` (#3781) * REFACTOR-#4530: Unify access to physical data for any partition type (#4829) * REFACTOR-#4978: Align `modin/core/execution/dask/common/__init__.py` with `modin/core/execution/ray/common/__init__.py` (#4979) * REFACTOR-#4949: Remove code duplication in `default2pandas/dataframe.py` and `default2pandas/any.py` (#4950) * REFACTOR-#4976: Rename `RayTask` to `RayWrapper` in accordance with Dask (#4977) * REFACTOR-#4885: De-duplicated take_2d_labels_or_positional methods (#4883) * REFACTOR-#5005: Use `finalize` method instead of list comprehension + `drain_call_queue` (#5006) * REFACTOR-#5001: Remove `jenkins` stuff (#5002) * REFACTOR-#5026: Change exception 
names to simplify grepping (#5027) * REFACTOR-#4970: Rewrite base implementations of a partition's `width/length` (#4971) * REFACTOR-#4942: Remove `call` method in favor of `register` due to duplication (#4943) * REFACTOR-#4922: Helpers for take_2d_labels_or_positional (#4865) * REFACTOR-#5024: Make `_row_lengths` and `_column_widths` public (#5025) * REFACTOR-#5009: Use `RayWrapper.materialize` instead of `ray.get` (#5010) * REFACTOR-#4755: Rewrite Pandas version mismatch warning (#4965) * REFACTOR-#5012: Add mypy checks for singleton files in base modin directory (#5013) * REFACTOR-#5038: Remove unnecessary _method argument from resamplers (#5039) * REFACTOR-#5081: Remove `c323f7fe385011ed849300155de07645.db` file (#5082) * Pandas API implementations and improvements * FEAT-#4670: Implement convert_dtypes by mapping across partitions (#4671) * OmniSci enhancements * FEAT-#4913: Enabling pyhdk * XGBoost enhancements * * Developer API enhancements * * Update testing suite * TEST-#4508: Reduce test_partition_api pytest threads to deflake it (#4551) * TEST-#4550: Use much less data in test_partition_api (#4554) * TEST-#4610: Remove explicit installation of `black`/`flake8` for omnisci ci-notebooks (#4609) * TEST-#2564: Add caching and use mamba for conda setups in GH (#4607) * TEST-#4557: Delete multiindex sorts instead of xfailing (#4559) * TEST-#4698: Stop passing invalid storage_options param (#4699) * TEST-#4745: Pin flake8 to <5 to workaround installation conflict (#4752) * TEST-#4875: XFail tests failing due to file gone missing (#4876) * TEST-#4879: Use pandas `ensure_clean()` in place of `io_tests_data` (#4881) * TEST-#4562: Use local Ray cluster in CI to resolve flaky `test-compat-win` (#5007) * TEST-#5040: Rework test_series using eval_general() (#5041) * TEST-#5050: Add black to pre-commit hook (#5051) * Documentation improvements * DOCS-#4552: Change default sphinx language to en to fix sphinx >= 5.0.0 build (#4553) * DOCS-#4628: Add to_parquet partial
support notes (#4648) * DOCS-#4668: Set light theme for readthedocs page, remove theme switcher (#4669) * DOCS-#4748: Apply the Triage label to new issues (#4749) * DOCS-#4790: Give all templates issue type and triage labels (#4791) * DOCS-#4521: Document how to benchmark modin (#5020) * Dependencies * FEAT-#4598: Add support for pandas 1.4.3 (#4599) * FEAT-#4619: Integrate mypy static type checking (#4620) * FEAT-#4202: Allow dask past 2022.2.0 (#4769) * FEAT-#4925: Upgrade pandas to 1.4.4 (#4926) * TEST-#4998: Add flake8 plugins to dev requirements (#5000) * New Features * FEAT-#4463: Add experimental fuzzydata integration for testing against a randomized dataframe workflow (#4556) * FEAT-#4419: Extend virtual partitioning API to pandas on Dask (#4420) * FEAT-#4147: Add partial compatibility with Python 3.6 and pandas 1.1 (#4301) * FEAT-#4569: Add error message when `read_` function defaults to pandas (#4647) * FEAT-#4725: Make index and columns lazy in Modin DataFrame (#4726) * FEAT-#4664: Finalize compatibility support for Python 3.6 (#4800) * FEAT-#4746: Sync interchange protocol with recent API changes (#4763) * FEAT-#4733: Support fastparquet as engine for `read_parquet` (#4807) * FEAT-#4766: Support fsspec URLs in `read_csv` and `read_csv_glob` (#4898) * FEAT-#4827: Implement `infer_types` dataframe algebra operator (#4871) * FEAT-#4989: Switch pandas version to 1.5 (#5037) Contributors ------------ @mvashishtha @NickCrews @prutskov @vnlitvinov @pyrito @suhailrehman @RehanSD @helmeleegy @anmyachev @d33bs @noloerino @devin-petersohn @YarShev @naren-ponder @jbrockmendel @ienkovich @Garra1980 @Billy2551 ================================================ FILE: docs/release_notes/release_notes-template.rst ================================================ :orphan: Modin X.X.X Key Features and Updates ------------------------ * Stability and Bugfixes * * Performance enhancements * * Benchmarking enhancements * * Refactor Codebase * * Pandas API implementations and
improvements * * HDK enhancements * * XGBoost enhancements * * Developer API enhancements * * Update testing suite * * Documentation improvements * * Dependencies * * New Features Contributors ------------ ================================================ FILE: docs/requirements-doc.txt ================================================ # install current modin checkout to bring all required dependencies .[all] # now install some more optional dependencies colorama click flatbuffers funcsigs mock opencv-python pydata_sphinx_theme pyyaml recommonmark sphinx<6.0.0 sphinx-click ray>=2.10.0,<3 # Override to latest version of modin-spreadsheet git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 sphinxcontrib_plantuml sphinx-issues ================================================ FILE: docs/supported_apis/dataframe_supported.rst ================================================ ``pd.DataFrame`` supported APIs =================================== The following table lists both implemented and not implemented methods. If you have need of an operation that is listed as not implemented, feel free to open an issue on the `GitHub repository`_, or give a thumbs up to already created issues. Contributions are also welcome! The following table is structured as follows: The first column contains the method name. The second column contains link to a description of corresponding pandas method. The third column is a flag for whether or not there is an implementation in Modin for the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | DataFrame method | pandas Doc link | Implemented? 
(Y/N/P/D) | Notes for Current implementation | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``T`` | `T`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``abs`` | `abs`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``add`` | `add`_ | Y | **Ray** and **Dask**: Shuffles data in operations | | | | | between DataFrames. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``add_prefix`` | `add_prefix`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``add_suffix`` | `add_suffix`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``agg`` / ``aggregate`` | `agg`_ / `aggregate`_ | P | - Dictionary ``func`` parameter defaults to pandas | | | | | - Numpy operations default to pandas | | | | | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``align`` | `align`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``all`` | `all`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``any`` | `any`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``apply`` | `apply`_ | Y | See ``agg`` | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``applymap`` | `applymap`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``asfreq`` | `asfreq`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``asof`` | `asof`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``assign`` | `assign`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``astype`` | `astype`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``at`` | `at`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``at_time`` | `at_time`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``axes`` | `axes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``between_time`` | `between_time`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``bfill`` | `bfill`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``bool`` | `bool`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``boxplot`` | `boxplot`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``clip`` | `clip`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``combine`` | `combine`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``combine_first`` | `combine_first`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``compare`` | `compare`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``copy`` | `copy`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``corr`` | `corr`_ | P | Correlation floating point precision may slightly | | | | | differ from pandas. For now pearson method is | | | | | available only. For other methods and for | | | | | ``numeric_only`` defaults to pandas. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``corrwith`` | `corrwith`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``count`` | `count`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cov`` | `cov`_ | P | Covariance floating point precision may slightly | | | | | differ from pandas. 
For ``numeric_only`` | | | | | defaults to pandas. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cummax`` | `cummax`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cummin`` | `cummin`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cumprod`` | `cumprod`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``cumsum`` | `cumsum`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``describe`` | `describe`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``diff`` | `diff`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``div`` | `div`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``divide`` | `divide`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``dot`` | `dot`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``drop`` | `drop`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``droplevel`` | `droplevel`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``drop_duplicates`` | `drop_duplicates`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``dropna`` | `dropna`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``dtypes`` | `dtypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``duplicated`` | `duplicated`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``empty`` | `empty`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``eq`` | `eq`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``equals`` | `equals`_ | Y | Requires shuffle, can be further optimized | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``eval`` | `eval`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ewm`` | `ewm`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``expanding`` | `expanding`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``explode`` | `explode`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ffill`` | `ffill`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``fillna`` | `fillna`_ | P | ``value`` parameter of type DataFrame defaults to | | | | | pandas. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``filter`` | `filter`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``first`` | `first`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``first_valid_index`` | `first_valid_index`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``floordiv`` | `floordiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``from_dict`` | `from_dict`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``from_records`` | `from_records`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ge`` | `ge`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``get`` | `get`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``groupby`` | `groupby`_ | Y | Not yet 
optimized for all operations. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``gt`` | `gt`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``head`` | `head`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``hist`` | `hist`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``iat`` | `iat`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``idxmax`` | `idxmax`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``idxmin`` | `idxmin`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``iloc`` | `iloc`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``infer_objects`` | `infer_objects`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``info`` | `info`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``insert`` | `insert`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``interpolate`` | `interpolate`_ | D | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``isetitem`` | `isetitem`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``isin`` | `isin`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``isna`` | `isna`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``isnull`` | `isnull`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``items`` | `items`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``iterrows`` | `iterrows`_ | P | Modin does not parallelize iteration in Python | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``itertuples`` | `itertuples`_ | P | Modin does not parallelize iteration in Python | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``join`` | `join`_ | P | When ``on`` is set to ``right`` or ``outer`` or | | | | | when ``validate`` is given defaults to pandas | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``keys`` | `keys`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``kurt`` | `kurt`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``kurtosis`` | `kurtosis`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``last`` | `last`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``last_valid_index`` | `last_valid_index`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``le`` | `le`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``loc`` | `loc`_ | P | We do not support: boolean array, callable. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``lt`` | `lt`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mask`` | `mask`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``max`` | `max`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mean`` | `mean`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``median`` | `median`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. 
| +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``melt`` | `melt`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``memory_usage`` | `memory_usage`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | | | | Implemented the following cases: | | | | | ``left_index=True`` and ``right_index=True``, | | | | | ``how=left`` and ``how=inner`` for all values | | ``merge`` | `merge`_ | P | of parameters except ``left_index=True`` and | | | | | ``right_index=False`` or ``left_index=False`` | | | | | and ``right_index=True``. | | | | | Defaults to pandas otherwise. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``min`` | `min`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mod`` | `mod`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mode`` | `mode`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``mul`` | `mul`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``multiply`` | `multiply`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ndim`` | `ndim`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ne`` | `ne`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``nlargest`` | `nlargest`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``notna`` | `notna`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``notnull`` | `notnull`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``nsmallest`` | `nsmallest`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``nunique`` | `nunique`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pct_change`` | `pct_change`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pipe`` | `pipe`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pivot`` | `pivot`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pivot_table`` | `pivot_table`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``plot`` | `plot`_ | D | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pop`` | `pop`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pow`` | `pow`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``prod`` | `prod`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``product`` | `product`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``quantile`` | `quantile`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``query`` | `query`_ | P | Local variables not yet supported | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``radd`` | `radd`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rank`` | `rank`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rdiv`` | `rdiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reindex`` | `reindex`_ | Y | Shuffles data | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reindex_like`` | `reindex_like`_ | D | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rename`` | `rename`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rename_axis`` | `rename_axis`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reorder_levels`` | `reorder_levels`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``replace`` | `replace`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``resample`` | `resample`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reset_index`` | `reset_index`_ | P | **Ray** and **Dask**: ``D`` when ``names`` or | | | | | ``allow_duplicates`` is non-default | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rfloordiv`` | `rfloordiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rmod`` | `rmod`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rmul`` | `rmul`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rolling`` | `rolling`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``round`` | `round`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rpow`` | `rpow`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rsub`` | `rsub`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rtruediv`` | `rtruediv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sample`` | `sample`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``select_dtypes`` | `select_dtypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sem`` | `sem`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. 
| +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``set_axis`` | `set_axis`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``set_index`` | `set_index`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``shape`` | `shape`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``shift`` | `shift`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``size`` | `size`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``skew`` | `skew`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sort_index`` | `sort_index`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sort_values`` | `sort_values`_ | Y | Shuffles data. 
Order of indexes that have the | | | | | same sort key is not guaranteed to be the same | | | | | across sorts | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sparse`` | `sparse`_ | N | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``squeeze`` | `squeeze`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``stack`` | `stack`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``std`` | `std`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``style`` | `style`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sub`` | `sub`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``subtract`` | `subtract`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``sum`` | `sum`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``swapaxes`` | `swapaxes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``swaplevel`` | `swaplevel`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``tail`` | `tail`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``take`` | `take`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_clipboard`` | `to_clipboard`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_csv`` | `to_csv`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_dict`` | `to_dict`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_excel`` | `to_excel`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_feather`` | `to_feather`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_gbq`` | `to_gbq`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_hdf`` | `to_hdf`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_html`` | `to_html`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_json`` | `to_json`_ | D | | | | | | Experimental implementation: | | | | | DataFrame.modin.to_json_glob | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_xml`` | `to_xml`_ | D | | | | | | Experimental implementation: | | | | | DataFrame.modin.to_xml_glob | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_latex`` | `to_latex`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_orc`` | `to_orc`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_parquet`` | `to_parquet`_ | P | **Ray/Dask/Unidist**: Parallel implementation only | | | | | if path parameter is a string. In that case, the | | | | | ``path`` parameter specifies a directory where one | | | | | file is written per row partition of the Modin | | | | | dataframe. 
| | | | | Experimental implementation: | | | | | DataFrame.modin.to_parquet_glob | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_period`` | `to_period`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_pickle`` | `to_pickle`_ | D | Experimental implementation: | | | | | DataFrame.modin.to_pickle_glob | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_records`` | `to_records`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_sql`` | `to_sql`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_stata`` | `to_stata`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_string`` | `to_string`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_timestamp`` | `to_timestamp`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``to_xarray`` | `to_xarray`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``transform`` | `transform`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``transpose`` | `transpose`_ | Y | | 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``truediv`` | `truediv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``truncate`` | `truncate`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``tz_convert`` | `tz_convert`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``tz_localize`` | `tz_localize`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``unstack`` | `unstack`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``update`` | `update`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``values`` | `values`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``value_counts`` | `value_counts`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``var`` | `var`_ | P | Modin defaults to pandas if given the ``level`` | | | | | param. | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``where`` | `where`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ .. 
_`GitHub repository`: https://github.com/modin-project/modin/issues .. _`T`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.T.html#pandas.DataFrame.T .. _`abs`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.abs.html#pandas.DataFrame.abs .. _`add`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.add.html#pandas.DataFrame.add .. _`add_prefix`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.add_prefix.html#pandas.DataFrame.add_prefix .. _`add_suffix`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.add_suffix.html#pandas.DataFrame.add_suffix .. _`agg`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html#pandas.DataFrame.agg .. _`aggregate`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.aggregate.html#pandas.DataFrame.aggregate .. _`align`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.align.html#pandas.DataFrame.align .. _`all`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.all.html#pandas.DataFrame.all .. _`any`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.any.html#pandas.DataFrame.any .. _`apply`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html#pandas.DataFrame.apply .. _`applymap`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.applymap.html#pandas.DataFrame.applymap .. _`asfreq`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.asfreq.html#pandas.DataFrame.asfreq .. _`asof`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.asof.html#pandas.DataFrame.asof .. _`assign`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.assign.html#pandas.DataFrame.assign .. 
_`astype`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html#pandas.DataFrame.astype .. _`at`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.at.html#pandas.DataFrame.at .. _`at_time`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.at_time.html#pandas.DataFrame.at_time .. _`axes`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.axes.html#pandas.DataFrame.axes .. _`between_time`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.between_time.html#pandas.DataFrame.between_time .. _`bfill`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.bfill.html#pandas.DataFrame.bfill .. _`bool`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.bool.html#pandas.DataFrame.bool .. _`boxplot`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.boxplot.html#pandas.DataFrame.boxplot .. _`clip`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.clip.html#pandas.DataFrame.clip .. _`combine`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.combine.html#pandas.DataFrame.combine .. _`combine_first`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.combine_first.html#pandas.DataFrame.combine_first .. _`compare`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.compare.html#pandas.DataFrame.compare .. _`compound`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.compound.html#pandas.DataFrame.compound .. _`copy`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.copy.html#pandas.DataFrame.copy .. _`corr`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html#pandas.DataFrame.corr .. 
_`corrwith`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corrwith.html#pandas.DataFrame.corrwith .. _`count`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.count.html#pandas.DataFrame.count .. _`cov`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.cov.html#pandas.DataFrame.cov .. _`cummax`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.cummax.html#pandas.DataFrame.cummax .. _`cummin`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.cummin.html#pandas.DataFrame.cummin .. _`cumprod`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.cumprod.html#pandas.DataFrame.cumprod .. _`cumsum`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.cumsum.html#pandas.DataFrame.cumsum .. _`describe`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe .. _`diff`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.diff.html#pandas.DataFrame.diff .. _`div`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.div.html#pandas.DataFrame.div .. _`divide`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.divide.html#pandas.DataFrame.divide .. _`dot`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dot.html#pandas.DataFrame.dot .. _`drop`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html#pandas.DataFrame.drop .. _`droplevel`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.droplevel.html .. _`drop_duplicates`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html#pandas.DataFrame.drop_duplicates .. _`dropna`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna .. 
_`dtypes`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dtypes.html#pandas.DataFrame.dtypes .. _`duplicated`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html#pandas.DataFrame.duplicated .. _`empty`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.empty.html#pandas.DataFrame.empty .. _`eq`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eq.html#pandas.DataFrame.eq .. _`equals`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.equals.html#pandas.DataFrame.equals .. _`eval`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.eval.html#pandas.DataFrame.eval .. _`ewm`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html#pandas.DataFrame.ewm .. _`expanding`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.expanding.html#pandas.DataFrame.expanding .. _`explode`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html#pandas-dataframe-explode .. _`ffill`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ffill.html#pandas.DataFrame.ffill .. _`fillna`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html#pandas.DataFrame.fillna .. _`filter`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html#pandas.DataFrame.filter .. _`first`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.first.html#pandas.DataFrame.first .. _`first_valid_index`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.first_valid_index.html#pandas.DataFrame.first_valid_index .. _`floordiv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.floordiv.html#pandas.DataFrame.floordiv .. 
_`from_dict`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html#pandas.DataFrame.from_dict .. _`from_records`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_records.html#pandas.DataFrame.from_records .. _`ge`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ge.html#pandas.DataFrame.ge .. _`get`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.get.html#pandas.DataFrame.get .. _`get_dtype_counts`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.get_dtype_counts.html#pandas.DataFrame.get_dtype_counts .. _`get_ftype_counts`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.get_ftype_counts.html#pandas.DataFrame.get_ftype_counts .. _`get_value`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.get_value.html#pandas.DataFrame.get_value .. _`get_values`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.get_values.html#pandas.DataFrame.get_values .. _`groupby`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html#pandas.DataFrame.groupby .. _`gt`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.gt.html#pandas.DataFrame.gt .. _`head`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html#pandas.DataFrame.head .. _`hist`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.hist.html#pandas.DataFrame.hist .. _`iat`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iat.html#pandas.DataFrame.iat .. _`idxmax`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.idxmax.html#pandas.DataFrame.idxmax .. _`idxmin`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.idxmin.html#pandas.DataFrame.idxmin .. 
_`iloc`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc .. _`infer_objects`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.infer_objects.html#pandas.DataFrame.infer_objects .. _`info`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html#pandas.DataFrame.info .. _`insert`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.insert.html#pandas.DataFrame.insert .. _`interpolate`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate .. _`is_copy`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.is_copy.html#pandas.DataFrame.is_copy .. _`isetitem`: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isetitem.html?#pandas-dataframe-isetitem .. _`isin`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isin.html#pandas.DataFrame.isin .. _`isna`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isna.html#pandas.DataFrame.isna .. _`isnull`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isnull.html#pandas.DataFrame.isnull .. _`items`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.items.html#pandas.DataFrame.items .. _`iterrows`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html#pandas.DataFrame.iterrows .. _`itertuples`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples .. _`ix`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ix.html#pandas.DataFrame.ix .. _`join`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html#pandas.DataFrame.join .. 
_`keys`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.keys.html#pandas.DataFrame.keys .. _`kurt`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurt.html#pandas.DataFrame.kurt .. _`kurtosis`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurtosis.html#pandas.DataFrame.kurtosis .. _`last`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.last.html#pandas.DataFrame.last .. _`last_valid_index`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.last_valid_index.html#pandas.DataFrame.last_valid_index .. _`le`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.le.html#pandas.DataFrame.le .. _`loc`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html#pandas.DataFrame.loc .. _`lt`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.lt.html#pandas.DataFrame.lt .. _`mask`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mask.html#pandas.DataFrame.mask .. _`max`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.max.html#pandas.DataFrame.max .. _`mean`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html#pandas.DataFrame.mean .. _`median`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.median.html#pandas.DataFrame.median .. _`melt`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.melt.html#pandas.DataFrame.melt .. _`memory_usage`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.memory_usage.html#pandas.DataFrame.memory_usage .. _`merge`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html#pandas.DataFrame.merge .. _`min`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.min.html#pandas.DataFrame.min .. 
_`mod`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mod.html#pandas.DataFrame.mod .. _`mode`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mode.html#pandas.DataFrame.mode .. _`mul`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mul.html#pandas.DataFrame.mul .. _`multiply`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.multiply.html#pandas.DataFrame.multiply .. _`ndim`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ndim.html#pandas.DataFrame.ndim .. _`ne`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ne.html#pandas.DataFrame.ne .. _`nlargest`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nlargest.html#pandas.DataFrame.nlargest .. _`notna`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.notna.html#pandas.DataFrame.notna .. _`notnull`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.notnull.html#pandas.DataFrame.notnull .. _`nsmallest`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nsmallest.html#pandas.DataFrame.nsmallest .. _`nunique`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nunique.html#pandas.DataFrame.nunique .. _`pct_change`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pct_change.html#pandas.DataFrame.pct_change .. _`pipe`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pipe.html#pandas.DataFrame.pipe .. _`pivot`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html#pandas.DataFrame.pivot .. _`pivot_table`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot_table.html#pandas.DataFrame.pivot_table .. 
_`plot`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html#pandas.DataFrame.plot .. _`pop`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pop.html#pandas.DataFrame.pop .. _`pow`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pow.html#pandas.DataFrame.pow .. _`prod`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.prod.html#pandas.DataFrame.prod .. _`product`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.product.html#pandas.DataFrame.product .. _`quantile`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.quantile.html#pandas.DataFrame.quantile .. _`query`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html#pandas.DataFrame.query .. _`radd`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.radd.html#pandas.DataFrame.radd .. _`rank`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rank.html#pandas.DataFrame.rank .. _`rdiv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rdiv.html#pandas.DataFrame.rdiv .. _`reindex`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reindex.html#pandas.DataFrame.reindex .. _`reindex_like`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reindex_like.html#pandas.DataFrame.reindex_like .. _`rename`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html#pandas.DataFrame.rename .. _`rename_axis`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename_axis.html#pandas.DataFrame.rename_axis .. _`reorder_levels`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reorder_levels.html#pandas.DataFrame.reorder_levels .. 
_`replace`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html#pandas.DataFrame.replace .. _`resample`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html#pandas.DataFrame.resample .. _`reset_index`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html#pandas.DataFrame.reset_index .. _`rfloordiv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rfloordiv.html#pandas.DataFrame.rfloordiv .. _`rmod`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rmod.html#pandas.DataFrame.rmod .. _`rmul`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rmul.html#pandas.DataFrame.rmul .. _`rolling`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html#pandas.DataFrame.rolling .. _`round`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.round.html#pandas.DataFrame.round .. _`rpow`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rpow.html#pandas.DataFrame.rpow .. _`rsub`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rsub.html#pandas.DataFrame.rsub .. _`rtruediv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rtruediv.html#pandas.DataFrame.rtruediv .. _`sample`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html#pandas.DataFrame.sample .. _`select_dtypes`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html#pandas.DataFrame.select_dtypes .. _`sem`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sem.html#pandas.DataFrame.sem .. _`set_axis`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_axis.html#pandas.DataFrame.set_axis .. 
_`set_index`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html#pandas.DataFrame.set_index .. _`set_value`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_value.html#pandas.DataFrame.set_value .. _`shape`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shape.html#pandas.DataFrame.shape .. _`shift`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html#pandas.DataFrame.shift .. _`size`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.size.html#pandas.DataFrame.size .. _`skew`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.skew.html#pandas.DataFrame.skew .. _`sort_index`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_index.html#pandas.DataFrame.sort_index .. _`sort_values`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html#pandas.DataFrame.sort_values .. _`sparse`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.html#pandas-dataframe-sparse .. _`squeeze`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.squeeze.html#pandas.DataFrame.squeeze .. _`stack`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html#pandas.DataFrame.stack .. _`std`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.std.html#pandas.DataFrame.std .. _`style`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.style.html#pandas.DataFrame.style .. _`sub`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sub.html#pandas.DataFrame.sub .. _`subtract`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.subtract.html#pandas.DataFrame.subtract .. 
_`sum`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sum.html#pandas.DataFrame.sum .. _`swapaxes`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.swapaxes.html#pandas.DataFrame.swapaxes .. _`swaplevel`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.swaplevel.html#pandas.DataFrame.swaplevel .. _`tail`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html#pandas.DataFrame.tail .. _`take`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.take.html#pandas.DataFrame.take .. _`to_clipboard`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_clipboard.html#pandas.DataFrame.to_clipboard .. _`to_csv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html#pandas.DataFrame.to_csv .. _`to_dict`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_dict.html#pandas.DataFrame.to_dict .. _`to_excel`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html#pandas.DataFrame.to_excel .. _`to_feather`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_feather.html#pandas.DataFrame.to_feather .. _`to_gbq`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html#pandas.DataFrame.to_gbq .. _`to_hdf`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_hdf.html#pandas.DataFrame.to_hdf .. _`to_html`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html#pandas.DataFrame.to_html .. _`to_json`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html#pandas.DataFrame.to_json .. _`to_xml`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_xml.html#pandas.DataFrame.to_xml .. 
_`to_latex`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_latex.html#pandas.DataFrame.to_latex .. _`to_orc`: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_orc.html#pandas.DataFrame.to_orc .. _`to_parquet`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet .. _`to_period`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_period.html#pandas.DataFrame.to_period .. _`to_pickle`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_pickle.html#pandas.DataFrame.to_pickle .. _`to_records`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_records.html#pandas.DataFrame.to_records .. _`to_sql`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html#pandas.DataFrame.to_sql .. _`to_stata`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_stata.html#pandas.DataFrame.to_stata .. _`to_string`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_string.html#pandas.DataFrame.to_string .. _`to_timestamp`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_timestamp.html#pandas.DataFrame.to_timestamp .. _`to_xarray`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_xarray.html#pandas.DataFrame.to_xarray .. _`transform`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transform.html#pandas.DataFrame.transform .. _`transpose`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transpose.html#pandas.DataFrame.transpose .. _`truediv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.truediv.html#pandas.DataFrame.truediv .. _`truncate`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.truncate.html#pandas.DataFrame.truncate .. 
_`tz_convert`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tz_convert.html#pandas.DataFrame.tz_convert .. _`tz_localize`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tz_localize.html#pandas.DataFrame.tz_localize .. _`unstack`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html#pandas.DataFrame.unstack .. _`update`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html#pandas.DataFrame.update .. _`value_counts`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.value_counts.html#pandas.DataFrame.value_counts .. _`values`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.values.html#pandas.DataFrame.values .. _`var`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.var.html#pandas.DataFrame.var .. _`where`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.where.html#pandas.DataFrame.where .. _`xs`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.xs.html#pandas.DataFrame.xs ================================================ FILE: docs/supported_apis/defaulting_to_pandas.rst ================================================ Defaulting to pandas ==================== Currently Modin does not support distributed execution for all methods from the pandas API. The remaining unimplemented methods are executed in a mode called "default to pandas". This allows users to continue using Modin even though their workloads contain functions not yet implemented in Modin. Here is a diagram of how we convert to pandas and perform the operation: .. image:: /img/convert_to_pandas.png :align: center We first convert to a pandas DataFrame, then perform the operation.
**There is a performance penalty for going from a partitioned Modin DataFrame to pandas because of the communication cost and single-threaded nature of pandas.** Once the pandas operation has completed, we convert the DataFrame back into a partitioned Modin DataFrame. This way, operations performed after something defaults to pandas will be optimized with Modin. The exact methods we have implemented are listed in the respective subsections: * :doc:`DataFrame </supported_apis/dataframe_supported>` * :doc:`Series </supported_apis/series_supported>` * :doc:`utilities </supported_apis/utilities_supported>` * :doc:`I/O </supported_apis/io_supported>` We have taken a community-driven approach to implementing new methods. We did a `study on pandas usage`_ to learn what the most-used APIs are. Modin currently supports **93%** of the pandas API based on our study of pandas usage, and we are actively expanding the API. **To request implementation, file an issue at https://github.com/modin-project/modin/issues or send an email to feature_requests@modin.org.** .. _`study on pandas usage`: https://github.com/modin-project/study_kaggle_usage ================================================ FILE: docs/supported_apis/index.rst ================================================ Supported APIs ============== For your convenience, we have compiled a list of currently implemented APIs and methods available in Modin. This documentation is updated as new methods and APIs are merged into the main branch, and is not necessarily correct as of the most recent release. To view the docs for the most recent release, check that you're viewing the `stable version`_ of the docs. In order to install the latest version of Modin, follow the directions found on the :doc:`installation page </getting_started/installation>`. Questions on implementation details ----------------------------------- If you have a question about the implementation details or would like more information about an API or method in Modin, please contact the Modin `developer mailing list`_. ..
toctree:: :titlesonly: :hidden: defaulting_to_pandas dataframe_supported series_supported utilities_supported io_supported older_pandas_compat .. meta:: :description lang=en: Compilation of implemented pandas APIs in Modin. .. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev .. _stable version: https://modin.readthedocs.io/en/stable/supported_apis/index.html ================================================ FILE: docs/supported_apis/io_supported.rst ================================================ ``pd.read_<file>`` and I/O APIs ================================= A number of IO methods default to pandas. We have parallelized ``read_csv``, ``read_parquet`` and some more (see table), though many of the remaining methods can be relatively easily parallelized. Some of the operations default to the pandas implementation, meaning they read the data serially as a single, non-distributed DataFrame and then distribute it. Performance will be affected by this. The following table is structured as follows: The first column contains the method name. The second column is a flag for whether or not there is an implementation in Modin for the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. .. note:: Support for fully asynchronous reading has been added for the following functions: ``read_csv``, ``read_fwf``, ``read_table``, ``read_custom_text``. This mode is disabled by default; it can be enabled by setting the ``MODIN_ASYNC_READ_MODE=True`` environment variable. Some parameter combinations are not supported; in those cases the function is executed in synchronous mode. +-------------------+---------------------------------+--------------------------------------------------------+ | IO method | Modin Implementation?
(Y/N/P/D) | Notes for Current implementation | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_csv`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_fwf`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_table`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_parquet`_ | P | Parameters besides ``filters`` and ``storage_options`` | | | | passed via ``**kwargs`` are not supported. | | | | ``use_nullable_dtypes`` == True is not supported. | | | | | | | | Experimental implementation: read_parquet_glob | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_json`_ | P | Implemented for ``lines=True`` | | | | Experimental implementation: read_json_glob | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_xml` | D | Experimental implementation: read_xml_glob | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_html`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_clipboard`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_excel`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_hdf`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_feather`_ | Y | | 
+-------------------+---------------------------------+--------------------------------------------------------+ | `read_stata`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_sas`_ | D | | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_pickle`_ | D | Experimental implementation: | | | | read_pickle_glob | +-------------------+---------------------------------+--------------------------------------------------------+ | `read_sql`_ | Y | | +-------------------+---------------------------------+--------------------------------------------------------+ .. _`read_csv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv .. _`read_fwf`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html#pandas.read_fwf .. _`read_table`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html#pandas.read_table .. _`read_parquet`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html#pandas.read_parquet .. _`read_json`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html#pandas.read_json .. _`read_html`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html#pandas.read_html .. _`read_clipboard`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_clipboard.html#pandas.read_clipboard .. _`read_excel`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html#pandas.read_excel .. _`read_hdf`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_hdf.html#pandas.read_hdf .. _`read_feather`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html#pandas.read_feather .. 
_`read_stata`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_stata.html#pandas.read_stata .. _`read_sas`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sas.html#pandas.read_sas .. _`read_pickle`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_pickle.html#pandas.read_pickle .. _`read_sql`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html#pandas.read_sql ================================================ FILE: docs/supported_apis/older_pandas_compat.rst ================================================ =================================== Pandas backwards compatibility mode =================================== Modin versions 0.16 and 0.17, but no later minor versions, had limited support for running with legacy pandas versions. The latest version of Modin no longer has such support. Motivation for compatibility mode --------------------------------- Modin aims to keep compatibility with the latest pandas release, hopefully catching up with each release within a few days. However, certain restrictions, such as the need to use Python 3.6, force some users to use older pandas (1.1.x for Python 3.6, specifically), which normally would mean they're bound to be using an ancient Modin as well. To overcome this, Modin has a special "compatibility mode" where some basic functionality works, but please note that the support is "best possible effort" (e.g. not all older bugs are worth fixing). 
Known issues with pandas 1.1.x ------------------------------ * ``pd.append()`` does not preserve the order of columns in older pandas while Modin does * ``.astype()`` produces different error type on incompatible dtypes * ``read_csv()`` does not support reading from ZIP file *with compression* in parallel mode * ``read_*`` do not support ``storage_options`` named argument * ``to_csv()`` does not support binary mode for output file * ``read_excel()`` does not support ``.xlsx`` files * ``read_fwf()`` has a bug with list of skiprows and non-None nrows: `pandas-dev#10261`_ * ``.agg(int-value)`` produces TypeError in older pandas but Modin raises AssertionError * ``Series.reset_index(drop=True)`` does not ignore ``name`` in older pandas while Modin ignores it * ``.sort_index(ascending=None)`` does not raise ValueError in older pandas while Modin raises it Please keep in mind that there are probably more issues which are not yet discovered! .. _`pandas-dev#10261`: https://github.com/pandas-dev/pandas/issues/10261 ================================================ FILE: docs/supported_apis/series_supported.rst ================================================ ``pd.Series`` supported APIs ============================ The following table lists both implemented and not implemented methods. If you have need of an operation that is listed as not implemented, feel free to open an issue on the `GitHub repository`_, or give a thumbs up to already created issues. Contributions are also welcome! The following table is structured as follows: The first column contains the method name. The second column is a flag for whether or not there is an implementation in Modin for the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. To learn more about the implementations that default to pandas, see the related section on :doc:`Defaulting to pandas </supported_apis/defaulting_to_pandas>`. 
+-----------------------------+---------------------------------+----------------------------------------------------+ | Series method | Modin Implementation? (Y/N/P/D) | Notes for Current implementation | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``abs`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``add`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``add_prefix`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``agg`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``aggregate`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``align`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``all`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``any`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``apply`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``argmax`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``argmin`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ 
| ``argsort`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``array`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``asfreq`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``asobject`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``asof`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``astype`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``at`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``at_time`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``autocorr`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``axes`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``base`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``between`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``between_time`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``bfill`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``bool`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``cat`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``clip`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``combine`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``combine_first`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``compare`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``compress`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``copy`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``corr`` | Y | Correlation floating point precision may slightly | | | | differ from pandas. For now pearson method is | | | | available only. For other methods defaults to | | | | pandas. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``count`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cov`` | Y | Covariance floating point precision may slightly | | | | differ from pandas. 
| +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cummax`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cummin`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cumprod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``cumsum`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``data`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``describe`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``diff`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``div`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``divide`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``divmod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dot`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``drop`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``drop_duplicates`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``droplevel`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``dropna`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dt`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dtype`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``dtypes`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``duplicated`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``empty`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``eq`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``equals`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ewm`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``expanding`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``explode`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``factorize`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ffill`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``fillna`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``filter`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``first`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``first_valid_index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``flags`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``floordiv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``from_array`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ftype`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ge`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_dtype_counts`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_ftype_counts`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_value`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_values`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | 
``groupby`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``gt`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``hasnans`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``head`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``hist`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``iat`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``idxmax`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``idxmin`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``iloc`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``imag`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``infer_objects`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``interpolate`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``is_monotonic_decreasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | 
``is_monotonic_increasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``is_unique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``isin`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``isna`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``isnull`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``item`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``items`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``itemsize`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``keys`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``kurt`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``kurtosis`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``last`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``last_valid_index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``le`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``loc`` | 
Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``lt`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``map`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mask`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``max`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mean`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``median`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``memory_usage`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``min`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mod`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mode`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``mul`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``multiply`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``name`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``nbytes`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ndim`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ne`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nlargest`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nonzero`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``notna`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``notnull`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nsmallest`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nunique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pct_change`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pipe`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``plot`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pop`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``pow`` | Y | See ``add`` | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``prod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``product`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ptp`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``put`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``quantile`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``radd`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rank`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``ravel`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rdiv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rdivmod`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``real`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``reindex`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``reindex_like`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rename`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``rename_axis`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``reorder_levels`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``repeat`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``replace`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``resample`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``reset_index`` | P | **Ray** and **Dask**: ``D`` when ``names`` or | | | | ``allow_duplicates`` is non-default | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rfloordiv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rmod`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rmul`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rolling`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``round`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rpow`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``rsub`` | Y | See ``add`` | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``rtruediv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sample`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``searchsorted`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sem`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``set_axis`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``set_value`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``shape`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``shift`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``size`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``skew`` | P | Modin defaults to pandas if given the ``level`` | | | | param. 
| +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sort_index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sort_values`` | Y | Order of indexes that have the same sort key | | | | is not guaranteed to be the same across sorts; | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sparse`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``squeeze`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``std`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``str`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``strides`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sub`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``subtract`` | Y | See ``add``; | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``sum`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``swapaxes`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``swaplevel`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | 
``tail`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_clipboard`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_csv`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_dict`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_excel`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_frame`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_hdf`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_json`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_latex`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_list`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_numpy`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_period`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_pickle`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_sql`` | Y | | 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_string`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_timestamp`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``to_xarray`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``tolist`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``transform`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``transpose`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``truediv`` | Y | See ``add`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``truncate`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``tz_convert`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``tz_localize`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``unique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``unstack`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``update`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``valid`` | D | 
| +-----------------------------+---------------------------------+----------------------------------------------------+ | ``value_counts`` | Y | The indices order of resulting object may differ | | | | from pandas. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``values`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | Modin defaults to pandas if given the ``level`` | | | | param. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``view`` | D | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``where`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ .. _`GitHub repository`: https://github.com/modin-project/modin/issues ================================================ FILE: docs/supported_apis/utilities_supported.rst ================================================ pandas Utilities Supported ========================== If you run ``import modin.pandas as pd``, the following operations are available from ``pd.``, e.g. ``pd.concat``. If you do not see an operation that pandas enables and would like to request it, feel free to `open an issue`_. Make sure you tell us your primary use-case so we can make it happen faster! The following table is structured as follows: The first column contains the method name. The second column is a flag for whether or not there is an implementation in Modin for the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for default to pandas. 
+---------------------------+---------------------------------+----------------------------------------------------+ | Utility method | Modin Implementation? (Y/N/P/D) | Notes for Current implementation | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.concat`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.eval`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.unique`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``pd.value_counts`` | Y | The indices order of resulting object may differ | | | | from pandas. | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.cut`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.to_numeric`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.factorize`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.from_dummies`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.qcut`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``pd.match`` | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.to_datetime`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.get_dummies`_ | Y | | 
+---------------------------+---------------------------------+----------------------------------------------------+ | `pd.date_range`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.bdate_range`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.to_timedelta`_ | D | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``pd.options`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ Other objects & structures -------------------------- This list is a list of objects not currently distributed by Modin. All of these objects are compatible with the distributed components of Modin. If you are interested in contributing a distributed version of any of these objects, feel free to open a `pull request`_. * Panel * Index * MultiIndex * CategoricalIndex * DatetimeIndex * Timedelta * Timestamp * NaT * PeriodIndex * Categorical * Interval * UInt8Dtype * UInt16Dtype * UInt32Dtype * UInt64Dtype * SparseDtype * Int8Dtype * Int16Dtype * Int32Dtype * Int64Dtype * CategoricalDtype * DatetimeTZDtype * IntervalDtype * PeriodDtype * RangeIndex * TimedeltaIndex * IntervalIndex * IndexSlice * TimeGrouper * Grouper * array * Period * DateOffset * ExcelWriter * SparseArray .. _open an issue: https://github.com/modin-project/modin/issues .. _pull request: https://github.com/modin-project/modin/pulls .. _`pd.concat`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html#pandas.concat .. _`pd.eval`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.eval.html#pandas.eval .. _`pd.unique`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.unique.html#pandas.unique .. 
_`pd.cut`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html#pandas.cut .. _`pd.to_numeric`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html#pandas.to_numeric .. _`pd.factorize`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.factorize.html#pandas.factorize .. _`pd.from_dummies`: https://pandas.pydata.org/docs/reference/api/pandas.from_dummies.html#pandas-from-dummies .. _`pd.qcut`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut .. _`pd.to_datetime`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html#pandas.to_datetime .. _`pd.get_dummies`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html#pandas.get_dummies .. _`pd.date_range`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html#pandas.date_range .. _`pd.bdate_range`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.bdate_range.html#pandas.bdate_range .. _`pd.to_timedelta`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_timedelta.html#pandas.to_timedelta ================================================ FILE: docs/usage_guide/advanced_usage/batch.rst ================================================ Batch Pipeline API Usage Guide ============================== Modin provides an experimental batching feature that pipelines row-parallel queries. This feature is currently only supported for the ``PandasOnRay`` engine. Please note that this feature is experimental and behavior or interfaces could be changed. Usage examples -------------- In the examples below we build and run some pipelines. It is important to note that the queries passed to the pipeline operate on Modin DataFrame partitions, which are backed by ``pandas``. When using ``pandas`` module-level functions, please make sure to import and use ``pandas`` rather than ``modin.pandas``.
Simple Batch Pipelining ^^^^^^^^^^^^^^^^^^^^^^^ This example walks through a simple batch pipeline in order to familiarize the user with the API. .. code-block:: python from modin.experimental.batch import PandasQueryPipeline import modin.pandas as pd import numpy as np df = pd.DataFrame( np.random.randint(0, 100, (100, 100)), columns=[f"col {i}" for i in range(1, 101)], ) # Build the dataframe we will pipeline. pipeline = PandasQueryPipeline(df) # Build the pipeline. pipeline.add_query(lambda df: df + 1, is_output=True) # Add the first query and specify that # it is an output query. pipeline.add_query( lambda df: df.rename(columns={f"col {i}":f"col {i-1}" for i in range(1, 101)}) ) # Add a second query. pipeline.add_query( lambda df: df.drop(columns=['col 99']), is_output=True, ) # Add a third query and specify that it is an output query. new_df = pd.DataFrame( np.ones((100, 100)), columns=[f"col {i}" for i in range(1, 101)], ) # Build a second dataframe that we will pipeline now instead. pipeline.update_df(new_df) # Update the dataframe that we will pipeline to be `new_df` # instead of `df`. result_dfs = pipeline.compute_batch() # Begin batch processing. # Print pipeline results print(f"Result of Query 1:\n{result_dfs[0]}") print(f"Result of Query 2:\n{result_dfs[1]}") # Output IDs can also be specified pipeline = PandasQueryPipeline(df) # Build the pipeline. pipeline.add_query( lambda df: df + 1, is_output=True, output_id=1, ) # Add the first query, specify that it is an output query, as well as specify an output id. pipeline.add_query( lambda df: df.rename(columns={f"col {i}":f"col {i-1}" for i in range(1, 101)}) ) # Add a second query. pipeline.add_query( lambda df: df.drop(columns=['col 99']), is_output=True, output_id=2, ) # Add a third query, specify that it is an output query, and specify an output_id. result_dfs = pipeline.compute_batch() # Begin batch processing. 
# Print pipeline results - should be a dictionary mapping Output IDs to resulting dataframes: print(f"Mapping of Output ID to dataframe:\n{result_dfs}") # Print results for query_id, res_df in result_dfs.items(): print(f"Query {query_id} resulted in\n{res_df}") Batch Pipelining with Postprocessing ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A postprocessing function can also be provided when calling ``pipeline.compute_batch``. The example below runs a similar pipeline as above, but the postprocessing function writes the output dfs to a parquet file. .. code-block:: python from modin.experimental.batch import PandasQueryPipeline import modin.pandas as pd import numpy as np import os import shutil df = pd.DataFrame( np.random.randint(0, 100, (100, 100)), columns=[f"col {i}" for i in range(1, 101)], ) # Build the dataframe we will pipeline. pipeline = PandasQueryPipeline(df) # Build the pipeline. pipeline.add_query( lambda df: df + 1, is_output=True, output_id=1, ) # Add the first query, specify that it is an output query, as well as specify an output id. pipeline.add_query( lambda df: df.rename(columns={f"col {i}":f"col {i-1}" for i in range(1, 101)}) ) # Add a second query. pipeline.add_query( lambda df: df.drop(columns=['col 99']), is_output=True, output_id=2, ) # Add a third query, specify that it is an output query, and specify an output_id. def postprocessing_func(df, output_id, partition_id): filepath = f"query_{output_id}/" os.makedirs(filepath, exist_ok=True) filepath += f"part-{partition_id:04d}.parquet" df.to_parquet(filepath) return df result_dfs = pipeline.compute_batch( postprocessor=postprocessing_func, pass_partition_id=True, pass_output_id=True, ) # Begin computation, pass in a postprocessing function, and specify that partition ID and # output ID should be passed to that postprocessing function. print(os.system("ls query_1/")) # Should show `NPartitions.get()` parquet files - which # correspond to partitions of the output of query 1. 
print(os.system("ls query_2/")) # Should show `NPartitions.get()` parquet files - which # correspond to partitions of the output of query 2. for query_id, res_df in result_dfs.items(): written_df = pd.read_parquet(f"query_{query_id}/") shutil.rmtree(f"query_{query_id}/") # Clean up print(f"Written and Computed DF are " + f"{'equal' if res_df.equals(written_df) else 'not equal'} for query {query_id}") Batch Pipelining with Fan Out ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If the input dataframe to a query is small (consisting of only one partition), it is possible to induce additional parallelism using the ``fan_out`` argument. The ``fan_out`` argument replicates the input partition, applies the query to each replica, and then coalesces all of the replicas back to one partition using the ``reduce_fn`` that must also be specified when ``fan_out`` is ``True``. It is possible to control the parallelism via the ``num_partitions`` parameter passed to the constructor of the ``PandasQueryPipeline``. This parameter designates the desired number of partitions, and defaults to ``NPartitions.get()`` when not specified. During fan out, the input partition is replicated ``num_partitions`` times. In the previous examples, ``num_partitions`` was not specified, and so defaulted to ``NPartitions.get()``. The example below demonstrates the usage of ``fan_out`` and ``num_partitions``. We first demonstrate an example of a function that would benefit from this computation pattern: .. 
code-block:: python import glob from PIL import Image import torchvision.transforms as T import torchvision transforms = T.Compose([T.ToTensor()]) model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True) model.eval() COCO_INSTANCE_CATEGORY_NAMES = [ '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' ] def contains_cat(image, model): image = transforms(image) labels = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in model([image])[0]['labels']] return 'cat' in labels def serial_query(df): """ This function takes as input a dataframe with a single row corresponding to a folder containing images to parse. Each image in the folder is passed through a neural network that detects whether it contains a cat, in serial, and a new column is computed for the dataframe that counts the number of images containing cats. Parameters ---------- df : a dataframe The dataframe to process Returns ------- The same dataframe as before, with an additional column containing the count of images containing cats. 
""" model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True) model.eval() img_folder = df['images'][0] images = sorted(glob.glob(f"{img_folder}/*.jpg")) cats = 0 for img in images: cats = cats + 1 if contains_cat(Image.open(img), model) else cats df['cat_count'] = cats return df To download the image files to test out this code, run the following bash script, which downloads the images from the fast-ai-coco S3 bucket to a folder called ``images`` in your current working directory: .. code-block:: shell aws s3 cp s3://fast-ai-coco/coco_tiny.tgz . --no-sign-request; tar -xf coco_tiny.tgz; mkdir \ images; mv coco_tiny/train/* images/; rm -rf coco_tiny; rm -rf coco_tiny.tgz We can pipeline that code like so: .. code-block:: python import modin.pandas as pd from modin.experimental.batch import PandasQueryPipeline from time import time df = pd.DataFrame([['images']], columns=['images']) pipeline = PandasQueryPipeline(df) pipeline.add_query(serial_query, is_output=True) serial_start = time() df_with_cat_count = pipeline.compute_batch()[0] serial_end = time() print(f"Result of pipeline:\n{df_with_cat_count}") We can induce `8x` parallelism into the pipeline above by combining the ``fan_out`` and ``num_partitions`` parameters like so: .. code-block:: python import modin.pandas as pd from modin.experimental.batch import PandasQueryPipeline import shutil from time import time df = pd.DataFrame([['images']], columns=['images']) desired_num_partitions = 8 def parallel_query(df, partition_id): """ This function takes as input a dataframe with a single row corresponding to a folder containing images to parse. It parses `total_images/desired_num_partitions` images every time it is called. A new column is computed for the dataframe that counts the number of images containing cats. Parameters ---------- df : a dataframe The dataframe to process partition_id : int The partition id of the dataframe that this function runs on. 
Returns ------- The same dataframe as before, with an additional column containing the count of images containing cats. """ model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True) model.eval() img_folder = df['images'][0] images = sorted(glob.glob(f"{img_folder}/*.jpg")) total_images = len(images) cats = 0 start_index = partition_id * (total_images // desired_num_partitions) if partition_id == desired_num_partitions - 1: # Last partition must parse to end of list images = images[start_index:] else: end_index = (partition_id + 1) * (total_images // desired_num_partitions) images = images[start_index:end_index] for img in images: cats = cats + 1 if contains_cat(Image.open(img), model) else cats df['cat_count'] = cats return df def reduce_fn(dfs): """ Coalesce the results of fanning out the `parallel_query` query. Parameters ---------- dfs : a list of dataframes The resulting dataframes from fanning out `parallel_query` Returns ------- A new dataframe whose `cat_count` column is the sum of the `cat_count` column of all dataframes in `dfs` """ df = dfs[0] cat_count = df['cat_count'][0] for dataframe in dfs[1:]: cat_count += dataframe['cat_count'][0] df['cat_count'] = cat_count return df pipeline = PandasQueryPipeline(df, desired_num_partitions) pipeline.add_query( parallel_query, fan_out=True, reduce_fn=reduce_fn, is_output=True, pass_partition_id=True ) parallel_start = time() df_with_cat_count = pipeline.compute_batch()[0] parallel_end = time() print(f"Result of pipeline:\n{df_with_cat_count}") print(f"Total Time in Serial: {serial_end - serial_start}") print(f"Total time with induced parallelism: {parallel_end - parallel_start}") shutil.rmtree("images/") # Clean up Batch Pipelining with Dynamic Repartitioning ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Similarly, it is also possible to hint to the Pipeline API to repartition after a node completes computation. 
This is currently only supported if the input dataframe consists of only one partition. The number of partitions after repartitioning is controlled by the ``num_partitions`` parameter passed to the constructor of the ``PandasQueryPipeline``. The following example demonstrates how to use the ``repartition_after`` parameter. .. code-block:: python import modin.pandas as pd from modin.experimental.batch import PandasQueryPipeline import numpy as np small_df = pd.DataFrame([[1, 2, 3]]) # Create a small dataframe def increase_dataframe_size(df): import pandas new_df = pandas.concat([df] * 1000) new_df = new_df.reset_index(drop=True) # Get a new range index that isn't duplicated return new_df desired_num_partitions = 24 # We will repartition to 24 partitions def add_partition_id_to_df(df, partition_id): import pandas new_col = pandas.Series([partition_id]*len(df), name="partition_id", index=df.index) return pandas.concat([df, new_col], axis=1) pipeline = PandasQueryPipeline(small_df, desired_num_partitions) pipeline.add_query(increase_dataframe_size, repartition_after=True) pipeline.add_query(add_partition_id_to_df, pass_partition_id=True, is_output=True) result_df = pipeline.compute_batch()[0] print(f"Number of partitions passed to second query: " + f"{len(np.unique(result_df['partition_id'].values))}") print(f"Result of pipeline:\n{result_df}") ================================================ FILE: docs/usage_guide/advanced_usage/index.rst ================================================ Advanced Usage ============== .. toctree:: :titlesonly: :hidden: /flow/modin/distributed/dataframe/pandas spreadsheets_api progress_bar modin_xgboost modin_logging modin_metrics batch modin_engines .. meta:: :description lang=en: Description of Modin's advanced features. Modin aims to not only optimize pandas, but also provide a comprehensive, integrated toolkit for data scientists. 
We are actively developing data science tools such as DataFrame spreadsheet integration, DataFrame algebra, progress bars, SQL queries on DataFrames, and more. Join us on `Slack`_ for the latest updates! Modin engines ------------- Modin supports a series of execution engines such as Ray_, Dask_, `MPI through unidist`_, each of which might be a more beneficial choice for a specific scenario. When doing the first operation with Modin, it automatically initializes one of the engines to further perform distributed/parallel computation. If you are familiar with a concrete execution engine, it is possible to initialize the engine on your own and Modin will automatically attach to it. Refer to the :doc:`Modin engines <modin_engines>` page for more details. Additional APIs --------------- Modin also supports these additional APIs on top of pandas to improve user experience. - :py:meth:`~modin.pandas.DataFrame.modin.to_pandas` -- convert a Modin DataFrame/Series to a pandas DataFrame/Series. - :py:meth:`~modin.pandas.DataFrame.get_backend` -- Get the ``Backend`` :doc:`configuration variable </flow/modin/config>` of this ``DataFrame``. - :py:meth:`~modin.pandas.DataFrame.move_to` -- Move data and execution for this ``DataFrame`` to the given ``Backend`` :doc:`configuration variable </flow/modin/config>`. This method is an alias for ``DataFrame.set_backend``. - :py:meth:`~modin.pandas.DataFrame.set_backend` -- Move data and execution for this ``DataFrame`` to the given ``Backend`` :doc:`configuration variable </flow/modin/config>`. This method is an alias for ``DataFrame.move_to``. - :py:func:`~modin.pandas.io.from_pandas` -- convert a pandas DataFrame to a Modin DataFrame. - :py:meth:`~modin.pandas.DataFrame.modin.to_ray` -- convert a Modin DataFrame/Series to a Ray Dataset. - :py:func:`~modin.pandas.io.from_ray` -- convert a Ray Dataset to a Modin DataFrame. - :py:meth:`~modin.pandas.DataFrame.modin.to_dask` -- convert a Modin DataFrame/Series to a Dask DataFrame/Series.
- :py:func:`~modin.pandas.io.from_dask` -- convert a Dask DataFrame to a Modin DataFrame. - :py:func:`~modin.pandas.io.from_map` -- create a Modin DataFrame from a map function applied to an iterable object. - :py:func:`~modin.pandas.io.from_arrow` -- convert an Arrow Table to a Modin DataFrame. - :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory. - :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection. - :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file. - :py:func:`~modin.experimental.pandas.read_pickle_glob` -- read multiple pickle files in a directory. - :py:func:`~modin.experimental.pandas.read_parquet_glob` -- read multiple parquet files in a directory. - :py:func:`~modin.experimental.pandas.read_json_glob` -- read multiple json files in a directory. - :py:func:`~modin.experimental.pandas.read_xml_glob` -- read multiple xml files in a directory. - :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_glob` -- write to multiple pickle files in a directory. - :py:meth:`~modin.pandas.DataFrame.modin.to_parquet_glob` -- write to multiple parquet files in a directory. - :py:meth:`~modin.pandas.DataFrame.modin.to_json_glob` -- write to multiple json files in a directory. - :py:meth:`~modin.pandas.DataFrame.modin.to_xml_glob` -- write to multiple xml files in a directory. DataFrame partitioning API -------------------------- Modin DataFrame provides an API to directly access partitions: you can extract physical partitions from a :py:class:`~modin.pandas.dataframe.DataFrame`, modify their structure by reshuffling or applying some functions, and create a DataFrame from those modified partitions. Visit the :doc:`pandas partitioning API </flow/modin/distributed/dataframe/pandas>` documentation to learn more.
Modin Spreadsheet API --------------------- The Spreadsheet API for Modin allows you to render the dataframe as a spreadsheet to easily explore your data and perform operations on a graphical user interface. The API also includes features for recording the changes made to the dataframe and exporting them as reproducible code. Built on top of Modin and SlickGrid, the spreadsheet interface is able to provide interactive response times even at a scale of billions of rows. See our `Modin Spreadsheet API documentation`_ for more details. .. figure:: /img/modin_spreadsheet_mini_demo.gif :align: center :width: 650px :height: 350px Progress Bar ------------ Visual progress bar for Dataframe operations such as groupby and fillna, as well as for file reading operations such as read_csv. Built using the `tqdm`_ library and Ray execution engine. See `Progress Bar documentation`_ for more details. .. figure:: /img/progress_bar_example.png :align: center Dataframe Algebra ----------------- A minimal set of operators that can be composed to express any dataframe query for use in query planning and optimization. See our `paper`_ for more information, and full documentation is coming soon! Distributed XGBoost on Modin ---------------------------- Modin provides an implementation of `distributed XGBoost`_ machine learning algorithm on Modin DataFrames. See our :doc:`Distributed XGBoost on Modin documentation ` for details about installation and usage, as well as :doc:`Modin XGBoost architecture documentation ` for information about implementation and internal execution flow. Logging with Modin ------------------ Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata, and system memory. Logging is disabled by default, but when it is enabled, log files are written to a local `.modin` directory at the same directory level as the notebook/script used to run Modin. 
See our :doc:`Logging with Modin documentation ` for usage information. Batch Pipeline API ------------------ Modin provides an experimental batched API that pipelines row parallel queries. See our :doc:`Batch Pipeline API Usage Guide ` for a walkthrough on how to use this feature, as well as :doc:`Batch Pipeline API documentation ` for more information about the API. Fuzzydata Testing ----------------- An experimental GitHub Action on pull request has been added to Modin, which automatically runs the Modin codebase against `fuzzydata`, a random dataframe workflow generator. The resulting workflow that was used to test Modin codebase can be downloaded as an artifact from the GitHub Actions tab for further inspection. See `fuzzydata`_ for more details. .. _`Modin Spreadsheet API documentation`: spreadsheets_api.html .. _`Progress Bar documentation`: progress_bar.html .. _`Paper`: https://arxiv.org/pdf/2001.00888.pdf .. _`Slack`: https://modin.org/slack.html .. _`tqdm`: https://github.com/tqdm/tqdm .. _`distributed XGBoost`: https://medium.com/intel-analytics-software/distributed-xgboost-with-modin-on-ray-fc17edef7720 .. _`fuzzydata`: https://github.com/suhailrehman/fuzzydata .. _Ray: https://github.com/ray-project/ray .. _Dask: https://github.com/dask/distributed .. _`MPI through unidist`: https://github.com/modin-project/unidist ================================================ FILE: docs/usage_guide/advanced_usage/modin_engines.rst ================================================ Modin engines ============= As a rule, you don't have to worry about initialization of an execution engine as Modin itself automatically initializes one when performing the first operation. Also, Modin has a broad range of :doc:`configuration settings `, which you can use to configure an execution engine. If there is a reason to initialize an execution engine on your own and you are sure what to do, Modin will automatically attach to whichever engine is available. 
Below, you can find some examples on how to initialize a specific execution engine on your own. Ray --- You can initialize Ray engine with a specific number of CPUs (worker processes) to perform computation. .. code-block:: python import ray import modin.config as modin_cfg ray.init(num_cpus=) modin_cfg.Engine.put("ray") # Modin will use Ray engine modin_cfg.CpuCount.put() To get more details on all possible parameters for initialization refer to `Ray documentation`_. Dask ---- You can initialize Dask engine with a specific number of worker processes and threads per worker to perform computation. .. code-block:: python from distributed import Client import modin.config as modin_cfg client = Client(n_workers=, threads_per_worker=) modin_cfg.Engine.put("dask") # # Modin will use Dask engine modin_cfg.CpuCount.put() To get more details on all possible parameters for initialization refer to `Dask Distributed documentation`_. MPI through unidist ------------------- You can initialize MPI through unidist engine with a specific number of CPUs (worker processes) to perform computation. .. code-block:: python import unidist import unidist.config as unidist_cfg import modin.config as modin_cfg unidist_cfg.Backend.put("mpi") unidist_cfg.CpuCount.put() unidist.init() modin_cfg.Engine.put("unidist") # # Modin will use MPI through unidist engine modin_cfg.CpuCount.put() To get more details on all possible parameters for initialization refer to `unidist documentation`_. .. _`Ray documentation`: https://docs.ray.io/en/latest .. _Dask Distributed documentation: https://distributed.dask.org/en/latest .. 
_`unidist documentation`: https://unidist.readthedocs.io/en/latest ================================================ FILE: docs/usage_guide/advanced_usage/modin_logging.rst ================================================ Modin Logging ============= Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata, and profiling system memory. When Modin logging is enabled (default disabled), log files are written to a local ``.modin`` directory at the same directory level as the notebook/script used to run Modin. The logs generated by Modin Logging will be written to a ``.modin/logs/job_`` directory, uniquely named after the job uuid. The logs that contain the Modin API stack traces are named ``trace.log``. The logs that contain the memory utilization metrics are named ``memory.log``. By default, if any log file exceeds 10MB (configurable with ``LogFileSize``), that file will be saved and a separate log file will be created. For instance, if users have 20MB worth of Modin API logs, they can expect to find ``trace.log.1`` and ``trace.log.2`` in the ``.modin/logs/job_`` directory. After ``10 * LogFileSize`` MB or by default 100MB of logs, the logs will rollover and the original log files beginning with ``trace.log.1`` will be overwritten with the new log lines. **Developer Warning:** In some cases, running services like JupyterLab in the ``modin/modin`` directory may result in circular dependency issues. This is due to a naming conflict between the ``modin/logging`` directory and the Python ``logging`` module, which may be used as a default in such environments. To resolve this, please run Jupyterlab or other similar services from directories other than ``modin/modin``. Usage examples -------------- In the example below, we enable logging for internal Modin API calls, partition metadata and memory profiling. 
We can set the granularity (in seconds) at which the system memory utilization is logged using ``LogMemoryInterval``. We can also set the maximum size of the logs (in MBs) using ``LogFileSize``. .. code-block:: python import modin.pandas as pd from modin.config import LogMode, LogMemoryInterval, LogFileSize LogMode.enable() LogMemoryInterval.put(2) # Defaults to 5 seconds, new interval is 2 seconds LogFileSize.put(5) # Defaults to 10 MB per log file, new size is 5 MB # User code goes here Disable Modin logging like so: .. code-block:: python import modin.pandas as pd from modin.config import LogMode LogMode.disable() # User code goes here In Modin the lower-level functionality is logged in debug level, and higher level functionality in info level. By default when logging is enabled in Modin, both high level and low level functionality are logged. The below example script could be used to switch between logging all functions vs only logging higher level functions. Setting logger level to ``logging.INFO`` logs only higher level functions. .. code-block:: python import modin.pandas as pd from modin.logging.config import get_logger from modin.config import LogMode import logging LogMode.enable() logger = get_logger() logger.setLevel(logging.INFO) # Replace with logger.setLevel(logging.DEBUG) for lower level logs df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) df = pd.concat([df, df]) Debugging from user defined functions: .. warning:: When attempting to use Modin logging in user defined functions that execute in workers for logging lower-level operators as in example below, multiple log directories ``.modin/logs/job_**`` would be created for each worker executing the UDF. .. 
code-block:: python import modin.pandas as pd def udf(x): from modin.config import LogMode LogMode.enable() return x + 1 modin_df = pd.DataFrame([0, 1, 2, 3]) print(modin_df.map(udf)) So the **recommended** approach would be to use a different logger as in the below snippet to log from user defined functions that execute on workers. Below is an example to log from UDF. For this the logger config has to be specified inside the UDF that would execute on a remote worker. .. code-block:: python import logging import modin.pandas as pd def udf(x): logging.basicConfig(filename='modin_udf.log', level=logging.INFO) logging.info("This log message will be written to modin_udf.log ") # User code goes here return x + 1 modin_df = pd.DataFrame([0, 1, 2, 3]) print(modin_df.map(udf)) ================================================ FILE: docs/usage_guide/advanced_usage/modin_metrics.rst ================================================ Modin Metrics ============= Modin allows for third-party systems to register a metrics handler to collect specific API statistics. Metrics have a name and a value, can be aggregated, discarded, or emitted without impact to the program. CPU load, memory usage, and disk usage are all typical metrics; but modin currently only emits metrics on API timings which can be used to optimize end-user interactive performance. New metrics may be added in the future. It is the responsibility of the handler to process or forward these metrics. The name of the metric will be in "dot format" and all lowercase, similar to graphite or rrd. The value is an integer or float. 
Example metric names include: * 'modin.core-dataframe.pandasdataframe.copy_index_cache' * 'modin.core-dataframe.pandasdataframe.transpose' * 'modin.query-compiler.pandasquerycompiler.transpose' * 'modin.query-compiler.basequerycompiler.columnarize' * 'modin.pandas-api.series.__init__' * 'modin.pandas-api.dataframe._reduce_dimension' * 'modin.pandas-api.dataframe.sum' Handlers are functions of the form: `fn(str, int|float)` and can be registered with: .. code-block:: python import modin.pandas as pd from modin.logging.metrics import add_metric_handler def func(name: str, value: int | float): print(f"Got metric {name} value {value}") add_metric_handler(func) .. warning:: A metric handler should be non-blocking, returning within 100ms, although this is not enforced. It must not throw exceptions or it will be deregistered. These restrictions are to help guard against the implementation of a metrics collector which would impact interactive performance significantly. The data from metrics should generally be offloaded to another system for processing and not involve any blocking network calls. Metrics are enabled by default. Modin metrics can be disabled like so: .. code-block:: python import modin.pandas as pd from modin.config import MetricsMode MetricsMode.disable() ================================================ FILE: docs/usage_guide/advanced_usage/modin_xgboost.rst ================================================ Distributed XGBoost on Modin ============================ Modin provides an implementation of `distributed XGBoost`_ machine learning algorithm on Modin DataFrames. Please note that this feature is experimental and behavior or interfaces could be changed. Install XGBoost on Modin ------------------------ Modin comes with all the dependencies except ``xgboost`` package by default. 
Currently, distributed XGBoost on Modin is only supported on the Ray execution engine, therefore, see the :doc:`installation page ` for more information on installing Modin with the Ray engine. To install ``xgboost`` package you can use ``pip``: .. code-block:: bash pip install xgboost XGBoost Train and Predict ------------------------- Distributed XGBoost functionality is placed in ``modin.experimental.xgboost`` module. ``modin.experimental.xgboost`` provides a drop-in replacement API for ``train`` and ``Booster.predict`` xgboost functions. .. automodule:: modin.experimental.xgboost :noindex: :members: train .. autoclass:: modin.experimental.xgboost.Booster :noindex: :members: predict ModinDMatrix ------------ Data is passed to ``modin.experimental.xgboost`` functions via a Modin ``DMatrix`` object. .. automodule:: modin.experimental.xgboost :noindex: :members: DMatrix Currently, the Modin ``DMatrix`` supports ``modin.pandas.DataFrame`` only as an input. A Single Node / Cluster setup ----------------------------- The XGBoost part of Modin uses Ray resources in the same way as all other Modin functions. To start the Ray runtime on a single node: .. code-block:: python import ray # Look at the Ray documentation with respect to the Ray configuration suited to you most. ray.init() If you already have a Ray cluster, you can connect to it as follows: .. code-block:: python import ray ray.init(address='auto') Detailed information about initializing the Ray runtime can be found on the `starting ray`_ page. Usage example ------------- In the example below, we train an XGBoost model using `the Iris Dataset`_ and get predictions on the same data. All processing will be in a `single node` mode. .. code-block:: python from sklearn import datasets import ray # Look at the Ray documentation with respect to the Ray configuration suited to you most. 
ray.init() # Start the Ray runtime for single-node import modin.pandas as pd import modin.experimental.xgboost as xgb # Load iris dataset from sklearn iris = datasets.load_iris() # Create Modin DataFrames X = pd.DataFrame(iris.data) y = pd.DataFrame(iris.target) # Create DMatrix dtrain = xgb.DMatrix(X, y) dtest = xgb.DMatrix(X, y) # Set training parameters xgb_params = { "eta": 0.3, "max_depth": 3, "objective": "multi:softprob", "num_class": 3, "eval_metric": "mlogloss", } steps = 20 # Create dict for evaluation results evals_result = dict() # Run training model = xgb.train( xgb_params, dtrain, steps, evals=[(dtrain, "train")], evals_result=evals_result ) # Print evaluation results print(f'Evals results:\n{evals_result}') # Predict results prediction = model.predict(dtest) # Print prediction results print(f'Prediction results:\n{prediction}') .. _Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html .. _`starting ray`: https://docs.ray.io/en/master/starting-ray.html .. _`the Iris Dataset`: https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html .. _`distributed XGBoost`: https://medium.com/intel-analytics-software/distributed-xgboost-with-modin-on-ray-fc17edef7720 ================================================ FILE: docs/usage_guide/advanced_usage/progress_bar.rst ================================================ Progress Bar ============ The progress bar allows users to see the estimated progress and completion time of each line they run, in environments such as a shell or Jupyter notebook. .. figure:: /img/progress_bar.gif :align: center Quickstart """""""""" The progress bar uses the `tqdm` library to visualize displays: .. code-block:: bash pip install tqdm Import the progress bar into your notebook by running the following: .. 
code-block:: python from modin.config import ProgressBar ProgressBar.enable() ================================================ FILE: docs/usage_guide/advanced_usage/spreadsheets_api.rst ================================================ Modin Spreadsheets API ====================== Getting started --------------- Install Modin-spreadsheet using pip: .. code-block:: bash pip install "modin[spreadsheet]" The following code snippet creates a spreadsheet using the FiveThirtyEight dataset on labor force information by college majors (licensed under CC BY 4.0): .. code-block:: python import modin.pandas as pd import modin.experimental.spreadsheet as mss df = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv') spreadsheet = mss.from_dataframe(df) spreadsheet .. figure:: /img/modin_spreadsheets_installation.png :align: center Basic Manipulations through User Interface ------------------------------------------ The Spreadsheet API allows users to manipulate the DataFrame with simple graphical controls for sorting, filtering, and editing. Here are the instructions for each operation: * **Sort**: Click on the column header of the column to sort on. * **Filter**: Click on the filter button on the column header and apply the desired filter to the column. The filter dropdown changes depending on the type of the column. Multiple filters are automatically combined. * **Edit Cell**: Double click on a cell and enter the new value. * **Add Rows**: Click on the “Add Row” button in the toolbar to duplicate the last row in the DataFrame. The duplicated values provide a convenient default and can be edited as necessary. * **Remove Rows**: Select row(s) and click the “Remove Row” button. Select a single row by clicking on it. Multiple rows can be selected with Cmd+Click (Windows: Ctrl+Click) on the desired rows or with Shift+Click to specify a range of rows. 
Some of these operations can also be done through the spreadsheet’s programmatic interface. Sorts and filters can be reset using the toolbar buttons. Edits and adding/removing rows can only be undone manually. Virtual Rendering ----------------- The spreadsheet will only render data based on the user’s viewport. This allows for quick rendering even on very large DataFrames because only a handful of rows are loaded at any given time. As a result, scrolling and viewing your data is smooth and responsive. Transformation History and Exporting Code ----------------------------------------- All operations on the spreadsheet are recorded and are easily exported as code for sharing or reproducibility. This history is automatically displayed in the history cell, which is generated below the spreadsheet whenever the spreadsheet widget is displayed. The history cell is displayed on default, but this can be turned off. Modin Spreadsheet API provides a few methods for interacting with the history: * `SpreadsheetWidget.get_history()`: Retrieves the transformation history in the form of reproducible code. * `SpreadsheetWidget.filter_relevant_history(persist=True)`: Returns the transformation history that contains only code relevant to the final state of the spreadsheet. The `persist` parameter determines whether the internal state and the displayed history is also filtered. * `SpreadsheetWidget.reset_history()`: Clears the history of transformation. Customizable Interface ---------------------- The spreadsheet widget provides a number of options that allows the user to change the appearance and the interactivity of the spreadsheet. Options include: * Row height/Column width * Preventing edits, sorts, or filters on the whole spreadsheet or on a per-column basis * Hiding the toolbar and history cell * Float precision * Highlighting of cells and rows * Viewport size Converting Spreadsheets To and From Dataframes ---------------------------------------------- .. 
automodule:: modin.experimental.spreadsheet.general :noindex: :members: from_dataframe .. automodule:: modin.experimental.spreadsheet.general :noindex: :members: to_dataframe Further API Documentation ------------------------- .. automodule:: modin_spreadsheet.grid :noindex: :members: SpreadsheetWidget ================================================ FILE: docs/usage_guide/benchmarking.rst ================================================ Benchmarking Modin ================== Summary ------- To benchmark a single Modin function, often turning on the :doc:`configuration variable ` :code:`BenchmarkMode` will suffice. There is no simple way to benchmark more complex Modin workflows, though benchmark mode or calling ``modin.utils.execute`` on Modin objects may be useful. The :doc:`Modin logs ` may help you identify bottlenecks in your code, and they may also help profile the execution of each Modin function. Modin's execution and benchmark mode ------------------------------------ Most of Modin's execution happens asynchronously, i.e. in separate processes that run independently of the main program flow. Some execution is also lazy, meaning that it doesn't start immediately once the user calls a Modin function. While Modin provides the same API as pandas, lazy and asynchronous execution can often make it hard to tell how much time each Modin function call takes, as well as to compare Modin's performance to pandas and other similar libraries. .. note:: All examples in this doc use the system specified at the bottom of this page. Consider the following ipython script: .. code-block:: python import modin.pandas as pd from modin.config import MinRowPartitionSize import time import ray # Look at the Ray documentation with respect to the Ray configuration suited to you most. 
ray.init() df = pd.DataFrame(list(range(MinRowPartitionSize.get() * 2))) %time result = df.map(lambda x: time.sleep(0.1) or x) %time print(result) Modin takes just 2.68 milliseconds for the ``map``, and 3.78 seconds to print the result. However, if we run this script in pandas by replacing :code:`import modin.pandas as pd` with :code:`import pandas as pd`, the ``map`` takes 6.63 seconds, and printing the result takes just 5.53 milliseconds. Both pandas and Modin start executing the ``map`` as soon as the interpreter evaluates it. While pandas blocks until the ``map`` has finished, Modin just kicks off asynchronous functions in remote ray processes. Printing the function result is fairly fast in pandas and Modin, but before Modin can print the data, it has to wait until all the remote functions complete. To time how long Modin takes for a single operation, you should typically use benchmark mode. Benchmark mode will wait for all asynchronous remote execution to complete. You can turn benchmark mode on at any point as follows: .. code-block:: python from modin.config import BenchmarkMode BenchmarkMode.put(True) Rerunning the script above with benchmark mode on, the Modin ``map`` takes 3.59 seconds, and the ``print`` takes 183 milliseconds. These timings better reflect where Modin is spending its execution time. A caveat about benchmark mode ----------------------------- While benchmark mode is often good for measuring the performance of a single Modin function call, it can underestimate Modin's performance in cases where Modin's asynchronous execution improves Modin's performance. Consider the following script with benchmark mode on: .. 
code-block:: python import numpy as np import time import ray from io import BytesIO import modin.pandas as pd from modin.config import BenchmarkMode, MinRowPartitionSize BenchmarkMode.put(True) start = time.time() df = pd.DataFrame(list(range(MinRowPartitionSize.get())), columns=['A']) result1 = df.map(lambda x: time.sleep(0.2) or x + 1) result2 = df.map(lambda x: time.sleep(0.2) or x + 2) result1.to_parquet(BytesIO()) result2.to_parquet(BytesIO()) end = time.time() print(f'map and write to parquet took {end - start} seconds.') .. code-block::python The script does two slow ``map`` on a dataframe and then writes each result to a buffer. The whole script takes 13 seconds with benchmark mode on, but just 7 seconds with benchmark mode off. Because Modin can run the ``map`` asynchronously, it can start writing the first result to its buffer while it's still computing the second result. With benchmark mode on, Modin has to execute every function synchronously instead. How to benchmark complex workflows ---------------------------------- Typically, to benchmark Modin's overall performance on your workflow, you should start by looking at end-to-end performance with benchmark mode off. It's common for Modin workflows to end with writing results to one or more files, or with printing some Modin objects to an interactive console. Such end points are natural ways to make sure that all of the Modin execution that you require is complete. To measure more fine-grained performance, it can be helpful to turn benchmark mode on, but beware that doing so may reduce your script's overall performance and thus may not reflect where Modin is normally spending execution time, as pointed out above. Turning on :doc:`Modin logging ` and using the Modin logs can also help you profile your workflow. The Modin logs can also give a detailed break down of the performance of each Modin function at each Modin :doc:`layer `. Log mode is more useful when used in conjunction with benchmark mode. 
Sometimes, if you don't have a natural end-point to your workflow, you can just call ``modin.utils.execute`` on the workflow's final Modin objects. That will typically block on any asynchronous computation: .. code-block:: python import time import ray from io import BytesIO import modin.pandas as pd from modin.config import MinRowPartitionSize, NPartitions import modin.utils MinRowPartitionSize.put(32) NPartitions.put(16) def slow_add_one(x): if x == 5000: time.sleep(10) return x + 1 # Look at the Ray documentation with respect to the Ray configuration suited to you most. ray.init() df1 = pd.DataFrame(list(range(10_000)), columns=['A']) result = df1.map(slow_add_one) # %time modin.utils.execute(result) %time result.to_parquet(BytesIO()) .. code-block::python Writing the result to a buffer takes 9.84 seconds. However, if you uncomment the :code:`%time modin.utils.execute(result)` before the :code:`to_parquet` call, the :code:`to_parquet` takes just 23.8 milliseconds! .. note:: If you see any Modin documentation touting Modin's speed without using benchmark mode or otherwise guaranteeing that Modin is finishing all asynchronous and deferred computation, you should file an issue on the Modin GitHub. It's not fair to compare the speed of an async Modin function call to an equivalent synchronous call using another library. 
Appendix: System Information ---------------------------- The example scripts here were run on the following system: - **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: macOS Monterey 12.4 - **Modin version**: d6d503ac7c3028d871c34d9e99e925ddb0746df6 - **Ray version**: 2.0.0 - **Python version**: 3.10.4 - **Machine**: MacBook Pro (16-inch, 2019) - **Processor**: 2.3 GHz 8-core Intel Core i9 processor - **Memory**: 16 GB 2667 MHz DDR4 ================================================ FILE: docs/usage_guide/examples/index.rst ================================================ Modin Usage Examples ==================== This section shows Modin usage examples in different scenarios like Modin on a local/remote cluster, the use of Modin spreadsheet. Tutorials ''''''''' The following tutorials cover the basic usage of Modin. `Here `_ is a one hour video tutorial that walks through these basic exercises. - Exercise 1: Introduction to Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 2: Speed Improvements with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 3: Defaulting to pandas with Modin [`Source PandasOnRay `__, `Source PandasOnDask `__] The following tutorials covers more advanced features in Modin: - Exercise 4: Experimental Features in Modin (Spreadsheet, Progress Bar) [`Source PandasOnRay `__, `Source PandasOnDask `__] - Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__] How to get required dependencies for the tutorial notebooks and to run them please refer to the respective `README.md `__ file. Data Science Benchmarks ''''''''''''''''''''''' - Using Modin with the NYC Taxi Dataset [`Source `__] - Using Modin with the Census Dataset (coming soon...) - Using Modin with the Plasticc Dataset (coming soon...) 
Modin Spreadsheets '''''''''''''''''' - Using Modin along with the Spreadsheets API [`Source `__] Modin with scikit-learn ''''''''''''''''''''''' - Modin for Machine Learning with scikit-learn [`Source `__] ================================================ FILE: docs/usage_guide/index.rst ================================================ Usage Guide =========== This guide describes both basic and advanced Modin usage, including usage examples, details regarding Modin configuration settings, as well as tips and tricks on how you can further optimize the performance of your workload with Modin. .. toctree:: :maxdepth: 4 /flow/modin/config examples/index advanced_usage/index optimization_notes/index benchmarking integrations .. meta:: :description lang=en: Usage-specific documentation. ================================================ FILE: docs/usage_guide/integrations.rst ================================================ Third Party Library Integrations ================================ Modin is a drop-in replacement for Pandas, so we want it to interoperate with third-party libraries just as Pandas does. To see where Modin performs well and where it needs to improve, we've selected a number of important machine learning + visualization + statistics libraries, and then looked at examples (from their documentation, if possible) about how they work with Pandas. Then we ran those same workflows with Modin, and tracked what worked, and what failed. In the table below, you'll see, for each third-party library we tested, the number of successful test calls / total test calls, and a qualitative description of how both Pandas and Modin integrate with that library. In the deeper dive, you can view the Jupyter notebook we have used to test API calls and the corresponding Github issues filed. If you come across other issues/ examples in your own workflows we encourage you to file an `issue `_ or contribute a `PR `_! .. 
note:: These interoperability metrics are preliminary and not all APIs for each library have been tested. Feel free to add more! Modin Interoperability by Library ''''''''''''''''''''''''''''''''' .. list-table:: :widths: 5 5 20 :header-rows: 1 * - Library - API successes / calls - Interoperability * - seaborn - 73% (11/15) - **Pandas**: Accepts Pandas DataFrames as inputs for producing plot |br| **Modin**: Mostly accepts Modin DataFrames as inputs for producing plots, but fails completely in some cases (pairplot, lmplot), and in others (catplot, objects.Plot) only works for some parameter combinations * - plotly - 78% (7 / 9) - **Pandas**: Accepts Pandas DataFrames as inputs for producing plots, including specifying X and Y parameters as df columns |br| **Modin**: Mostly accepts Modin DataFrames as inputs for producing plots (the exception is choropleth), but fails when specifying X and Y parameters as df columns * - matplotlib - 100% (5 / 5) - **Pandas**: Accepts Pandas DataFrames as inputs for producing plots like scatter, barh, etc. |br| **Modin**: Accepts Modin DataFrames as inputs for producing plots like scatter, barh, etc. 
* - altair - 0% (0 / 1) - **Pandas**: Accepts Pandas DataFrames as inputs for producing charts through Chart |br| **Modin**: Does not accept Modin DataFrames as inputs for producing charts through Chart * - bokeh - 0% (0 / 1) - **Pandas**: Loads Pandas DataFrames through ColumnDataSource |br| **Modin**: Does not load Modin DataFrames through ColumnDataSource * - sklearn - 100% (6 / 6) - **Pandas**: Many functions take Pandas DataFrames as inputs |br| **Modin**: Many functions take Modin DataFrames as inputs * - Hugging Face (Transformers, Datasets) - 100% (2 / 2) - **Pandas**: Loads Pandas DataFrames into Datasets, and processes Pandas DataFrame rows as inputs using Transformers.InputExample (deprecated) |br| **Modin**: Loads Modin DataFrames into Datasets (though slowly), and processes Modin DataFrame rows as inputs through Transformers.InputExample (deprecated) * - Tensorflow - 75% (3 / 4) - **Pandas**: Converts Pandas dataframes to tensors |br| **Modin**: Converts Modin DataFrames to tensors, but specialized APIs like Keras might not work yet * - NLTK - 100% (1 / 1) - **Pandas**: Performs transformations like tokenization on Pandas DataFrames |br| **Modin**: Performs transformations like tokenization on Modin DataFrames * - XGBoost - 100% (1 / 1) - **Pandas**: Loads Pandas DataFrames through the DMatrix function |br| **Modin**: Loads Modin DataFrames through the DMatrix function * - statsmodels - 50% (1 / 2) - **Pandas**: Can accept Pandas DataFrames when fitting models |br| **Modin**: Sometimes accepts Modin DataFrames when fitting models (e.g., formula.api.ols), but does not in others (e.g., api.OLS) .. |br| raw:: html
A Deeper Dive '''''''''''''' **seaborn** ----------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5435 * https://github.com/modin-project/modin/issues/5433 **plotly** ---------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5447 * https://github.com/modin-project/modin/issues/5445 **matplotlib** -------------- `Jupyter Notebook `__ **altair** ---------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5438 **bokeh** --------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5437 **sklearn** ----------- `Jupyter Notebook `__ **Hugging Face** ---------------- `Jupyter Notebook `__ **Tensorflow** -------------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5439 **NLTK** --------- `Jupyter Notebook `__ **XGBoost** ----------- `Jupyter Notebook `__ **statsmodels** --------------- `Jupyter Notebook `__ Github Issues * https://github.com/modin-project/modin/issues/5440 Appendix: System Information ''''''''''''''''''''''''''''' The example scripts here were run on the following system: - **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: macOS Big Sur 11.5.2 - **Modin version**: 0.18.0+3.g4114183f - **Ray version**: 2.0.1 - **Python version**: 3.9.7.final.0 - **Machine**: MacBook Pro (16-inch, 2019) - **Processor**: 2.3 GHz 8-core Intel Core i9 processor - **Memory**: 16 GB 2667 MHz DDR4 ================================================ FILE: docs/usage_guide/optimization_notes/index.rst ================================================ Optimization Notes ================== Modin has chosen default values for a lot of the configurations here that provide excellent performance in most cases. This page is for those who love to optimize their code and those who are curious about existing optimizations within Modin. 
Here you can find more information about Modin's optimizations both for a pipeline of operations as well as for specific operations. If you want to go ahead and tune the Modin behavior on your own, refer to :doc:`Modin Configuration Settings ` page for the full set of configurations available in Modin. Range-partitioning in Modin """"""""""""""""""""""""""" Modin utilizes a range-partitioning approach for specific operations, significantly enhancing parallelism and reducing memory consumption in certain scenarios. Range-partitioning is typically engaged for operations that have key columns (to group on, to merge on, etc). You can enable `range-partitioning`_ by specifying ``cfg.RangePartitioning`` :doc:`configuration variable: ` .. code-block:: python import modin.pandas as pd import modin.config as cfg cfg.RangePartitioning.put(True) # past this point methods that support range-partitioning # will use it pd.DataFrame(...).groupby(...).mean() # use range-partitioning for groupby.mean() cfg.RangePartitioning.put(False) pd.DataFrame(...).groupby(...).mean() # use MapReduce implementation for groupby.mean() Building range-partitioning assumes data reshuffling, which may result in breaking the original order of rows; for some operations, it will mean that the result will be different from Pandas. Range-partitioning is not a silver bullet, meaning that enabling it is not always beneficial. Below you can find a link to the list of operations that have support for range-partitioning and practical advice on when one should enable it: :doc:`operations that support range-partitioning `. Dynamic-partitioning in Modin """"""""""""""""""""""""""""" Ray engine experiences slowdowns when running a large number of small remote tasks at the same time. Ray Core recommends to `avoid tiny task`_. When a Modin DataFrame has a large number of partitions, some functions produce a large number of remote tasks, which can cause slowdowns.
To solve this problem, Modin suggests using dynamic partitioning. This approach reduces the number of remote tasks by combining multiple partitions into a single virtual partition and performing a common remote task on them. Dynamic partitioning is typically used for operations that are fully or partially executed on all partitions separately. .. code-block:: python import modin.pandas as pd from modin.config import context df = pd.DataFrame(...) with context(DynamicPartitioning=True): df.abs() Dynamic partitioning is also not always useful, and this approach is usually used for medium-sized DataFrames with a large number of columns. If the number of columns is small, the number of partitions will be close to the number of CPUs, and Ray will not have this problem. If the DataFrame has too many rows, this is also not a good case for using Dynamic-partitioning, since each task is no longer tiny and performing the combined tasks carries more overhead than assigning them separately. Unfortunately, the use of Dynamic-partitioning depends on various factors such as data size, number of CPUs, operations performed, and it is up to the user to determine whether Dynamic-partitioning will give a boost in their case or not. .. TODO: Define heuristics to automatically enable dynamic partitioning without performance penalty. `Issue #7370 `_ Understanding Modin's partitioning mechanism """""""""""""""""""""""""""""""""""""""""""" Modin's partitioning is crucial for performance, so we recommend that expert users understand Modin's partitioning mechanism and how to tune it in order to achieve better performance. How Modin partitions a dataframe -------------------------------- Modin uses a partitioning scheme that partitions a dataframe along both axes, resulting in a matrix of partitions.
The row and column chunk sizes are computed independently based on the length of the appropriate axis and Modin's special :doc:`configuration variables ` (``NPartitions``, ``MinRowPartitionSize`` and ``MinColumnPartitionSize``): - ``NPartitions`` is the maximum number of splits along an axis; by default, it is equal to the number of cores on your local machine or cluster of nodes. - ``MinRowPartitionSize`` is the minimum number of rows to do a split. For instance, if ``MinRowPartitionSize`` is 32, the row axis will not be split unless the number of rows is greater than 32. If it is greater, for example, 34, then the row axis is sliced into two partitions: containing 32 and 2 rows respectively. - ``MinColumnPartitionSize`` is the minimum number of columns to do a split. For instance, if ``MinColumnPartitionSize`` is 32, the column axis will not be split unless the number of columns is greater than 32. If it is greater, for example, 34, then the column axis is sliced into two partitions: containing 32 and 2 columns respectively. Beware that ``NPartitions`` specifies a limit for the number of partitions `along a single axis`, which means that the actual limit for the entire dataframe itself is the square of ``NPartitions``. .. figure:: /img/partitioning_mechanism/partitioning_examples.svg :align: center Full-axis functions ------------------- Some of the aggregation functions require knowledge about the entire axis, for example at ``.apply(foo, axis=0)`` the passed function ``foo`` expects to receive data for the whole column at once. When a full-axis function is applied, the partitions along this axis are collected at a single worker that processes the function. After the function is done, the partitioning of the data is back to normal. ..
figure:: /img/partitioning_mechanism/full_axis_function.svg :align: center Note that the amount of remote calls is equal to the number of partitions, which means that since the number of partitions is decreased for full-axis functions it also decreases the potential for parallelism. Also note, that reduce functions such as ``.sum()``, ``.mean()``, ``.max()``, etc, are not considered to be full-axis, so they do not suffer from the decreasing level of parallelism. How to tune partitioning ------------------------ Configure Modin's default partitioning scheme ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ As you can see from the examples above, the more the dataframe's shape is closer to a square, the closer the number of partitions to the square of ``NPartitions``. In the case of ``NPartitions`` equals to the number of workers, that means that a single worker is going to process multiple partitions at once, which slows down overall performance. If your workflow mainly operates with wide dataframes and non-full-axis functions, it makes sense to reduce the ``NPartitions`` value so a single worker would process a single partition. .. figure:: /img/partitioning_mechanism/repartition_square_frames.svg :align: center Copy-pastable example, showing how tuning ``NPartitions`` value for wide frames may improve performance on your machine: .. 
code-block:: python from multiprocessing import cpu_count from modin.distributed.dataframe.pandas import unwrap_partitions import modin.config as cfg import modin.pandas as pd import numpy as np import timeit # Generating data for a square-like dataframe data = np.random.randint(0, 100, size=(5000, 5000)) # Explicitly setting `NPartitions` to its default value cfg.NPartitions.put(cpu_count()) # Each worker processes `cpu_count()` amount of partitions df = pd.DataFrame(data) print(f"NPartitions: {cfg.NPartitions.get()}") # Getting raw partitions to count them partitions_shape = np.array(unwrap_partitions(df)).shape print( f"The frame has {partitions_shape[0]}x{partitions_shape[1]}={np.prod(partitions_shape)} partitions " f"when the CPU has only {cpu_count()} cores." ) print(f"10 times of .abs(): {timeit.timeit(lambda: df.abs(), number=10)}s.") # Possible output: # NPartitions: 112 # The frame has 112x112=12544 partitions when the CPU has only 112 cores. # 10 times of .abs(): 23.64s. # Taking a square root of the current `cpu_count` to make more even partitioning cfg.NPartitions.put(int(cpu_count() ** 0.5)) # Each worker processes a single partition df = pd.DataFrame(data) print(f"NPartitions: {cfg.NPartitions.get()}") # Getting raw partitions to count them partitions_shape = np.array(unwrap_partitions(df)).shape print( f"The frame has {partitions_shape[0]}x{partitions_shape[1]}={np.prod(partitions_shape)} partitions " f"when the CPU has {cpu_count()} cores." ) print(f"10 times of .abs(): {timeit.timeit(lambda: df.abs(), number=10)}s.") # Possible output: # NPartitions: 10 # The frame has 10x10=100 partitions when the CPU has 112 cores. # 10 times of .abs(): 0.25s. Manually trigger repartitioning ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you're getting unexpectedly poor performance, although you configured ``MODIN_NPARTITIONS`` correctly, then this might be caused by unbalanced partitioning that occurred during the workflow's execution.
Modin's ideology is to handle partitioning internally and not let users worry about the possible consequences of applying a lot of "bad" operations that may affect a DataFrame's partitioning. We're constantly making efforts to find and fix cases where partitioning may cause a headache for users. However, if you feel that you're dealing with unbalanced partitioning you may try to call an internal :py:meth:`modin.pandas.dataframe.DataFrame._repartition` method on your :py:class:`~modin.pandas.dataframe.DataFrame` in order to manually trigger partitions rebalancing and see whether it improves performance for your case. .. automethod:: modin.pandas.dataframe.DataFrame._repartition An actual use-case for this method may be the following: .. code-block:: python import modin.pandas as pd import timeit df = pd.DataFrame({"col0": [1, 2, 3, 4]}) # Appending a lot of columns may result in unbalanced partitioning for i in range(1, 128): df[f"col{i}"] = pd.Series([1, 2, 3, 4]) print( "DataFrame with unbalanced partitioning:", timeit.timeit(lambda: df.sum(), number=10) ) # 1.44s df = df._repartition() print( "DataFrame after '._repartition()':", timeit.timeit(lambda: df.sum(), number=10) ) # 0.21s. Avoid iterating over Modin DataFrame """""""""""""""""""""""""""""""""""" Use ``df.apply()`` or other aggregation methods when possible instead of iterating over a dataframe. For-loops don't scale and force the distributed data to be collected back at the driver. Copy-pastable example, showing how replacing a for-loop with the equivalent ``.apply()`` may improve performance: ..
code-block:: python import modin.pandas as pd import numpy as np from timeit import default_timer as timer data = np.random.randint(1, 100, (2 ** 10, 2 ** 2)) md_df = pd.DataFrame(data) result = [] t1 = timer() # Iterating over a dataframe forces to collect distributed data to the driver and doesn't scale for idx, row in md_df.iterrows(): result.append((row[1] + row[2]) / row[3]) print(f"Filling a list by iterating a Modin frame: {timer() - t1:.2f}s.") # Possible output: 36.15s. t1 = timer() # Using `.apply()` perfectly scales to all axis-partitions result = md_df.apply(lambda row: (row[1] + row[2]) / row[3], axis=1).to_numpy().tolist() print(f"Filling a list by using '.apply()' and converting the result to a list: {timer() - t1:.2f}s.") # Possible output: 0.22s. Use Modin's Dataframe Algebra API to implement custom parallel functions """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Modin provides a set of low-level parallel-implemented operators which can be used to build most of the aggregation functions. These operators are present in the :doc:`algebra module `. Modin DataFrame allows users to use their own aggregations built with this module. Visit the :doc:`DataFrame's algebra ` page of the documentation for the steps to do it. Avoid mixing pandas and Modin DataFrames """""""""""""""""""""""""""""""""""""""" Although Modin is considered to be a drop-in replacement for pandas, Modin and pandas are not intended to be used together in a single flow. Passing a pandas DataFrame as an argument for a Modin's DataFrame method may either slowdown the function (because it has to process non-distributed object) or raise an error. You would also get an undefined behavior if you pass a Modin DataFrame as an input to pandas methods, since pandas identifies Modin's objects as a simple iterable, and so can't leverage its benefits as a distributed dataframe. 
Copy-pastable example, showing how mixing pandas and Modin DataFrames in a single flow may bottleneck performance: .. code-block:: python import modin.pandas as pd import numpy as np import timeit import pandas data = np.random.randint(0, 100, (2 ** 20, 2 ** 2)) md_df, md_df_copy = pd.DataFrame(data), pd.DataFrame(data) pd_df, pd_df_copy = pandas.DataFrame(data), pandas.DataFrame(data) print("concat modin frame + pandas frame:") # Concatenating modin frame + pandas frame using modin '.concat()' # This case is bad because Modin have to process non-distributed pandas object time = timeit.timeit(lambda: pd.concat([md_df, pd_df]), number=10) print(f"\t{time}s.\n") # Possible output: 0.44s. print("concat modin frame + modin frame:") # Concatenating modin frame + modin frame using modin '.concat()' # This is an ideal case, Modin is being used as intended time = timeit.timeit(lambda: pd.concat([md_df, md_df_copy]), number=10) print(f"\t{time}s.\n") # Possible output: 0.05s. print("concat pandas frame + pandas frame:") # Concatenating pandas frame + pandas frame using pandas '.concat()' time = timeit.timeit(lambda: pandas.concat([pd_df, pd_df_copy]), number=10) print(f"\t{time}s.\n") # Possible output: 0.31s. print("concat pandas frame + modin frame:") # Concatenating pandas frame + modin frame using pandas '.concat()' time = timeit.timeit(lambda: pandas.concat([pd_df, md_df]), number=10) print(f"\t{time}s.\n") # Possible output: TypeError Using pandas to execute queries with Modin's ``"Pandas"`` backend """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" By default, Modin distributes the data in a dataframe (or series) and attempts to process data for different partitions in parallel. However, for certain scenarios, such as handling small datasets, Modin's parallel execution may introduce unnecessary overhead. In such cases, it's more efficient to use serial execution with a single, unpartitioned pandas dataframe. 
You can enable this kind of local pandas execution by setting Modin's ``Backend`` :doc:`configuration variable ` to ``"Pandas"``. DataFrames created while Modin's global backend is set to ``"Pandas"`` will continue to use native execution even if you switch the global backend later. Modin supports interoperability between distributed Modin DataFrames and those using the pandas backend. Here is an example of using the pandas backend. .. code-block:: python import modin.pandas as pd from modin.config import Backend # This dataframe will use Modin's default, distributed execution. original_backend = Backend.get() assert original_backend != "Pandas" distributed_df_1 = pd.DataFrame([0]) # Set backend to "Pandas" for local pandas execution. Backend.put("Pandas") modin_on_pandas_df = pd.DataFrame([1]) assert modin_on_pandas_df.get_backend() == "Pandas" # Revert to default settings for distributed execution Backend.put(original_backend) distributed_df_2 = pd.DataFrame([2]) assert distributed_df_2.get_backend() == original_backend You can also use the pandas backend for some dataframes while using different backends for other dataframes. You can switch the backend of an individual dataframe or series with ``set_backend()`` or its synonym ``move_to()``. Here's an example of switching the backend for an individual dataframe. .. code-block:: python import modin.pandas as pd # This dataframe will use Modin's default, distributed execution. 
original_backend = Backend.get() assert original_backend != "Pandas" distributed_df_1 = pd.DataFrame([0]) pandas_df_1 = distributed_df_1.move_to("Pandas") assert pandas_df_1.get_backend() == "Pandas" pandas_df_1 = pandas_df_1.sort_values(0) assert pandas_df_1.get_backend() == "Pandas" new_df = pandas_df_1.move_to(original_backend) assert new_df.get_backend() == original_backend new_df.set_backend("Pandas", inplace=True) assert new_df.get_backend() == "Pandas" Automatic backend switching """"""""""""""""""""""""""" *This feature is under active development, and the API is subject to change.* Modin's backends may define heuristics for whether to automatically move data to another backend for more efficient computation of certain operations. Modin does not currently define these heuristics for any of its default backends, but any backends that wish to do so should implement the query compiler methods discussed in :ref:`the architecture document`. After implementing the relevant query compiler methods, the following APIs can be used to control when automatic switching occurs: .. 
code-block:: python import modin.pandas as pd from modin.core.storage_formats.pandas.query_compiler_caster import ( register_function_for_post_op_switch, register_function_for_pre_op_switch, ) from modin.config import AutoSwitchBackend # Enable automatic switching BEFORE computation for DataFrame.apply # when the DataFrame's backend is Pandas register_function_for_pre_op_switch( class_name="DataFrame", method="apply", backend="Pandas", ) # Enable automatic switching AFTER computation for Series.max # when the Series's backend is Pandas register_function_for_post_op_switch( class_name="Series", method="max", backend="Pandas", ) # Enable automatic switching globally (use .disable() to turn off) AutoSwitchBackend.enable() df = pd.DataFrame([[1, 2, 3]]) # "pin" a single DataFrame/Series, preventing it from # automatically switching backends df.pin_backend(inplace=True) # "unpin" it to re-enable automatic switching df.unpin_backend(inplace=True) Operation-specific optimizations """""""""""""""""""""""""""""""" merge ----- ``merge`` operation in Modin uses the broadcast join algorithm: combining a right Modin DataFrame into a pandas DataFrame and broadcasting it to the row partitions of the left Modin DataFrame. In order to minimize interprocess communication cost when doing an inner join you may want to swap left and right DataFrames. .. code-block:: python import modin.pandas as pd import numpy as np left_data = np.random.randint(0, 100, size=(2**8, 2**8)) right_data = np.random.randint(0, 100, size=(2**12, 2**12)) left_df = pd.DataFrame(left_data) right_df = pd.DataFrame(right_data) %timeit left_df.merge(right_df, how="inner", on=10) 3.59 s 107 ms per loop (mean std. dev. of 7 runs, 1 loop each) %timeit right_df.merge(left_df, how="inner", on=10) 1.22 s 40.1 ms per loop (mean std. dev. of 7 runs, 1 loop each) Note that result columns order may differ for first and second ``merge``. .. 
_range-partitioning: https://www.techopedia.com/definition/31994/range-partitioning .. _`avoid tiny task`: https://docs.ray.io/en/latest/ray-core/tips-for-first-time.html#tip-2-avoid-tiny-tasks ================================================ FILE: docs/usage_guide/optimization_notes/range_partitioning_ops.rst ================================================ :orphan: Operations that support range-partitioning in Modin ################################################### The following operations change their behavior once ``cfg.RangePartitioning`` variable is set to ``True``. Go through the list to find out when it could be beneficial to engage range-partitioning for a certain method. GroupBy ======= .. note:: When grouping on multiple columns using range-partitioning implementation, the result may not be sorted even if ``groupby(sort=True, ...)`` was passed: https://github.com/modin-project/modin/issues/6875. Range-partitioning groupby implementation is automatically engaged for ``groupby.apply()``, ``groupby.transform()``, ``groupby.rolling()``. For groupby aggregations from `this list`_, MapReduce implementation is used by default. MapReduce tends to show better performance for groupby with low-cardinality. If the cardinality of your columns to group is expected to be high, it's recommended to engage range-partitioning implementation. Merge ===== .. note:: Range-partitioning approach is implemented only for "left" and "inner" merge and only when merging on a single column using `on` argument. Range-partitioning merge replaces broadcast merge. It is recommended to use range-partitioning implementation if the right dataframe in merge is as big as the left dataframe. In this case, range-partitioning implementation works faster and consumes less RAM. Under the spoiler you can find performance comparison of range-partitioning and broadcast merge in different scenarios: .. raw:: html
Performance measurements for merge The performance was measured on `h2o join queries`_ using Intel(R) Xeon(R) Gold 6238R CPU @ 2.20GHz (56 cores), with the number of cores allocated for Modin limited by 44 (``MODIN_CPUS=44``). Measurements for small 500mb data: .. image:: /img/range_partitioning_measurements/merge_h2o_500mb.jpg :align: center Measurements for medium 5gb data: .. image:: /img/range_partitioning_measurements/merge_h2o_5gb.png :align: center .. raw:: html
``.unique()`` and ``.drop_duplicates()`` ======================================== .. note:: When range-partitioning is enabled, both ``.unique()`` and ``.drop_duplicates()`` will yield results that are sorted along rows. If range-partitioning is disabled, the original order will be maintained. Range-partitioning implementation of ``.unique()`` / ``.drop_duplicates()`` works best when the input data size is big (more than 5_000_000 rows) and when the output size is also expected to be big (no more than 80% values are duplicates). Under the spoiler you can find performance comparisons in different scenarios: .. raw:: html
Performance measurements for ``.unique()`` The performance was measured on randomly generated data using Intel(R) Xeon(R) Gold 6238R CPU @ 2.20GHz (56 cores). The `duplicate rate` shows the percentage of duplicated rows in the dataset. You can learn more about this micro-benchmark by reading its source code: .. raw:: html
Micro-benchmark's source code .. code-block:: python import modin.pandas as pd import numpy as np import modin.config as cfg from modin.utils import execute from timeit import default_timer as timer import pandas cfg.CpuCount.put(16) def get_data(nrows, dtype): if dtype == int: return np.arange(nrows) elif dtype == float: return np.arange(nrows).astype(float) elif dtype == str: return np.array([f"value{i}" for i in range(nrows)]) else: raise NotImplementedError(dtype) pd.DataFrame(np.arange(cfg.NPartitions.get() * cfg.MinRowPartitionSize.get())).to_numpy() nrows = [1_000_000, 5_000_000, 10_000_000, 25_000_000, 50_000_000, 100_000_000] duplicate_rate = [0, 0.1, 0.5, 0.95] dtypes = [int, str] use_range_part = [True, False] columns = pandas.MultiIndex.from_product([dtypes, duplicate_rate, use_range_part], names=["dtype", "duplicate rate", "use range-part"]) result = pandas.DataFrame(index=nrows, columns=columns) i = 0 total_its = len(nrows) * len(duplicate_rate) * len(dtypes) * len(use_range_part) for dt in dtypes: for nrow in nrows: data = get_data(nrow, dt) np.random.shuffle(data) for dpr in duplicate_rate: data_c = data.copy() dupl_val = data_c[0] num_duplicates = int(dpr * nrow) dupl_indices = np.random.choice(np.arange(nrow), num_duplicates, replace=False) data_c[dupl_indices] = dupl_val for impl in use_range_part: print(f"{round((i / total_its) * 100, 2)}%") i += 1 cfg.RangePartitioning.put(impl) sr = pd.Series(data_c) execute(sr) t1 = timer() # returns a list, so no need for materialization sr.unique() tm = timer() - t1 print(nrow, dpr, dt, impl, tm) result.loc[nrow, (dt, dpr, impl)] = tm result.to_excel("unique.xlsx") .. raw:: html
Measurements with 16 cores being allocated for Modin (``MODIN_CPUS=16``): .. image:: /img/range_partitioning_measurements/unique_16cpus.jpg :align: center Measurements with 44 cores being allocated for Modin (``MODIN_CPUS=44``): .. image:: /img/range_partitioning_measurements/unique_44cpus.jpg :align: center .. raw:: html
.. raw:: html
Performance measurements for ``.drop_duplicates()`` The performance was measured on randomly generated data using Intel(R) Xeon(R) Gold 6238R CPU @ 2.20GHz (56 cores). The `duplicate rate` shows the percentage of duplicated rows in the dataset. The `subset size` shows the number of columns being specified as a ``subset`` parameter for ``df.drop_duplicates()``. You can learn more about this micro-benchmark by reading its source code: .. raw:: html
Micro-benchmark's source code .. code-block:: python import modin.pandas as pd import numpy as np import modin.config as cfg from modin.utils import execute from timeit import default_timer as timer import pandas cfg.CpuCount.put(16) pd.DataFrame(np.arange(cfg.NPartitions.get() * cfg.MinRowPartitionSize.get())).to_numpy() nrows = [1_000_000, 5_000_000, 10_000_000, 25_000_000] duplicate_rate = [0, 0.1, 0.5, 0.95] subset = [["col0"], ["col1", "col2", "col3", "col4"], None] ncols = 15 use_range_part = [True, False] columns = pandas.MultiIndex.from_product( [ [len(sbs) if sbs is not None else ncols for sbs in subset], duplicate_rate, use_range_part ], names=["subset size", "duplicate rate", "use range-part"] ) result = pandas.DataFrame(index=nrows, columns=columns) i = 0 total_its = len(nrows) * len(duplicate_rate) * len(subset) * len(use_range_part) for sbs in subset: for nrow in nrows: data = {f"col{i}": np.arange(nrow) for i in range(ncols)} pandas_df = pandas.DataFrame(data) for dpr in duplicate_rate: pandas_df_c = pandas_df.copy() dupl_val = pandas_df_c.iloc[0] num_duplicates = int(dpr * nrow) dupl_indices = np.random.choice(np.arange(nrow), num_duplicates, replace=False) pandas_df_c.iloc[dupl_indices] = dupl_val for impl in use_range_part: print(f"{round((i / total_its) * 100, 2)}%") i += 1 cfg.RangePartitioning.put(impl) md_df = pd.DataFrame(pandas_df_c) execute(md_df) t1 = timer() res = md_df.drop_duplicates(subset=sbs) execute(res) tm = timer() - t1 sbs_s = len(sbs) if sbs is not None else ncols print("len()", res.shape, nrow, dpr, sbs_s, impl, tm) result.loc[nrow, (sbs_s, dpr, impl)] = tm result.to_excel("drop_dupl.xlsx") .. raw:: html
Measurements with 16 cores being allocated for Modin (``MODIN_CPUS=16``): .. image:: /img/range_partitioning_measurements/drop_duplicates_16cpus.jpg :align: center Measurements with 44 cores being allocated for Modin (``MODIN_CPUS=44``): .. image:: /img/range_partitioning_measurements/drop_duplicates_44cpus.jpg :align: center .. raw:: html
'.nunique()' ============ .. note:: Range-partitioning approach is implemented only for ``pd.Series.nunique()`` and 1-column dataframes. For multi-column dataframes ``.nunique()`` can only use full-axis reduce implementation. Range-partitioning implementation of ``.nunique()`` works best when the input data size is big (more than 5_000_000 rows) and when the output size is also expected to be big (no more than 80% values are duplicates). Under the spoiler you can find performance comparisons in different scenarios: .. raw:: html
Performance measurements for ``.nunique()`` The performance was measured on randomly generated data using Intel(R) Xeon(R) Gold 6238R CPU @ 2.20GHz (56 cores). The `duplicate rate` shows the percentage of duplicated rows in the dataset. You can learn more about this micro-benchmark by reading its source code: .. raw:: html
Micro-benchmark's source code .. code-block:: python import modin.pandas as pd import numpy as np import modin.config as cfg from modin.utils import execute from timeit import default_timer as timer import pandas cfg.CpuCount.put(16) def get_data(nrows, dtype): if dtype == int: return np.arange(nrows) elif dtype == float: return np.arange(nrows).astype(float) elif dtype == str: return np.array([f"value{i}" for i in range(nrows)]) else: raise NotImplementedError(dtype) pd.DataFrame(np.arange(cfg.NPartitions.get() * cfg.MinRowPartitionSize.get())).to_numpy() nrows = [1_000_000, 5_000_000, 10_000_000, 25_000_000, 50_000_000, 100_000_000] duplicate_rate = [0, 0.1, 0.5, 0.95] dtypes = [int, str] use_range_part = [True, False] columns = pandas.MultiIndex.from_product([dtypes, duplicate_rate, use_range_part], names=["dtype", "duplicate rate", "use range-part"]) result = pandas.DataFrame(index=nrows, columns=columns) i = 0 total_its = len(nrows) * len(duplicate_rate) * len(dtypes) * len(use_range_part) for dt in dtypes: for nrow in nrows: data = get_data(nrow, dt) np.random.shuffle(data) for dpr in duplicate_rate: data_c = data.copy() dupl_val = data_c[0] num_duplicates = int(dpr * nrow) dupl_indices = np.random.choice(np.arange(nrow), num_duplicates, replace=False) data_c[dupl_indices] = dupl_val for impl in use_range_part: print(f"{round((i / total_its) * 100, 2)}%") i += 1 cfg.RangePartitioning.put(impl) sr = pd.Series(data_c) execute(sr) t1 = timer() # returns a scalar, so no need for materialization res = sr.nunique() tm = timer() - t1 print(nrow, dpr, dt, impl, tm) result.loc[nrow, (dt, dpr, impl)] = tm result.to_excel("nunique.xlsx") .. raw:: html
Measurements with 16 cores being allocated for Modin (``MODIN_CPUS=16``): .. image:: /img/range_partitioning_measurements/nunique_16cpus.jpg :align: center .. raw:: html
Resample ======== .. note:: Range-partitioning approach doesn't support transform-like functions (like `.interpolate()`, `.ffill()`, `.bfill()`, ...) It is recommended to use range-partitioning for resampling if you're dealing with a dataframe that has more than 5_000_000 rows and the expected output is also expected to be big (more than 500_000 rows). Under the spoiler you can find performance comparisons in different scenarios: .. raw:: html
Performance measurements for ``.resample()`` The script below measures performance of ``df.resample(rule).sum()`` using Intel(R) Xeon(R) Gold 6238R CPU @ 2.20GHz (56 cores). You can learn more about this micro-benchmark by reading its source code: .. raw:: html
Micro-benchmark's source code .. code-block:: python import pandas import numpy as np import modin.pandas as pd import modin.config as cfg from timeit import default_timer as timer from modin.utils import execute cfg.CpuCount.put(16) nrows = [1_000_000, 5_000_000, 10_000_000] ncols = [5, 33] rules = [ "500ms", # doubles nrows "30s", # decreases nrows in 30 times "5min", # decreases nrows in 300 ] use_rparts = [True, False] cols = pandas.MultiIndex.from_product([rules, ncols, use_rparts], names=["rule", "ncols", "USE RANGE PART"]) rres = pandas.DataFrame(index=nrows, columns=cols) total_nits = len(nrows) * len(ncols) * len(rules) * len(use_rparts) i = 0 for nrow in nrows: for ncol in ncols: index = pandas.date_range("31/12/2000", periods=nrow, freq="s") data = {f"col{i}": np.arange(nrow) for i in range(ncol)} pd_df = pandas.DataFrame(data, index=index) for rule in rules: for rparts in use_rparts: print(f"{round((i / total_nits) * 100, 2)}%") i += 1 cfg.RangePartitioning.put(rparts) df = pd.DataFrame(data, index=index) execute(df) t1 = timer() res = df.resample(rule).sum() execute(res) ts = timer() - t1 print(nrow, ncol, rule, rparts, ts) rres.loc[nrow, (rule, ncol, rparts)] = ts rres.to_excel("resample.xlsx") .. raw:: html
Measurements with 16 cores being allocated for Modin (``MODIN_CPUS=16``): .. image:: /img/range_partitioning_measurements/resample_16cpus.jpg :align: center .. raw:: html
pivot_table =========== Range-partitioning implementation is automatically applied for ``df.pivot_table`` whenever possible, users can't control this. sort_values =========== Range-partitioning implementation is automatically applied for ``df.sort_values`` whenever possible, users can't control this. .. _h2o join queries: https://h2oai.github.io/db-benchmark/ .. _this list: https://github.com/modin-project/modin/blob/7b233e4a920d5f03dce7a82847847b92ae7ad617/modin/core/storage_formats/pandas/groupby.py#L236-L247 ================================================ FILE: environment-dev.yml ================================================ name: modin channels: - conda-forge dependencies: - pip # required dependencies - pandas>=2.2,<2.4 - numpy>=1.22.4 - fsspec>=2022.11.0 - packaging>=21.0 - psutil>=5.8.0 # optional dependencies # NOTE Keep the ray and dask dependencies in sync with the Linux and Windows # Unidist environment dependencies. - ray-core>=2.10.0,<3 - pyarrow>=10.0.1 # workaround for https://github.com/conda/conda/issues/11744 - grpcio!=1.45.* - grpcio!=1.46.* - dask>=2.22.0 - distributed>=2.22.0 - xarray>=2022.12.0 - jinja2>=3.1.2 - scipy>=1.10.0 - s3fs>=2022.11.0 - lxml>=4.9.2 - openpyxl>=3.1.0 - xlrd>=2.0.1 - matplotlib>=3.6.3 - sqlalchemy>=2.0.0 - pandas-gbq>=0.19.0 - pytables>=3.8.0 # pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429 - pymssql>=2.1.5,!=2.2.8 - psycopg2>=2.9.6 - fastparquet>=2022.12.0 - tqdm>=4.60.0 - numexpr>=2.8.4 # dependencies for making release - pygithub>=v1.58.0 - pygit2>=1.9.2 # test dependencies - coverage>=7.1.0 - moto>=4.1.0 - pytest>=7.3.2 - pytest-benchmark>=4.0.0 - pytest-cov>=4.0.0 - pytest-xdist>=3.2.0 - typing_extensions # code linters - black>=24.1.0 - flake8>=6.0.0 - flake8-no-implicit-concat>=0.3.4 - flake8-print>=5.0.0 - mypy>=1.0.0 - pandas-stubs>=2.0.0 - isort>=5.12 - pip: - dataframe-api-compat>=0.2.7 - asv==0.5.1 # no conda package for windows so we install it with pip - connectorx>=0.2.6a4 
- fuzzydata>=0.0.11 # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.6.0 - polars ================================================ FILE: examples/data/boston_housing.csv ================================================ ,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE 0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0 1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6 2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7 3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4 4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2 5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7 6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9 7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1 8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5 9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9 10,0.22489,12.5,7.87,0.0,0.524,6.377,94.3,6.3467,5.0,311.0,15.2,392.52,20.45,15.0 11,0.11747,12.5,7.87,0.0,0.524,6.009,82.9,6.2267,5.0,311.0,15.2,396.9,13.27,18.9 12,0.09378,12.5,7.87,0.0,0.524,5.889,39.0,5.4509,5.0,311.0,15.2,390.5,15.71,21.7 13,0.62976,0.0,8.14,0.0,0.538,5.949,61.8,4.7075,4.0,307.0,21.0,396.9,8.26,20.4 14,0.63796,0.0,8.14,0.0,0.538,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26,18.2 15,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47,19.9 16,1.05393,0.0,8.14,0.0,0.538,5.935,29.3,4.4986,4.0,307.0,21.0,386.85,6.58,23.1 17,0.7842,0.0,8.14,0.0,0.538,5.99,81.7,4.2579,4.0,307.0,21.0,386.75,14.67,17.5 
18,0.80271,0.0,8.14,0.0,0.538,5.456,36.6,3.7965,4.0,307.0,21.0,288.99,11.69,20.2 19,0.7258,0.0,8.14,0.0,0.538,5.727,69.5,3.7965,4.0,307.0,21.0,390.95,11.28,18.2 20,1.25179,0.0,8.14,0.0,0.538,5.57,98.1,3.7979,4.0,307.0,21.0,376.57,21.02,13.6 21,0.85204,0.0,8.14,0.0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83,19.6 22,1.23247,0.0,8.14,0.0,0.538,6.142,91.7,3.9769,4.0,307.0,21.0,396.9,18.72,15.2 23,0.98843,0.0,8.14,0.0,0.538,5.813,100.0,4.0952,4.0,307.0,21.0,394.54,19.88,14.5 24,0.75026,0.0,8.14,0.0,0.538,5.924,94.1,4.3996,4.0,307.0,21.0,394.33,16.3,15.6 25,0.84054,0.0,8.14,0.0,0.538,5.599,85.7,4.4546,4.0,307.0,21.0,303.42,16.51,13.9 26,0.67191,0.0,8.14,0.0,0.538,5.813,90.3,4.682,4.0,307.0,21.0,376.88,14.81,16.6 27,0.95577,0.0,8.14,0.0,0.538,6.047,88.8,4.4534,4.0,307.0,21.0,306.38,17.28,14.8 28,0.77299,0.0,8.14,0.0,0.538,6.495,94.4,4.4547,4.0,307.0,21.0,387.94,12.8,18.4 29,1.00245,0.0,8.14,0.0,0.538,6.674,87.3,4.239,4.0,307.0,21.0,380.23,11.98,21.0 30,1.13081,0.0,8.14,0.0,0.538,5.713,94.1,4.233,4.0,307.0,21.0,360.17,22.6,12.7 31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73,13.04,14.5 32,1.38799,0.0,8.14,0.0,0.538,5.95,82.0,3.99,4.0,307.0,21.0,232.6,27.71,13.2 33,1.15172,0.0,8.14,0.0,0.538,5.701,95.0,3.7872,4.0,307.0,21.0,358.77,18.35,13.1 34,1.61282,0.0,8.14,0.0,0.538,6.096,96.9,3.7598,4.0,307.0,21.0,248.31,20.34,13.5 35,0.06417,0.0,5.96,0.0,0.499,5.933,68.2,3.3603,5.0,279.0,19.2,396.9,9.68,18.9 36,0.09744,0.0,5.96,0.0,0.499,5.841,61.4,3.3779,5.0,279.0,19.2,377.56,11.41,20.0 37,0.08014,0.0,5.96,0.0,0.499,5.85,41.5,3.9342,5.0,279.0,19.2,396.9,8.77,21.0 38,0.17505,0.0,5.96,0.0,0.499,5.966,30.2,3.8473,5.0,279.0,19.2,393.43,10.13,24.7 39,0.02763,75.0,2.95,0.0,0.428,6.595,21.8,5.4011,3.0,252.0,18.3,395.63,4.32,30.8 40,0.03359,75.0,2.95,0.0,0.428,7.024,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9 41,0.12744,0.0,6.91,0.0,0.448,6.77,2.9,5.7209,3.0,233.0,17.9,385.41,4.84,26.6 
42,0.1415,0.0,6.91,0.0,0.448,6.169,6.6,5.7209,3.0,233.0,17.9,383.37,5.81,25.3 43,0.15936,0.0,6.91,0.0,0.448,6.211,6.5,5.7209,3.0,233.0,17.9,394.46,7.44,24.7 44,0.12269,0.0,6.91,0.0,0.448,6.069,40.0,5.7209,3.0,233.0,17.9,389.39,9.55,21.2 45,0.17142,0.0,6.91,0.0,0.448,5.682,33.8,5.1004,3.0,233.0,17.9,396.9,10.21,19.3 46,0.18836,0.0,6.91,0.0,0.448,5.786,33.3,5.1004,3.0,233.0,17.9,396.9,14.15,20.0 47,0.22927,0.0,6.91,0.0,0.448,6.03,85.5,5.6894,3.0,233.0,17.9,392.74,18.8,16.6 48,0.25387,0.0,6.91,0.0,0.448,5.399,95.3,5.87,3.0,233.0,17.9,396.9,30.81,14.4 49,0.21977,0.0,6.91,0.0,0.448,5.602,62.0,6.0877,3.0,233.0,17.9,396.9,16.2,19.4 50,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45,19.7 51,0.04337,21.0,5.64,0.0,0.439,6.115,63.0,6.8147,4.0,243.0,16.8,393.97,9.43,20.5 52,0.0536,21.0,5.64,0.0,0.439,6.511,21.1,6.8147,4.0,243.0,16.8,396.9,5.28,25.0 53,0.04981,21.0,5.64,0.0,0.439,5.998,21.4,6.8147,4.0,243.0,16.8,396.9,8.43,23.4 54,0.0136,75.0,4.0,0.0,0.41,5.888,47.6,7.3197,3.0,469.0,21.1,396.9,14.8,18.9 55,0.01311,90.0,1.22,0.0,0.403,7.249,21.9,8.6966,5.0,226.0,17.9,395.93,4.81,35.4 56,0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77,24.7 57,0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0,15.1,392.9,3.95,31.6 58,0.15445,25.0,5.13,0.0,0.453,6.145,29.2,7.8148,8.0,284.0,19.7,390.68,6.86,23.3 59,0.10328,25.0,5.13,0.0,0.453,5.927,47.2,6.932,8.0,284.0,19.7,396.9,9.22,19.6 60,0.14932,25.0,5.13,0.0,0.453,5.741,66.2,7.2254,8.0,284.0,19.7,395.11,13.15,18.7 61,0.17171,25.0,5.13,0.0,0.453,5.966,93.4,6.8185,8.0,284.0,19.7,378.08,14.44,16.0 62,0.11027,25.0,5.13,0.0,0.453,6.456,67.8,7.2255,8.0,284.0,19.7,396.9,6.73,22.2 63,0.1265,25.0,5.13,0.0,0.453,6.762,43.4,7.9809,8.0,284.0,19.7,395.58,9.5,25.0 64,0.01951,17.5,1.38,0.0,0.4161,7.104,59.5,9.2229,3.0,216.0,18.6,393.24,8.05,33.0 65,0.03584,80.0,3.37,0.0,0.398,6.29,17.8,6.6115,4.0,337.0,16.1,396.9,4.67,23.5 66,0.04379,80.0,3.37,0.0,0.398,5.787,31.1,6.6115,4.0,337.0,16.1,396.9,10.24,19.4 
67,0.05789,12.5,6.07,0.0,0.409,5.878,21.4,6.498,4.0,345.0,18.9,396.21,8.1,22.0 68,0.13554,12.5,6.07,0.0,0.409,5.594,36.8,6.498,4.0,345.0,18.9,396.9,13.09,17.4 69,0.12816,12.5,6.07,0.0,0.409,5.885,33.0,6.498,4.0,345.0,18.9,396.9,8.79,20.9 70,0.08826,0.0,10.81,0.0,0.413,6.417,6.6,5.2873,4.0,305.0,19.2,383.73,6.72,24.2 71,0.15876,0.0,10.81,0.0,0.413,5.961,17.5,5.2873,4.0,305.0,19.2,376.94,9.88,21.7 72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52,22.8 73,0.19539,0.0,10.81,0.0,0.413,6.245,6.2,5.2873,4.0,305.0,19.2,377.17,7.54,23.4 74,0.07896,0.0,12.83,0.0,0.437,6.273,6.0,4.2515,5.0,398.0,18.7,394.92,6.78,24.1 75,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94,21.4 76,0.10153,0.0,12.83,0.0,0.437,6.279,74.5,4.0522,5.0,398.0,18.7,373.66,11.97,20.0 77,0.08707,0.0,12.83,0.0,0.437,6.14,45.8,4.0905,5.0,398.0,18.7,386.96,10.27,20.8 78,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34,21.2 79,0.08387,0.0,12.83,0.0,0.437,5.874,36.6,4.5026,5.0,398.0,18.7,396.06,9.1,20.3 80,0.04113,25.0,4.86,0.0,0.426,6.727,33.5,5.4007,4.0,281.0,19.0,396.9,5.29,28.0 81,0.04462,25.0,4.86,0.0,0.426,6.619,70.4,5.4007,4.0,281.0,19.0,395.63,7.22,23.9 82,0.03659,25.0,4.86,0.0,0.426,6.302,32.2,5.4007,4.0,281.0,19.0,396.9,6.72,24.8 83,0.03551,25.0,4.86,0.0,0.426,6.167,46.7,5.4007,4.0,281.0,19.0,390.64,7.51,22.9 84,0.05059,0.0,4.49,0.0,0.449,6.389,48.0,4.7794,3.0,247.0,18.5,396.9,9.62,23.9 85,0.05735,0.0,4.49,0.0,0.449,6.63,56.1,4.4377,3.0,247.0,18.5,392.3,6.53,26.6 86,0.05188,0.0,4.49,0.0,0.449,6.015,45.1,4.4272,3.0,247.0,18.5,395.99,12.86,22.5 87,0.07151,0.0,4.49,0.0,0.449,6.121,56.8,3.7476,3.0,247.0,18.5,395.15,8.44,22.2 88,0.0566,0.0,3.41,0.0,0.489,7.007,86.3,3.4217,2.0,270.0,17.8,396.9,5.5,23.6 89,0.05302,0.0,3.41,0.0,0.489,7.079,63.1,3.4145,2.0,270.0,17.8,396.06,5.7,28.7 90,0.04684,0.0,3.41,0.0,0.489,6.417,66.1,3.0923,2.0,270.0,17.8,392.18,8.81,22.6 91,0.03932,0.0,3.41,0.0,0.489,6.405,73.9,3.0921,2.0,270.0,17.8,393.55,8.2,22.0 
92,0.04203,28.0,15.04,0.0,0.464,6.442,53.6,3.6659,4.0,270.0,18.2,395.01,8.16,22.9 93,0.02875,28.0,15.04,0.0,0.464,6.211,28.9,3.6659,4.0,270.0,18.2,396.33,6.21,25.0 94,0.04294,28.0,15.04,0.0,0.464,6.249,77.3,3.615,4.0,270.0,18.2,396.9,10.59,20.6 95,0.12204,0.0,2.89,0.0,0.445,6.625,57.8,3.4952,2.0,276.0,18.0,357.98,6.65,28.4 96,0.11504,0.0,2.89,0.0,0.445,6.163,69.6,3.4952,2.0,276.0,18.0,391.83,11.34,21.4 97,0.12083,0.0,2.89,0.0,0.445,8.069,76.0,3.4952,2.0,276.0,18.0,396.9,4.21,38.7 98,0.08187,0.0,2.89,0.0,0.445,7.82,36.9,3.4952,2.0,276.0,18.0,393.53,3.57,43.8 99,0.0686,0.0,2.89,0.0,0.445,7.416,62.5,3.4952,2.0,276.0,18.0,396.9,6.19,33.2 100,0.14866,0.0,8.56,0.0,0.52,6.727,79.9,2.7778,5.0,384.0,20.9,394.76,9.42,27.5 101,0.11432,0.0,8.56,0.0,0.52,6.781,71.3,2.8561,5.0,384.0,20.9,395.58,7.67,26.5 102,0.22876,0.0,8.56,0.0,0.52,6.405,85.4,2.7147,5.0,384.0,20.9,70.8,10.63,18.6 103,0.21161,0.0,8.56,0.0,0.52,6.137,87.4,2.7147,5.0,384.0,20.9,394.47,13.44,19.3 104,0.1396,0.0,8.56,0.0,0.52,6.167,90.0,2.421,5.0,384.0,20.9,392.69,12.33,20.1 105,0.13262,0.0,8.56,0.0,0.52,5.851,96.7,2.1069,5.0,384.0,20.9,394.05,16.47,19.5 106,0.1712,0.0,8.56,0.0,0.52,5.836,91.9,2.211,5.0,384.0,20.9,395.67,18.66,19.5 107,0.13117,0.0,8.56,0.0,0.52,6.127,85.2,2.1224,5.0,384.0,20.9,387.69,14.09,20.4 108,0.12802,0.0,8.56,0.0,0.52,6.474,97.1,2.4329,5.0,384.0,20.9,395.24,12.27,19.8 109,0.26363,0.0,8.56,0.0,0.52,6.229,91.2,2.5451,5.0,384.0,20.9,391.23,15.55,19.4 110,0.10793,0.0,8.56,0.0,0.52,6.195,54.4,2.7778,5.0,384.0,20.9,393.49,13.0,21.7 111,0.10084,0.0,10.01,0.0,0.547,6.715,81.6,2.6775,6.0,432.0,17.8,395.59,10.16,22.8 112,0.12329,0.0,10.01,0.0,0.547,5.913,92.9,2.3534,6.0,432.0,17.8,394.95,16.21,18.8 113,0.22212,0.0,10.01,0.0,0.547,6.092,95.4,2.548,6.0,432.0,17.8,396.9,17.09,18.7 114,0.14231,0.0,10.01,0.0,0.547,6.254,84.2,2.2565,6.0,432.0,17.8,388.74,10.45,18.5 115,0.17134,0.0,10.01,0.0,0.547,5.928,88.2,2.4631,6.0,432.0,17.8,344.91,15.76,18.3 
116,0.13158,0.0,10.01,0.0,0.547,6.176,72.5,2.7301,6.0,432.0,17.8,393.3,12.04,21.2 117,0.15098,0.0,10.01,0.0,0.547,6.021,82.6,2.7474,6.0,432.0,17.8,394.51,10.3,19.2 118,0.13058,0.0,10.01,0.0,0.547,5.872,73.1,2.4775,6.0,432.0,17.8,338.63,15.37,20.4 119,0.14476,0.0,10.01,0.0,0.547,5.731,65.2,2.7592,6.0,432.0,17.8,391.5,13.61,19.3 120,0.06899,0.0,25.65,0.0,0.581,5.87,69.7,2.2577,2.0,188.0,19.1,389.15,14.37,22.0 121,0.07165,0.0,25.65,0.0,0.581,6.004,84.1,2.1974,2.0,188.0,19.1,377.67,14.27,20.3 122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93,20.5 123,0.15038,0.0,25.65,0.0,0.581,5.856,97.0,1.9444,2.0,188.0,19.1,370.31,25.41,17.3 124,0.09849,0.0,25.65,0.0,0.581,5.879,95.8,2.0063,2.0,188.0,19.1,379.38,17.58,18.8 125,0.16902,0.0,25.65,0.0,0.581,5.986,88.4,1.9929,2.0,188.0,19.1,385.02,14.81,21.4 126,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26,15.7 127,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19,16.2 128,0.32543,0.0,21.89,0.0,0.624,6.431,98.8,1.8125,4.0,437.0,21.2,396.9,15.39,18.0 129,0.88125,0.0,21.89,0.0,0.624,5.637,94.7,1.9799,4.0,437.0,21.2,396.9,18.34,14.3 130,0.34006,0.0,21.89,0.0,0.624,6.458,98.9,2.1185,4.0,437.0,21.2,395.04,12.6,19.2 131,1.19294,0.0,21.89,0.0,0.624,6.326,97.7,2.271,4.0,437.0,21.2,396.9,12.26,19.6 132,0.59005,0.0,21.89,0.0,0.624,6.372,97.9,2.3274,4.0,437.0,21.2,385.76,11.12,23.0 133,0.32982,0.0,21.89,0.0,0.624,5.822,95.4,2.4699,4.0,437.0,21.2,388.69,15.03,18.4 134,0.97617,0.0,21.89,0.0,0.624,5.757,98.4,2.346,4.0,437.0,21.2,262.76,17.31,15.6 135,0.55778,0.0,21.89,0.0,0.624,6.335,98.2,2.1107,4.0,437.0,21.2,394.67,16.96,18.1 136,0.32264,0.0,21.89,0.0,0.624,5.942,93.5,1.9669,4.0,437.0,21.2,378.25,16.9,17.4 137,0.35233,0.0,21.89,0.0,0.624,6.454,98.4,1.8498,4.0,437.0,21.2,394.08,14.59,17.1 138,0.2498,0.0,21.89,0.0,0.624,5.857,98.2,1.6686,4.0,437.0,21.2,392.04,21.32,13.3 139,0.54452,0.0,21.89,0.0,0.624,6.151,97.9,1.6687,4.0,437.0,21.2,396.9,18.46,17.8 
140,0.2909,0.0,21.89,0.0,0.624,6.174,93.6,1.6119,4.0,437.0,21.2,388.08,24.16,14.0 141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41,14.4 142,3.32105,0.0,19.58,1.0,0.871,5.403,100.0,1.3216,5.0,403.0,14.7,396.9,26.82,13.4 143,4.0974,0.0,19.58,0.0,0.871,5.468,100.0,1.4118,5.0,403.0,14.7,396.9,26.42,15.6 144,2.77974,0.0,19.58,0.0,0.871,4.903,97.8,1.3459,5.0,403.0,14.7,396.9,29.29,11.8 145,2.37934,0.0,19.58,0.0,0.871,6.13,100.0,1.4191,5.0,403.0,14.7,172.91,27.8,13.8 146,2.15505,0.0,19.58,0.0,0.871,5.628,100.0,1.5166,5.0,403.0,14.7,169.27,16.65,15.6 147,2.36862,0.0,19.58,0.0,0.871,4.926,95.7,1.4608,5.0,403.0,14.7,391.71,29.53,14.6 148,2.33099,0.0,19.58,0.0,0.871,5.186,93.8,1.5296,5.0,403.0,14.7,356.99,28.32,17.8 149,2.73397,0.0,19.58,0.0,0.871,5.597,94.9,1.5257,5.0,403.0,14.7,351.85,21.45,15.4 150,1.6566,0.0,19.58,0.0,0.871,6.122,97.3,1.618,5.0,403.0,14.7,372.8,14.1,21.5 151,1.49632,0.0,19.58,0.0,0.871,5.404,100.0,1.5916,5.0,403.0,14.7,341.6,13.28,19.6 152,1.12658,0.0,19.58,1.0,0.871,5.012,88.0,1.6102,5.0,403.0,14.7,343.28,12.12,15.3 153,2.14918,0.0,19.58,0.0,0.871,5.709,98.5,1.6232,5.0,403.0,14.7,261.95,15.79,19.4 154,1.41385,0.0,19.58,1.0,0.871,6.129,96.0,1.7494,5.0,403.0,14.7,321.02,15.12,17.0 155,3.53501,0.0,19.58,1.0,0.871,6.152,82.6,1.7455,5.0,403.0,14.7,88.01,15.02,15.6 156,2.44668,0.0,19.58,0.0,0.871,5.272,94.0,1.7364,5.0,403.0,14.7,88.63,16.14,13.1 157,1.22358,0.0,19.58,0.0,0.605,6.943,97.4,1.8773,5.0,403.0,14.7,363.43,4.59,41.3 158,1.34284,0.0,19.58,0.0,0.605,6.066,100.0,1.7573,5.0,403.0,14.7,353.89,6.43,24.3 159,1.42502,0.0,19.58,0.0,0.871,6.51,100.0,1.7659,5.0,403.0,14.7,364.31,7.39,23.3 160,1.27346,0.0,19.58,1.0,0.605,6.25,92.6,1.7984,5.0,403.0,14.7,338.92,5.5,27.0 161,1.46336,0.0,19.58,0.0,0.605,7.489,90.8,1.9709,5.0,403.0,14.7,374.43,1.73,50.0 162,1.83377,0.0,19.58,1.0,0.605,7.802,98.2,2.0407,5.0,403.0,14.7,389.61,1.92,50.0 163,1.51902,0.0,19.58,1.0,0.605,8.375,93.9,2.162,5.0,403.0,14.7,388.45,3.32,50.0 
164,2.24236,0.0,19.58,0.0,0.605,5.854,91.8,2.422,5.0,403.0,14.7,395.11,11.64,22.7 165,2.924,0.0,19.58,0.0,0.605,6.101,93.0,2.2834,5.0,403.0,14.7,240.16,9.81,25.0 166,2.01019,0.0,19.58,0.0,0.605,7.929,96.2,2.0459,5.0,403.0,14.7,369.3,3.7,50.0 167,1.80028,0.0,19.58,0.0,0.605,5.877,79.2,2.4259,5.0,403.0,14.7,227.61,12.14,23.8 168,2.3004,0.0,19.58,0.0,0.605,6.319,96.1,2.1,5.0,403.0,14.7,297.09,11.1,23.8 169,2.44953,0.0,19.58,0.0,0.605,6.402,95.2,2.2625,5.0,403.0,14.7,330.04,11.32,22.3 170,1.20742,0.0,19.58,0.0,0.605,5.875,94.6,2.4259,5.0,403.0,14.7,292.29,14.43,17.4 171,2.3139,0.0,19.58,0.0,0.605,5.88,97.3,2.3887,5.0,403.0,14.7,348.13,12.03,19.1 172,0.13914,0.0,4.05,0.0,0.51,5.572,88.5,2.5961,5.0,296.0,16.6,396.9,14.69,23.1 173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04,23.6 174,0.08447,0.0,4.05,0.0,0.51,5.859,68.7,2.7019,5.0,296.0,16.6,393.23,9.64,22.6 175,0.06664,0.0,4.05,0.0,0.51,6.546,33.1,3.1323,5.0,296.0,16.6,390.96,5.33,29.4 176,0.07022,0.0,4.05,0.0,0.51,6.02,47.2,3.5549,5.0,296.0,16.6,393.23,10.11,23.2 177,0.05425,0.0,4.05,0.0,0.51,6.315,73.4,3.3175,5.0,296.0,16.6,395.6,6.29,24.6 178,0.06642,0.0,4.05,0.0,0.51,6.86,74.4,2.9153,5.0,296.0,16.6,391.27,6.92,29.9 179,0.0578,0.0,2.46,0.0,0.488,6.98,58.4,2.829,3.0,193.0,17.8,396.9,5.04,37.2 180,0.06588,0.0,2.46,0.0,0.488,7.765,83.3,2.741,3.0,193.0,17.8,395.56,7.56,39.8 181,0.06888,0.0,2.46,0.0,0.488,6.144,62.2,2.5979,3.0,193.0,17.8,396.9,9.45,36.2 182,0.09103,0.0,2.46,0.0,0.488,7.155,92.2,2.7006,3.0,193.0,17.8,394.12,4.82,37.9 183,0.10008,0.0,2.46,0.0,0.488,6.563,95.6,2.847,3.0,193.0,17.8,396.9,5.68,32.5 184,0.08308,0.0,2.46,0.0,0.488,5.604,89.8,2.9879,3.0,193.0,17.8,391.0,13.98,26.4 185,0.06047,0.0,2.46,0.0,0.488,6.153,68.8,3.2797,3.0,193.0,17.8,387.11,13.15,29.6 186,0.05602,0.0,2.46,0.0,0.488,7.831,53.6,3.1992,3.0,193.0,17.8,392.63,4.45,50.0 187,0.07875,45.0,3.44,0.0,0.437,6.782,41.1,3.7886,5.0,398.0,15.2,393.87,6.68,32.0 
188,0.12579,45.0,3.44,0.0,0.437,6.556,29.1,4.5667,5.0,398.0,15.2,382.84,4.56,29.8 189,0.0837,45.0,3.44,0.0,0.437,7.185,38.9,4.5667,5.0,398.0,15.2,396.9,5.39,34.9 190,0.09068,45.0,3.44,0.0,0.437,6.951,21.5,6.4798,5.0,398.0,15.2,377.68,5.1,37.0 191,0.06911,45.0,3.44,0.0,0.437,6.739,30.8,6.4798,5.0,398.0,15.2,389.71,4.69,30.5 192,0.08664,45.0,3.44,0.0,0.437,7.178,26.3,6.4798,5.0,398.0,15.2,390.49,2.87,36.4 193,0.02187,60.0,2.93,0.0,0.401,6.8,9.9,6.2196,1.0,265.0,15.6,393.37,5.03,31.1 194,0.01439,60.0,2.93,0.0,0.401,6.604,18.8,6.2196,1.0,265.0,15.6,376.7,4.38,29.1 195,0.01381,80.0,0.46,0.0,0.422,7.875,32.0,5.6484,4.0,255.0,14.4,394.23,2.97,50.0 196,0.04011,80.0,1.52,0.0,0.404,7.287,34.1,7.309,2.0,329.0,12.6,396.9,4.08,33.3 197,0.04666,80.0,1.52,0.0,0.404,7.107,36.6,7.309,2.0,329.0,12.6,354.31,8.61,30.3 198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6 199,0.0315,95.0,1.47,0.0,0.403,6.975,15.3,7.6534,3.0,402.0,17.0,396.9,4.56,34.9 200,0.01778,95.0,1.47,0.0,0.403,7.135,13.9,7.6534,3.0,402.0,17.0,384.3,4.45,32.9 201,0.03445,82.5,2.03,0.0,0.415,6.162,38.4,6.27,2.0,348.0,14.7,393.77,7.43,24.1 202,0.02177,82.5,2.03,0.0,0.415,7.61,15.7,6.27,2.0,348.0,14.7,395.38,3.11,42.3 203,0.0351,95.0,2.68,0.0,0.4161,7.853,33.2,5.118,4.0,224.0,14.7,392.78,3.81,48.5 204,0.02009,95.0,2.68,0.0,0.4161,8.034,31.9,5.118,4.0,224.0,14.7,390.55,2.88,50.0 205,0.13642,0.0,10.59,0.0,0.489,5.891,22.3,3.9454,4.0,277.0,18.6,396.9,10.87,22.6 206,0.22969,0.0,10.59,0.0,0.489,6.326,52.5,4.3549,4.0,277.0,18.6,394.87,10.97,24.4 207,0.25199,0.0,10.59,0.0,0.489,5.783,72.7,4.3549,4.0,277.0,18.6,389.43,18.06,22.5 208,0.13587,0.0,10.59,1.0,0.489,6.064,59.1,4.2392,4.0,277.0,18.6,381.32,14.66,24.4 209,0.43571,0.0,10.59,1.0,0.489,5.344,100.0,3.875,4.0,277.0,18.6,396.9,23.09,20.0 210,0.17446,0.0,10.59,1.0,0.489,5.96,92.1,3.8771,4.0,277.0,18.6,393.25,17.27,21.7 211,0.37578,0.0,10.59,1.0,0.489,5.404,88.6,3.665,4.0,277.0,18.6,395.24,23.98,19.3 
212,0.21719,0.0,10.59,1.0,0.489,5.807,53.8,3.6526,4.0,277.0,18.6,390.94,16.03,22.4 213,0.14052,0.0,10.59,0.0,0.489,6.375,32.3,3.9454,4.0,277.0,18.6,385.81,9.38,28.1 214,0.28955,0.0,10.59,0.0,0.489,5.412,9.8,3.5875,4.0,277.0,18.6,348.93,29.55,23.7 215,0.19802,0.0,10.59,0.0,0.489,6.182,42.4,3.9454,4.0,277.0,18.6,393.63,9.47,25.0 216,0.0456,0.0,13.89,1.0,0.55,5.888,56.0,3.1121,5.0,276.0,16.4,392.8,13.51,23.3 217,0.07013,0.0,13.89,0.0,0.55,6.642,85.1,3.4211,5.0,276.0,16.4,392.78,9.69,28.7 218,0.11069,0.0,13.89,1.0,0.55,5.951,93.8,2.8893,5.0,276.0,16.4,396.9,17.92,21.5 219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0 220,0.35809,0.0,6.2,1.0,0.507,6.951,88.5,2.8617,8.0,307.0,17.4,391.7,9.71,26.7 221,0.40771,0.0,6.2,1.0,0.507,6.164,91.3,3.048,8.0,307.0,17.4,395.24,21.46,21.7 222,0.62356,0.0,6.2,1.0,0.507,6.879,77.7,3.2721,8.0,307.0,17.4,390.39,9.93,27.5 223,0.6147,0.0,6.2,0.0,0.507,6.618,80.8,3.2721,8.0,307.0,17.4,396.9,7.6,30.1 224,0.31533,0.0,6.2,0.0,0.504,8.266,78.3,2.8944,8.0,307.0,17.4,385.05,4.14,44.8 225,0.52693,0.0,6.2,0.0,0.504,8.725,83.0,2.8944,8.0,307.0,17.4,382.0,4.63,50.0 226,0.38214,0.0,6.2,0.0,0.504,8.04,86.5,3.2157,8.0,307.0,17.4,387.38,3.13,37.6 227,0.41238,0.0,6.2,0.0,0.504,7.163,79.9,3.2157,8.0,307.0,17.4,372.08,6.36,31.6 228,0.29819,0.0,6.2,0.0,0.504,7.686,17.0,3.3751,8.0,307.0,17.4,377.51,3.92,46.7 229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5 230,0.537,0.0,6.2,0.0,0.504,5.981,68.1,3.6715,8.0,307.0,17.4,378.35,11.65,24.3 231,0.46296,0.0,6.2,0.0,0.504,7.412,76.9,3.6715,8.0,307.0,17.4,376.14,5.25,31.7 232,0.57529,0.0,6.2,0.0,0.507,8.337,73.3,3.8384,8.0,307.0,17.4,385.91,2.47,41.7 233,0.33147,0.0,6.2,0.0,0.507,8.247,70.4,3.6519,8.0,307.0,17.4,378.95,3.95,48.3 234,0.44791,0.0,6.2,1.0,0.507,6.726,66.5,3.6519,8.0,307.0,17.4,360.2,8.05,29.0 235,0.33045,0.0,6.2,0.0,0.507,6.086,61.5,3.6519,8.0,307.0,17.4,376.75,10.88,24.0 
236,0.52058,0.0,6.2,1.0,0.507,6.631,76.5,4.148,8.0,307.0,17.4,388.45,9.54,25.1 237,0.51183,0.0,6.2,0.0,0.507,7.358,71.6,4.148,8.0,307.0,17.4,390.07,4.73,31.5 238,0.08244,30.0,4.93,0.0,0.428,6.481,18.5,6.1899,6.0,300.0,16.6,379.41,6.36,23.7 239,0.09252,30.0,4.93,0.0,0.428,6.606,42.2,6.1899,6.0,300.0,16.6,383.78,7.37,23.3 240,0.11329,30.0,4.93,0.0,0.428,6.897,54.3,6.3361,6.0,300.0,16.6,391.25,11.38,22.0 241,0.10612,30.0,4.93,0.0,0.428,6.095,65.1,6.3361,6.0,300.0,16.6,394.62,12.4,20.1 242,0.1029,30.0,4.93,0.0,0.428,6.358,52.9,7.0355,6.0,300.0,16.6,372.75,11.22,22.2 243,0.12757,30.0,4.93,0.0,0.428,6.393,7.8,7.0355,6.0,300.0,16.6,374.71,5.19,23.7 244,0.20608,22.0,5.86,0.0,0.431,5.593,76.5,7.9549,7.0,330.0,19.1,372.49,12.5,17.6 245,0.19133,22.0,5.86,0.0,0.431,5.605,70.2,7.9549,7.0,330.0,19.1,389.13,18.46,18.5 246,0.33983,22.0,5.86,0.0,0.431,6.108,34.9,8.0555,7.0,330.0,19.1,390.18,9.16,24.3 247,0.19657,22.0,5.86,0.0,0.431,6.226,79.2,8.0555,7.0,330.0,19.1,376.14,10.15,20.5 248,0.16439,22.0,5.86,0.0,0.431,6.433,49.1,7.8265,7.0,330.0,19.1,374.71,9.52,24.5 249,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56,26.2 250,0.1403,22.0,5.86,0.0,0.431,6.487,13.0,7.3967,7.0,330.0,19.1,396.28,5.9,24.4 251,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8 252,0.08221,22.0,5.86,0.0,0.431,6.957,6.8,8.9067,7.0,330.0,19.1,386.09,3.53,29.6 253,0.36894,22.0,5.86,0.0,0.431,8.259,8.4,8.9067,7.0,330.0,19.1,396.9,3.54,42.8 254,0.04819,80.0,3.64,0.0,0.392,6.108,32.0,9.2203,1.0,315.0,16.4,392.89,6.57,21.9 255,0.03548,80.0,3.64,0.0,0.392,5.876,19.1,9.2203,1.0,315.0,16.4,395.18,9.25,20.9 256,0.01538,90.0,3.75,0.0,0.394,7.454,34.2,6.3361,3.0,244.0,15.9,386.34,3.11,44.0 257,0.61154,20.0,3.97,0.0,0.647,8.704,86.9,1.801,5.0,264.0,13.0,389.7,5.12,50.0 258,0.66351,20.0,3.97,0.0,0.647,7.333,100.0,1.8946,5.0,264.0,13.0,383.29,7.79,36.0 259,0.65665,20.0,3.97,0.0,0.647,6.842,100.0,2.0107,5.0,264.0,13.0,391.93,6.9,30.1 
260,0.54011,20.0,3.97,0.0,0.647,7.203,81.8,2.1121,5.0,264.0,13.0,392.8,9.59,33.8 261,0.53412,20.0,3.97,0.0,0.647,7.52,89.4,2.1398,5.0,264.0,13.0,388.37,7.26,43.1 262,0.52014,20.0,3.97,0.0,0.647,8.398,91.5,2.2885,5.0,264.0,13.0,386.86,5.91,48.8 263,0.82526,20.0,3.97,0.0,0.647,7.327,94.5,2.0788,5.0,264.0,13.0,393.42,11.25,31.0 264,0.55007,20.0,3.97,0.0,0.647,7.206,91.6,1.9301,5.0,264.0,13.0,387.89,8.1,36.5 265,0.76162,20.0,3.97,0.0,0.647,5.56,62.8,1.9865,5.0,264.0,13.0,392.4,10.45,22.8 266,0.7857,20.0,3.97,0.0,0.647,7.014,84.6,2.1329,5.0,264.0,13.0,384.07,14.79,30.7 267,0.57834,20.0,3.97,0.0,0.575,8.297,67.0,2.4216,5.0,264.0,13.0,384.54,7.44,50.0 268,0.5405,20.0,3.97,0.0,0.575,7.47,52.6,2.872,5.0,264.0,13.0,390.3,3.16,43.5 269,0.09065,20.0,6.96,1.0,0.464,5.92,61.5,3.9175,3.0,223.0,18.6,391.34,13.65,20.7 270,0.29916,20.0,6.96,0.0,0.464,5.856,42.1,4.429,3.0,223.0,18.6,388.65,13.0,21.1 271,0.16211,20.0,6.96,0.0,0.464,6.24,16.3,4.429,3.0,223.0,18.6,396.9,6.59,25.2 272,0.1146,20.0,6.96,0.0,0.464,6.538,58.7,3.9175,3.0,223.0,18.6,394.96,7.73,24.4 273,0.22188,20.0,6.96,1.0,0.464,7.691,51.8,4.3665,3.0,223.0,18.6,390.77,6.58,35.2 274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53,32.4 275,0.09604,40.0,6.41,0.0,0.447,6.854,42.8,4.2673,4.0,254.0,17.6,396.9,2.98,32.0 276,0.10469,40.0,6.41,1.0,0.447,7.267,49.0,4.7872,4.0,254.0,17.6,389.25,6.05,33.2 277,0.06127,40.0,6.41,1.0,0.447,6.826,27.6,4.8628,4.0,254.0,17.6,393.45,4.16,33.1 278,0.07978,40.0,6.41,0.0,0.447,6.482,32.1,4.1403,4.0,254.0,17.6,396.9,7.19,29.1 279,0.21038,20.0,3.33,0.0,0.4429,6.812,32.2,4.1007,5.0,216.0,14.9,396.9,4.85,35.1 280,0.03578,20.0,3.33,0.0,0.4429,7.82,64.5,4.6947,5.0,216.0,14.9,387.31,3.76,45.4 281,0.03705,20.0,3.33,0.0,0.4429,6.968,37.2,5.2447,5.0,216.0,14.9,392.23,4.59,35.4 282,0.06129,20.0,3.33,1.0,0.4429,7.645,49.7,5.2119,5.0,216.0,14.9,377.07,3.01,46.0 283,0.01501,90.0,1.21,1.0,0.401,7.923,24.8,5.885,1.0,198.0,13.6,395.52,3.16,50.0 
284,0.00906,90.0,2.97,0.0,0.4,7.088,20.8,7.3073,1.0,285.0,15.3,394.72,7.85,32.2 285,0.01096,55.0,2.25,0.0,0.389,6.453,31.9,7.3073,1.0,300.0,15.3,394.72,8.23,22.0 286,0.01965,80.0,1.76,0.0,0.385,6.23,31.5,9.0892,1.0,241.0,18.2,341.6,12.93,20.1 287,0.03871,52.5,5.32,0.0,0.405,6.209,31.3,7.3172,6.0,293.0,16.6,396.9,7.14,23.2 288,0.0459,52.5,5.32,0.0,0.405,6.315,45.6,7.3172,6.0,293.0,16.6,396.9,7.6,22.3 289,0.04297,52.5,5.32,0.0,0.405,6.565,22.9,7.3172,6.0,293.0,16.6,371.72,9.51,24.8 290,0.03502,80.0,4.95,0.0,0.411,6.861,27.9,5.1167,4.0,245.0,19.2,396.9,3.33,28.5 291,0.07886,80.0,4.95,0.0,0.411,7.148,27.7,5.1167,4.0,245.0,19.2,396.9,3.56,37.3 292,0.03615,80.0,4.95,0.0,0.411,6.63,23.4,5.1167,4.0,245.0,19.2,396.9,4.7,27.9 293,0.08265,0.0,13.92,0.0,0.437,6.127,18.4,5.5027,4.0,289.0,16.0,396.9,8.58,23.9 294,0.08199,0.0,13.92,0.0,0.437,6.009,42.3,5.5027,4.0,289.0,16.0,396.9,10.4,21.7 295,0.12932,0.0,13.92,0.0,0.437,6.678,31.1,5.9604,4.0,289.0,16.0,396.9,6.27,28.6 296,0.05372,0.0,13.92,0.0,0.437,6.549,51.0,5.9604,4.0,289.0,16.0,392.85,7.39,27.1 297,0.14103,0.0,13.92,0.0,0.437,5.79,58.0,6.32,4.0,289.0,16.0,396.9,15.84,20.3 298,0.06466,70.0,2.24,0.0,0.4,6.345,20.1,7.8278,5.0,358.0,14.8,368.24,4.97,22.5 299,0.05561,70.0,2.24,0.0,0.4,7.041,10.0,7.8278,5.0,358.0,14.8,371.58,4.74,29.0 300,0.04417,70.0,2.24,0.0,0.4,6.871,47.4,7.8278,5.0,358.0,14.8,390.86,6.07,24.8 301,0.03537,34.0,6.09,0.0,0.433,6.59,40.4,5.4917,7.0,329.0,16.1,395.75,9.5,22.0 302,0.09266,34.0,6.09,0.0,0.433,6.495,18.4,5.4917,7.0,329.0,16.1,383.61,8.67,26.4 303,0.1,34.0,6.09,0.0,0.433,6.982,17.7,5.4917,7.0,329.0,16.1,390.43,4.86,33.1 304,0.05515,33.0,2.18,0.0,0.472,7.236,41.1,4.022,7.0,222.0,18.4,393.68,6.93,36.1 305,0.05479,33.0,2.18,0.0,0.472,6.616,58.1,3.37,7.0,222.0,18.4,393.36,8.93,28.4 306,0.07503,33.0,2.18,0.0,0.472,7.42,71.9,3.0992,7.0,222.0,18.4,396.9,6.47,33.4 307,0.04932,33.0,2.18,0.0,0.472,6.849,70.3,3.1827,7.0,222.0,18.4,396.9,7.53,28.2 
308,0.49298,0.0,9.9,0.0,0.544,6.635,82.5,3.3175,4.0,304.0,18.4,396.9,4.54,22.8 309,0.3494,0.0,9.9,0.0,0.544,5.972,76.7,3.1025,4.0,304.0,18.4,396.24,9.97,20.3 310,2.63548,0.0,9.9,0.0,0.544,4.973,37.8,2.5194,4.0,304.0,18.4,350.45,12.64,16.1 311,0.79041,0.0,9.9,0.0,0.544,6.122,52.8,2.6403,4.0,304.0,18.4,396.9,5.98,22.1 312,0.26169,0.0,9.9,0.0,0.544,6.023,90.4,2.834,4.0,304.0,18.4,396.3,11.72,19.4 313,0.26938,0.0,9.9,0.0,0.544,6.266,82.8,3.2628,4.0,304.0,18.4,393.39,7.9,21.6 314,0.3692,0.0,9.9,0.0,0.544,6.567,87.3,3.6023,4.0,304.0,18.4,395.69,9.28,23.8 315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2 316,0.31827,0.0,9.9,0.0,0.544,5.914,83.2,3.9986,4.0,304.0,18.4,390.7,18.33,17.8 317,0.24522,0.0,9.9,0.0,0.544,5.782,71.7,4.0317,4.0,304.0,18.4,396.9,15.94,19.8 318,0.40202,0.0,9.9,0.0,0.544,6.382,67.2,3.5325,4.0,304.0,18.4,395.21,10.36,23.1 319,0.47547,0.0,9.9,0.0,0.544,6.113,58.8,4.0019,4.0,304.0,18.4,396.23,12.73,21.0 320,0.1676,0.0,7.38,0.0,0.493,6.426,52.3,4.5404,5.0,287.0,19.6,396.9,7.2,23.8 321,0.18159,0.0,7.38,0.0,0.493,6.376,54.3,4.5404,5.0,287.0,19.6,396.9,6.87,23.1 322,0.35114,0.0,7.38,0.0,0.493,6.041,49.9,4.7211,5.0,287.0,19.6,396.9,7.7,20.4 323,0.28392,0.0,7.38,0.0,0.493,5.708,74.3,4.7211,5.0,287.0,19.6,391.13,11.74,18.5 324,0.34109,0.0,7.38,0.0,0.493,6.415,40.1,4.7211,5.0,287.0,19.6,396.9,6.12,25.0 325,0.19186,0.0,7.38,0.0,0.493,6.431,14.7,5.4159,5.0,287.0,19.6,393.68,5.08,24.6 326,0.30347,0.0,7.38,0.0,0.493,6.312,28.9,5.4159,5.0,287.0,19.6,396.9,6.15,23.0 327,0.24103,0.0,7.38,0.0,0.493,6.083,43.7,5.4159,5.0,287.0,19.6,396.9,12.79,22.2 328,0.06617,0.0,3.24,0.0,0.46,5.868,25.8,5.2146,4.0,430.0,16.9,382.44,9.97,19.3 329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6 330,0.04544,0.0,3.24,0.0,0.46,6.144,32.2,5.8736,4.0,430.0,16.9,368.57,9.09,19.8 331,0.05023,35.0,6.06,0.0,0.4379,5.706,28.4,6.6407,1.0,304.0,16.9,394.02,12.43,17.1 
332,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83,19.4 333,0.05083,0.0,5.19,0.0,0.515,6.316,38.1,6.4584,5.0,224.0,20.2,389.71,5.68,22.2 334,0.03738,0.0,5.19,0.0,0.515,6.31,38.5,6.4584,5.0,224.0,20.2,389.4,6.75,20.7 335,0.03961,0.0,5.19,0.0,0.515,6.037,34.5,5.9853,5.0,224.0,20.2,396.9,8.01,21.1 336,0.03427,0.0,5.19,0.0,0.515,5.869,46.3,5.2311,5.0,224.0,20.2,396.9,9.8,19.5 337,0.03041,0.0,5.19,0.0,0.515,5.895,59.6,5.615,5.0,224.0,20.2,394.81,10.56,18.5 338,0.03306,0.0,5.19,0.0,0.515,6.059,37.3,4.8122,5.0,224.0,20.2,396.14,8.51,20.6 339,0.05497,0.0,5.19,0.0,0.515,5.985,45.4,4.8122,5.0,224.0,20.2,396.9,9.74,19.0 340,0.06151,0.0,5.19,0.0,0.515,5.968,58.5,4.8122,5.0,224.0,20.2,396.9,9.29,18.7 341,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,1.0,284.0,15.5,394.74,5.49,32.7 342,0.02498,0.0,1.89,0.0,0.518,6.54,59.7,6.2669,1.0,422.0,15.9,389.96,8.65,16.5 343,0.02543,55.0,3.78,0.0,0.484,6.696,56.4,5.7321,5.0,370.0,17.6,396.9,7.18,23.9 344,0.03049,55.0,3.78,0.0,0.484,6.874,28.1,6.4654,5.0,370.0,17.6,387.97,4.61,31.2 345,0.03113,0.0,4.39,0.0,0.442,6.014,48.5,8.0136,3.0,352.0,18.8,385.64,10.53,17.5 346,0.06162,0.0,4.39,0.0,0.442,5.898,52.3,8.0136,3.0,352.0,18.8,364.61,12.67,17.2 347,0.0187,85.0,4.15,0.0,0.429,6.516,27.7,8.5353,4.0,351.0,17.9,392.43,6.36,23.1 348,0.01501,80.0,2.01,0.0,0.435,6.635,29.7,8.344,4.0,280.0,17.0,390.94,5.99,24.5 349,0.02899,40.0,1.25,0.0,0.429,6.939,34.5,8.7921,1.0,335.0,19.7,389.85,5.89,26.6 350,0.06211,40.0,1.25,0.0,0.429,6.49,44.4,8.7921,1.0,335.0,19.7,396.9,5.98,22.9 351,0.0795,60.0,1.69,0.0,0.411,6.579,35.9,10.7103,4.0,411.0,18.3,370.78,5.49,24.1 352,0.07244,60.0,1.69,0.0,0.411,5.884,18.5,10.7103,4.0,411.0,18.3,392.33,7.79,18.6 353,0.01709,90.0,2.02,0.0,0.41,6.728,36.1,12.1265,5.0,187.0,17.0,384.46,4.5,30.1 354,0.04301,80.0,1.91,0.0,0.413,5.663,21.9,10.5857,4.0,334.0,22.0,382.8,8.05,18.2 355,0.10659,80.0,1.91,0.0,0.413,5.936,19.5,10.5857,4.0,334.0,22.0,376.04,5.57,20.6 
356,8.98296,0.0,18.1,1.0,0.77,6.212,97.4,2.1222,24.0,666.0,20.2,377.73,17.6,17.8 357,3.8497,0.0,18.1,1.0,0.77,6.395,91.0,2.5052,24.0,666.0,20.2,391.34,13.27,21.7 358,5.20177,0.0,18.1,1.0,0.77,6.127,83.4,2.7227,24.0,666.0,20.2,395.43,11.48,22.7 359,4.26131,0.0,18.1,0.0,0.77,6.112,81.3,2.5091,24.0,666.0,20.2,390.74,12.67,22.6 360,4.54192,0.0,18.1,0.0,0.77,6.398,88.0,2.5182,24.0,666.0,20.2,374.56,7.79,25.0 361,3.83684,0.0,18.1,0.0,0.77,6.251,91.1,2.2955,24.0,666.0,20.2,350.65,14.19,19.9 362,3.67822,0.0,18.1,0.0,0.77,5.362,96.2,2.1036,24.0,666.0,20.2,380.79,10.19,20.8 363,4.22239,0.0,18.1,1.0,0.77,5.803,89.0,1.9047,24.0,666.0,20.2,353.04,14.64,16.8 364,3.47428,0.0,18.1,1.0,0.718,8.78,82.9,1.9047,24.0,666.0,20.2,354.55,5.29,21.9 365,4.55587,0.0,18.1,0.0,0.718,3.561,87.9,1.6132,24.0,666.0,20.2,354.7,7.12,27.5 366,3.69695,0.0,18.1,0.0,0.718,4.963,91.4,1.7523,24.0,666.0,20.2,316.03,14.0,21.9 367,13.5222,0.0,18.1,0.0,0.631,3.863,100.0,1.5106,24.0,666.0,20.2,131.42,13.33,23.1 368,4.89822,0.0,18.1,0.0,0.631,4.97,100.0,1.3325,24.0,666.0,20.2,375.52,3.26,50.0 369,5.66998,0.0,18.1,1.0,0.631,6.683,96.8,1.3567,24.0,666.0,20.2,375.33,3.73,50.0 370,6.53876,0.0,18.1,1.0,0.631,7.016,97.5,1.2024,24.0,666.0,20.2,392.05,2.96,50.0 371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0 372,8.26725,0.0,18.1,1.0,0.668,5.875,89.6,1.1296,24.0,666.0,20.2,347.88,8.88,50.0 373,11.1081,0.0,18.1,0.0,0.668,4.906,100.0,1.1742,24.0,666.0,20.2,396.9,34.77,13.8 374,18.4982,0.0,18.1,0.0,0.668,4.138,100.0,1.137,24.0,666.0,20.2,396.9,37.97,13.8 375,19.6091,0.0,18.1,0.0,0.671,7.313,97.9,1.3163,24.0,666.0,20.2,396.9,13.44,15.0 376,15.288,0.0,18.1,0.0,0.671,6.649,93.3,1.3449,24.0,666.0,20.2,363.02,23.24,13.9 377,9.82349,0.0,18.1,0.0,0.671,6.794,98.8,1.358,24.0,666.0,20.2,396.9,21.24,13.3 378,23.6482,0.0,18.1,0.0,0.671,6.38,96.2,1.3861,24.0,666.0,20.2,396.9,23.69,13.1 379,17.8667,0.0,18.1,0.0,0.671,6.223,100.0,1.3861,24.0,666.0,20.2,393.74,21.78,10.2 
380,88.9762,0.0,18.1,0.0,0.671,6.968,91.9,1.4165,24.0,666.0,20.2,396.9,17.21,10.4 381,15.8744,0.0,18.1,0.0,0.671,6.545,99.1,1.5192,24.0,666.0,20.2,396.9,21.08,10.9 382,9.18702,0.0,18.1,0.0,0.7,5.536,100.0,1.5804,24.0,666.0,20.2,396.9,23.6,11.3 383,7.99248,0.0,18.1,0.0,0.7,5.52,100.0,1.5331,24.0,666.0,20.2,396.9,24.56,12.3 384,20.0849,0.0,18.1,0.0,0.7,4.368,91.2,1.4395,24.0,666.0,20.2,285.83,30.63,8.8 385,16.8118,0.0,18.1,0.0,0.7,5.277,98.1,1.4261,24.0,666.0,20.2,396.9,30.81,7.2 386,24.3938,0.0,18.1,0.0,0.7,4.652,100.0,1.4672,24.0,666.0,20.2,396.9,28.28,10.5 387,22.5971,0.0,18.1,0.0,0.7,5.0,89.5,1.5184,24.0,666.0,20.2,396.9,31.99,7.4 388,14.3337,0.0,18.1,0.0,0.7,4.88,100.0,1.5895,24.0,666.0,20.2,372.92,30.62,10.2 389,8.15174,0.0,18.1,0.0,0.7,5.39,98.9,1.7281,24.0,666.0,20.2,396.9,20.85,11.5 390,6.96215,0.0,18.1,0.0,0.7,5.713,97.0,1.9265,24.0,666.0,20.2,394.43,17.11,15.1 391,5.29305,0.0,18.1,0.0,0.7,6.051,82.5,2.1678,24.0,666.0,20.2,378.38,18.76,23.2 392,11.5779,0.0,18.1,0.0,0.7,5.036,97.0,1.77,24.0,666.0,20.2,396.9,25.68,9.7 393,8.64476,0.0,18.1,0.0,0.693,6.193,92.6,1.7912,24.0,666.0,20.2,396.9,15.17,13.8 394,13.3598,0.0,18.1,0.0,0.693,5.887,94.7,1.7821,24.0,666.0,20.2,396.9,16.35,12.7 395,8.71675,0.0,18.1,0.0,0.693,6.471,98.8,1.7257,24.0,666.0,20.2,391.98,17.12,13.1 396,5.87205,0.0,18.1,0.0,0.693,6.405,96.0,1.6768,24.0,666.0,20.2,396.9,19.37,12.5 397,7.67202,0.0,18.1,0.0,0.693,5.747,98.9,1.6334,24.0,666.0,20.2,393.1,19.92,8.5 398,38.3518,0.0,18.1,0.0,0.693,5.453,100.0,1.4896,24.0,666.0,20.2,396.9,30.59,5.0 399,9.91655,0.0,18.1,0.0,0.693,5.852,77.8,1.5004,24.0,666.0,20.2,338.16,29.97,6.3 400,25.0461,0.0,18.1,0.0,0.693,5.987,100.0,1.5888,24.0,666.0,20.2,396.9,26.77,5.6 401,14.2362,0.0,18.1,0.0,0.693,6.343,100.0,1.5741,24.0,666.0,20.2,396.9,20.32,7.2 402,9.59571,0.0,18.1,0.0,0.693,6.404,100.0,1.639,24.0,666.0,20.2,376.11,20.31,12.1 403,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77,8.3 
404,41.5292,0.0,18.1,0.0,0.693,5.531,85.4,1.6074,24.0,666.0,20.2,329.46,27.38,8.5 405,67.9208,0.0,18.1,0.0,0.693,5.683,100.0,1.4254,24.0,666.0,20.2,384.97,22.98,5.0 406,20.7162,0.0,18.1,0.0,0.659,4.138,100.0,1.1781,24.0,666.0,20.2,370.22,23.34,11.9 407,11.9511,0.0,18.1,0.0,0.659,5.608,100.0,1.2852,24.0,666.0,20.2,332.09,12.13,27.9 408,7.40389,0.0,18.1,0.0,0.597,5.617,97.9,1.4547,24.0,666.0,20.2,314.64,26.4,17.2 409,14.4383,0.0,18.1,0.0,0.597,6.852,100.0,1.4655,24.0,666.0,20.2,179.36,19.78,27.5 410,51.1358,0.0,18.1,0.0,0.597,5.757,100.0,1.413,24.0,666.0,20.2,2.6,10.11,15.0 411,14.0507,0.0,18.1,0.0,0.597,6.657,100.0,1.5275,24.0,666.0,20.2,35.05,21.22,17.2 412,18.811,0.0,18.1,0.0,0.597,4.628,100.0,1.5539,24.0,666.0,20.2,28.79,34.37,17.9 413,28.6558,0.0,18.1,0.0,0.597,5.155,100.0,1.5894,24.0,666.0,20.2,210.97,20.08,16.3 414,45.7461,0.0,18.1,0.0,0.693,4.519,100.0,1.6582,24.0,666.0,20.2,88.27,36.98,7.0 415,18.0846,0.0,18.1,0.0,0.679,6.434,100.0,1.8347,24.0,666.0,20.2,27.25,29.05,7.2 416,10.8342,0.0,18.1,0.0,0.679,6.782,90.8,1.8195,24.0,666.0,20.2,21.57,25.79,7.5 417,25.9406,0.0,18.1,0.0,0.679,5.304,89.1,1.6475,24.0,666.0,20.2,127.36,26.64,10.4 418,73.5341,0.0,18.1,0.0,0.679,5.957,100.0,1.8026,24.0,666.0,20.2,16.45,20.62,8.8 419,11.8123,0.0,18.1,0.0,0.718,6.824,76.5,1.794,24.0,666.0,20.2,48.45,22.74,8.4 420,11.0874,0.0,18.1,0.0,0.718,6.411,100.0,1.8589,24.0,666.0,20.2,318.75,15.02,16.7 421,7.02259,0.0,18.1,0.0,0.718,6.006,95.3,1.8746,24.0,666.0,20.2,319.98,15.7,14.2 422,12.0482,0.0,18.1,0.0,0.614,5.648,87.6,1.9512,24.0,666.0,20.2,291.55,14.1,20.8 423,7.05042,0.0,18.1,0.0,0.614,6.103,85.1,2.0218,24.0,666.0,20.2,2.52,23.29,13.4 424,8.79212,0.0,18.1,0.0,0.584,5.565,70.6,2.0635,24.0,666.0,20.2,3.65,17.16,11.7 425,15.8603,0.0,18.1,0.0,0.679,5.896,95.4,1.9096,24.0,666.0,20.2,7.68,24.39,8.3 426,12.2472,0.0,18.1,0.0,0.584,5.837,59.7,1.9976,24.0,666.0,20.2,24.65,15.69,10.2 427,37.6619,0.0,18.1,0.0,0.679,6.202,78.7,1.8629,24.0,666.0,20.2,18.82,14.52,10.9 
428,7.36711,0.0,18.1,0.0,0.679,6.193,78.1,1.9356,24.0,666.0,20.2,96.73,21.52,11.0 429,9.33889,0.0,18.1,0.0,0.679,6.38,95.6,1.9682,24.0,666.0,20.2,60.72,24.08,9.5 430,8.49213,0.0,18.1,0.0,0.584,6.348,86.1,2.0527,24.0,666.0,20.2,83.45,17.64,14.5 431,10.0623,0.0,18.1,0.0,0.584,6.833,94.3,2.0882,24.0,666.0,20.2,81.33,19.69,14.1 432,6.44405,0.0,18.1,0.0,0.584,6.425,74.8,2.2004,24.0,666.0,20.2,97.95,12.03,16.1 433,5.58107,0.0,18.1,0.0,0.713,6.436,87.9,2.3158,24.0,666.0,20.2,100.19,16.22,14.3 434,13.9134,0.0,18.1,0.0,0.713,6.208,95.0,2.2222,24.0,666.0,20.2,100.63,15.17,11.7 435,11.1604,0.0,18.1,0.0,0.74,6.629,94.6,2.1247,24.0,666.0,20.2,109.85,23.27,13.4 436,14.4208,0.0,18.1,0.0,0.74,6.461,93.3,2.0026,24.0,666.0,20.2,27.49,18.05,9.6 437,15.1772,0.0,18.1,0.0,0.74,6.152,100.0,1.9142,24.0,666.0,20.2,9.32,26.45,8.7 438,13.6781,0.0,18.1,0.0,0.74,5.935,87.9,1.8206,24.0,666.0,20.2,68.95,34.02,8.4 439,9.39063,0.0,18.1,0.0,0.74,5.627,93.9,1.8172,24.0,666.0,20.2,396.9,22.88,12.8 440,22.0511,0.0,18.1,0.0,0.74,5.818,92.4,1.8662,24.0,666.0,20.2,391.45,22.11,10.5 441,9.72418,0.0,18.1,0.0,0.74,6.406,97.2,2.0651,24.0,666.0,20.2,385.96,19.52,17.1 442,5.66637,0.0,18.1,0.0,0.74,6.219,100.0,2.0048,24.0,666.0,20.2,395.69,16.59,18.4 443,9.96654,0.0,18.1,0.0,0.74,6.485,100.0,1.9784,24.0,666.0,20.2,386.73,18.85,15.4 444,12.8023,0.0,18.1,0.0,0.74,5.854,96.6,1.8956,24.0,666.0,20.2,240.52,23.79,10.8 445,0.6718,0.0,18.1,0.0,0.74,6.459,94.8,1.9879,24.0,666.0,20.2,43.06,23.98,11.8 446,6.28807,0.0,18.1,0.0,0.74,6.341,96.4,2.072,24.0,666.0,20.2,318.01,17.79,14.9 447,9.92485,0.0,18.1,0.0,0.74,6.251,96.6,2.198,24.0,666.0,20.2,388.52,16.44,12.6 448,9.32909,0.0,18.1,0.0,0.713,6.185,98.7,2.2616,24.0,666.0,20.2,396.9,18.13,14.1 449,7.52601,0.0,18.1,0.0,0.713,6.417,98.3,2.185,24.0,666.0,20.2,304.21,19.31,13.0 450,6.71772,0.0,18.1,0.0,0.713,6.749,92.6,2.3236,24.0,666.0,20.2,0.32,17.44,13.4 451,5.44114,0.0,18.1,0.0,0.713,6.655,98.2,2.3552,24.0,666.0,20.2,355.29,17.73,15.2 
452,5.09017,0.0,18.1,0.0,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27,16.1 453,8.24809,0.0,18.1,0.0,0.713,7.393,99.3,2.4527,24.0,666.0,20.2,375.87,16.74,17.8 454,9.51363,0.0,18.1,0.0,0.713,6.728,94.1,2.4961,24.0,666.0,20.2,6.68,18.71,14.9 455,4.75237,0.0,18.1,0.0,0.713,6.525,86.5,2.4358,24.0,666.0,20.2,50.92,18.13,14.1 456,4.66883,0.0,18.1,0.0,0.713,5.976,87.9,2.5806,24.0,666.0,20.2,10.48,19.01,12.7 457,8.20058,0.0,18.1,0.0,0.713,5.936,80.3,2.7792,24.0,666.0,20.2,3.5,16.94,13.5 458,7.75223,0.0,18.1,0.0,0.713,6.301,83.7,2.7831,24.0,666.0,20.2,272.21,16.23,14.9 459,6.80117,0.0,18.1,0.0,0.713,6.081,84.4,2.7175,24.0,666.0,20.2,396.9,14.7,20.0 460,4.81213,0.0,18.1,0.0,0.713,6.701,90.0,2.5975,24.0,666.0,20.2,255.23,16.42,16.4 461,3.69311,0.0,18.1,0.0,0.713,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65,17.7 462,6.65492,0.0,18.1,0.0,0.713,6.317,83.0,2.7344,24.0,666.0,20.2,396.9,13.99,19.5 463,5.82115,0.0,18.1,0.0,0.713,6.513,89.9,2.8016,24.0,666.0,20.2,393.82,10.29,20.2 464,7.83932,0.0,18.1,0.0,0.655,6.209,65.4,2.9634,24.0,666.0,20.2,396.9,13.22,21.4 465,3.1636,0.0,18.1,0.0,0.655,5.759,48.2,3.0665,24.0,666.0,20.2,334.4,14.13,19.9 466,3.77498,0.0,18.1,0.0,0.655,5.952,84.7,2.8715,24.0,666.0,20.2,22.01,17.15,19.0 467,4.42228,0.0,18.1,0.0,0.584,6.003,94.5,2.5403,24.0,666.0,20.2,331.29,21.32,19.1 468,15.5757,0.0,18.1,0.0,0.58,5.926,71.0,2.9084,24.0,666.0,20.2,368.74,18.13,19.1 469,13.0751,0.0,18.1,0.0,0.58,5.713,56.7,2.8237,24.0,666.0,20.2,396.9,14.76,20.1 470,4.34879,0.0,18.1,0.0,0.58,6.167,84.0,3.0334,24.0,666.0,20.2,396.9,16.29,19.9 471,4.03841,0.0,18.1,0.0,0.532,6.229,90.7,3.0993,24.0,666.0,20.2,395.33,12.87,19.6 472,3.56868,0.0,18.1,0.0,0.58,6.437,75.0,2.8965,24.0,666.0,20.2,393.37,14.36,23.2 473,4.64689,0.0,18.1,0.0,0.614,6.98,67.6,2.5329,24.0,666.0,20.2,374.68,11.66,29.8 474,8.05579,0.0,18.1,0.0,0.584,5.427,95.4,2.4298,24.0,666.0,20.2,352.58,18.14,13.8 475,6.39312,0.0,18.1,0.0,0.584,6.162,97.4,2.206,24.0,666.0,20.2,302.76,24.1,13.3 
476,4.87141,0.0,18.1,0.0,0.614,6.484,93.6,2.3053,24.0,666.0,20.2,396.21,18.68,16.7 477,15.0234,0.0,18.1,0.0,0.614,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91,12.0 478,10.233,0.0,18.1,0.0,0.614,6.185,96.7,2.1705,24.0,666.0,20.2,379.7,18.03,14.6 479,14.3337,0.0,18.1,0.0,0.614,6.229,88.0,1.9512,24.0,666.0,20.2,383.32,13.11,21.4 480,5.82401,0.0,18.1,0.0,0.532,6.242,64.7,3.4242,24.0,666.0,20.2,396.9,10.74,23.0 481,5.70818,0.0,18.1,0.0,0.532,6.75,74.9,3.3317,24.0,666.0,20.2,393.07,7.74,23.7 482,5.73116,0.0,18.1,0.0,0.532,7.061,77.0,3.4106,24.0,666.0,20.2,395.28,7.01,25.0 483,2.81838,0.0,18.1,0.0,0.532,5.762,40.3,4.0983,24.0,666.0,20.2,392.92,10.42,21.8 484,2.37857,0.0,18.1,0.0,0.583,5.871,41.9,3.724,24.0,666.0,20.2,370.73,13.34,20.6 485,3.67367,0.0,18.1,0.0,0.583,6.312,51.9,3.9917,24.0,666.0,20.2,388.62,10.58,21.2 486,5.69175,0.0,18.1,0.0,0.583,6.114,79.8,3.5459,24.0,666.0,20.2,392.68,14.98,19.1 487,4.83567,0.0,18.1,0.0,0.583,5.905,53.2,3.1523,24.0,666.0,20.2,388.22,11.45,20.6 488,0.15086,0.0,27.74,0.0,0.609,5.454,92.7,1.8209,4.0,711.0,20.1,395.09,18.06,15.2 489,0.18337,0.0,27.74,0.0,0.609,5.414,98.3,1.7554,4.0,711.0,20.1,344.05,23.97,7.0 490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68,8.1 491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07,13.6 492,0.11132,0.0,27.74,0.0,0.609,5.983,83.5,2.1099,4.0,711.0,20.1,396.9,13.35,20.1 493,0.17331,0.0,9.69,0.0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.9,12.01,21.8 494,0.27957,0.0,9.69,0.0,0.585,5.926,42.6,2.3817,6.0,391.0,19.2,396.9,13.59,24.5 495,0.17899,0.0,9.69,0.0,0.585,5.67,28.8,2.7986,6.0,391.0,19.2,393.29,17.6,23.1 496,0.2896,0.0,9.69,0.0,0.585,5.39,72.9,2.7986,6.0,391.0,19.2,396.9,21.14,19.7 497,0.26838,0.0,9.69,0.0,0.585,5.794,70.6,2.8927,6.0,391.0,19.2,396.9,14.1,18.3 498,0.23912,0.0,9.69,0.0,0.585,6.019,65.3,2.4091,6.0,391.0,19.2,396.9,12.92,21.2 499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6.0,391.0,19.2,395.77,15.1,17.5 
500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6.0,391.0,19.2,396.9,14.33,16.8 501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4 502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6 503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9 504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0 505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9 ================================================ FILE: examples/data/census_1k.csv ================================================ "YEAR","DATANUM","SERIAL","CBSERIAL","HHWT","CPI99","GQ","QGQ","PERNUM","PERWT","SEX","AGE","EDUC","EDUCD","INCTOT","SEX_HEAD","SEX_MOM","SEX_POP","SEX_SP","SEX_MOM2","SEX_POP2","AGE_HEAD","AGE_MOM","AGE_POP","AGE_SP","AGE_MOM2","AGE_POP2","EDUC_HEAD","EDUC_MOM","EDUC_POP","EDUC_SP","EDUC_MOM2","EDUC_POP2","EDUCD_HEAD","EDUCD_MOM","EDUCD_POP","EDUCD_SP","EDUCD_MOM2","EDUCD_POP2","INCTOT_HEAD","INCTOT_MOM","INCTOT_POP","INCTOT_SP","INCTOT_MOM2","INCTOT_POP2" 1970,2,1,,100,4.54,1,0,1,100,1,39,6,60,12450,1,,,2,,,39,,,36,,,6,,,3,,,60,,,30,,,12450,,,3450,, 1970,2,1,,100,4.54,1,0,2,100,2,36,3,30,3450,1,,,1,,,39,,,39,,,6,,,6,,,60,,,60,,,12450,,,12450,, 1970,2,2,,100,4.54,1,0,1,100,1,56,7,70,9050,1,,,2,,,56,,,54,,,7,,,6,,,70,,,60,,,9050,,,0,, 1970,2,2,,100,4.54,1,0,2,100,2,54,6,60,0,1,,,1,,,56,,,56,,,7,,,7,,,70,,,70,,,9050,,,9050,, 1970,2,4,,100,4.54,1,0,1,100,1,82,1,17,7450,1,,,2,,,82,,,74,,,1,,,2,,,17,,,23,,,7450,,,650,, 1970,2,4,,100,4.54,1,0,2,100,2,74,2,23,650,1,,,1,,,82,,,82,,,1,,,1,,,17,,,17,,,7450,,,7450,, 1970,2,5,,100,4.54,1,0,1,100,1,66,10,100,6950,1,,,2,,,66,,,62,,,10,,,6,,,100,,,60,,,6950,,,250,, 1970,2,5,,100,4.54,1,0,2,100,2,62,6,60,250,1,,,1,,,66,,,66,,,10,,,10,,,100,,,100,,,6950,,,6950,, 1970,2,6,,100,4.54,1,0,1,100,2,70,4,40,1250,2,,,,,,70,,,,,,4,,,,,,40,,,,,,1250,,,,, 
1970,2,7,,100,4.54,1,0,1,100,1,25,6,60,11150,1,,,2,,,25,,,22,,,6,,,6,,,60,,,60,,,11150,,,4050,, 1970,2,7,,100,4.54,1,0,2,100,2,22,6,60,4050,1,,,1,,,25,,,25,,,6,,,6,,,60,,,60,,,11150,,,11150,, 1970,2,7,,100,4.54,1,0,3,100,1,1,0,1,9999999,1,2,1,,,,25,22,25,,,,6,6,6,,,,60,60,60,,,,11150,4050,11150,,, 1970,2,8,,100,4.54,3,0,1,100,2,98,2,26,550,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,9,,100,4.54,1,0,1,100,1,25,10,100,6150,1,,,2,,,25,,,27,,,10,,,9,,,100,,,90,,,6150,,,1050,, 1970,2,9,,100,4.54,1,0,2,100,2,27,9,90,1050,1,,,1,,,25,,,25,,,10,,,10,,,100,,,100,,,6150,,,6150,, 1970,2,10,,100,4.54,1,0,1,100,1,41,11,111,8050,1,2,,,,,41,78,,,,,11,6,,,,,111,60,,,,,8050,0,,,, 1970,2,10,,100,4.54,1,0,2,100,2,78,6,60,0,1,,,,,,41,,,,,,11,,,,,,111,,,,,,8050,,,,, 1970,2,10,,100,4.54,1,0,3,100,2,38,6,60,7150,1,2,,,,,41,78,,,,,11,6,,,,,111,60,,,,,8050,0,,,, 1970,2,11,,100,4.54,1,0,1,100,1,20,6,60,2050,1,,,,,,20,,,,,,6,,,,,,60,,,,,,2050,,,,, 1970,2,13,,100,4.54,1,0,1,100,1,37,6,65,16850,1,,,2,,,37,,,30,,,6,,,8,,,65,,,80,,,16850,,,350,, 1970,2,13,,100,4.54,1,0,2,100,2,30,8,80,350,1,,,1,,,37,,,37,,,6,,,6,,,65,,,65,,,16850,,,16850,, 1970,2,13,,100,4.54,1,0,3,100,1,5,0,2,9999999,1,2,1,,,,37,30,37,,,,6,8,6,,,,65,80,65,,,,16850,350,16850,,, 1970,2,13,,100,4.54,1,0,4,100,2,1,0,1,9999999,1,2,1,,,,37,30,37,,,,6,8,6,,,,65,80,65,,,,16850,350,16850,,, 1970,2,14,,100,4.54,1,0,1,100,1,49,2,23,8850,1,,,2,,,49,,,35,,,2,,,3,,,23,,,30,,,8850,,,4850,, 1970,2,14,,100,4.54,1,0,2,100,2,35,3,30,4850,1,,,1,,,49,,,49,,,2,,,2,,,23,,,23,,,8850,,,8850,, 1970,2,14,,100,4.54,1,0,3,100,2,17,3,30,250,1,2,1,,,,49,35,49,,,,2,3,2,,,,23,30,23,,,,8850,4850,8850,,, 1970,2,14,,100,4.54,1,0,4,100,2,14,2,25,0,1,2,1,,,,49,35,49,,,,2,3,2,,,,23,30,23,,,,8850,4850,8850,,, 1970,2,14,,100,4.54,1,0,5,100,1,10,1,15,9999999,1,2,1,,,,49,35,49,,,,2,3,2,,,,23,30,23,,,,8850,4850,8850,,, 1970,2,14,,100,4.54,1,0,6,100,2,8,1,14,9999999,1,2,1,,,,49,35,49,,,,2,3,2,,,,23,30,23,,,,8850,4850,8850,,, 
1970,2,14,,100,4.54,1,0,7,100,1,0,0,1,9999999,1,2,1,,,,49,35,49,,,,2,3,2,,,,23,30,23,,,,8850,4850,8850,,, 1970,2,15,,100,4.54,1,0,1,100,2,62,7,70,7750,2,,,,,,62,,,,,,7,,,,,,70,,,,,,7750,,,,, 1970,2,15,,100,4.54,1,0,2,100,1,35,11,111,5350,2,2,,,,,62,62,,,,,7,7,,,,,70,70,,,,,7750,7750,,,, 1970,2,16,,100,4.54,1,0,1,100,1,57,4,40,11250,1,,,2,,,57,,,54,,,4,,,2,,,40,,,26,,,11250,,,150,, 1970,2,16,,100,4.54,1,0,2,100,2,54,2,26,150,1,2,,1,,,57,86,,57,,,4,2,,4,,,40,26,,40,,,11250,1250,,11250,, 1970,2,16,,100,4.54,1,0,3,100,2,86,2,26,1250,1,,,,,,57,,,,,,4,,,,,,40,,,,,,11250,,,,, 1970,2,17,,100,4.54,1,0,1,100,1,54,6,60,6050,1,,,,,,54,,,,,,6,,,,,,60,,,,,,6050,,,,, 1970,2,17,,100,4.54,1,0,2,100,2,64,2,26,0,1,,,,,,54,,,,,,6,,,,,,60,,,,,,6050,,,,, 1970,2,18,,100,4.54,1,0,1,100,1,52,7,70,12050,1,,,2,,,52,,,44,,,7,,,6,,,70,,,60,,,12050,,,650,, 1970,2,18,,100,4.54,1,0,2,100,2,44,6,60,650,1,,,1,,,52,,,52,,,7,,,7,,,70,,,70,,,12050,,,12050,, 1970,2,18,,100,4.54,1,0,3,100,2,16,4,40,950,1,2,1,,,,52,44,52,,,,7,6,7,,,,70,60,70,,,,12050,650,12050,,, 1970,2,18,,100,4.54,1,0,4,100,2,15,3,30,350,1,2,1,,,,52,44,52,,,,7,6,7,,,,70,60,70,,,,12050,650,12050,,, 1970,2,18,,100,4.54,1,0,5,100,1,14,2,25,350,1,2,1,,,,52,44,52,,,,7,6,7,,,,70,60,70,,,,12050,650,12050,,, 1970,2,18,,100,4.54,1,0,6,100,1,12,2,22,9999999,1,2,1,,,,52,44,52,,,,7,6,7,,,,70,60,70,,,,12050,650,12050,,, 1970,2,18,,100,4.54,1,0,7,100,1,6,1,12,9999999,1,2,1,,,,52,44,52,,,,7,6,7,,,,70,60,70,,,,12050,650,12050,,, 1970,2,19,,100,4.54,1,0,1,100,1,77,2,26,250,1,,,2,,,77,,,79,,,2,,,2,,,26,,,26,,,250,,,0,, 1970,2,19,,100,4.54,1,0,2,100,2,79,2,26,0,1,,,1,,,77,,,77,,,2,,,2,,,26,,,26,,,250,,,250,, 1970,2,20,,100,4.54,1,0,1,100,1,36,6,60,11450,1,,,2,,,36,,,32,,,6,,,6,,,60,,,60,,,11450,,,5550,, 1970,2,20,,100,4.54,1,0,2,100,2,32,6,60,5550,1,,,1,,,36,,,36,,,6,,,6,,,60,,,60,,,11450,,,11450,, 1970,2,20,,100,4.54,1,0,3,100,2,9,1,16,9999999,1,2,1,,,,36,32,36,,,,6,6,6,,,,60,60,60,,,,11450,5550,11450,,, 
1970,2,21,,100,4.54,1,0,1,100,1,21,4,40,2450,1,,,2,,,21,,,20,,,4,,,4,,,40,,,40,,,2450,,,4550,, 1970,2,21,,100,4.54,1,0,2,100,2,20,4,40,4550,1,,,1,,,21,,,21,,,4,,,4,,,40,,,40,,,2450,,,2450,, 1970,2,21,,100,4.54,1,0,3,100,1,5,1,12,9999999,1,2,1,,,,21,20,21,,,,4,4,4,,,,40,40,40,,,,2450,4550,2450,,, 1970,2,21,,100,4.54,1,0,4,100,1,4,1,11,9999999,1,2,1,,,,21,20,21,,,,4,4,4,,,,40,40,40,,,,2450,4550,2450,,, 1970,2,22,,100,4.54,1,0,1,100,1,23,2,26,5050,1,,,,,,23,,,,,,2,,,,,,26,,,,,,5050,,,,, 1970,2,22,,100,4.54,1,0,2,100,2,23,3,30,1850,1,,,,,,23,,,,,,2,,,,,,26,,,,,,5050,,,,, 1970,2,23,,100,4.54,1,0,1,100,1,63,6,60,5050,1,,,,,,63,,,,,,6,,,,,,60,,,,,,5050,,,,, 1970,2,24,,100,4.54,1,0,1,100,2,68,3,30,2150,2,,,,,,68,,,,,,3,,,,,,30,,,,,,2150,,,,, 1970,2,25,,100,4.54,1,0,1,100,1,65,2,22,4850,1,,,2,,,65,,,61,,,2,,,4,,,22,,,40,,,4850,,,4350,, 1970,2,25,,100,4.54,1,0,2,100,2,61,4,40,4350,1,,,1,,,65,,,65,,,2,,,2,,,22,,,22,,,4850,,,4850,, 1970,2,26,,100,4.54,1,0,1,100,1,61,8,80,2150,1,,,2,,,61,,,66,,,8,,,6,,,80,,,60,,,2150,,,5650,, 1970,2,26,,100,4.54,1,0,2,100,2,66,6,60,5650,1,,,1,,,61,,,61,,,8,,,8,,,80,,,80,,,2150,,,2150,, 1970,2,27,,100,4.54,1,0,1,100,2,77,1,14,4050,2,,,,,,77,,,,,,1,,,,,,14,,,,,,4050,,,,, 1970,2,27,,100,4.54,1,0,2,100,1,75,1,14,2050,2,,,,,,77,,,,,,1,,,,,,14,,,,,,4050,,,,, 1970,2,28,,100,4.54,1,0,1,100,1,32,8,80,5050,1,,,,,,32,,,,,,8,,,,,,80,,,,,,5050,,,,, 1970,2,29,,100,4.54,1,0,1,100,1,59,5,50,15050,1,,,2,,,59,,,55,,,5,,,6,,,50,,,60,,,15050,,,0,, 1970,2,29,,100,4.54,1,0,2,100,2,55,6,60,0,1,,,1,,,59,,,59,,,5,,,5,,,50,,,50,,,15050,,,15050,, 1970,2,30,,100,4.54,1,0,1,100,2,47,6,60,0,2,,,,,,47,,,,,,6,,,,,,60,,,,,,0,,,,, 1970,2,31,,100,4.54,1,0,1,100,1,43,8,80,7050,1,,,2,,,43,,,41,,,8,,,6,,,80,,,60,,,7050,,,2050,, 1970,2,31,,100,4.54,1,0,2,100,2,41,6,60,2050,1,,,1,,,43,,,43,,,8,,,8,,,80,,,80,,,7050,,,7050,, 1970,2,31,,100,4.54,1,0,3,100,2,18,6,65,4050,1,2,1,,,,43,41,43,,,,8,6,8,,,,80,60,80,,,,7050,2050,7050,,, 
1970,2,31,,100,4.54,1,0,4,100,2,15,2,26,0,1,2,1,,,,43,41,43,,,,8,6,8,,,,80,60,80,,,,7050,2050,7050,,, 1970,2,31,,100,4.54,1,0,5,100,2,10,1,17,9999999,1,2,1,,,,43,41,43,,,,8,6,8,,,,80,60,80,,,,7050,2050,7050,,, 1970,2,32,,100,4.54,1,0,1,100,1,40,10,100,13350,1,,,2,,,40,,,36,,,10,,,10,,,100,,,100,,,13350,,,0,, 1970,2,32,,100,4.54,1,0,2,100,2,36,10,100,0,1,,,1,,,40,,,40,,,10,,,10,,,100,,,100,,,13350,,,13350,, 1970,2,32,,100,4.54,1,0,3,100,1,14,2,25,0,1,2,1,,,,40,36,40,,,,10,10,10,,,,100,100,100,,,,13350,0,13350,,, 1970,2,32,,100,4.54,1,0,4,100,1,10,1,16,9999999,1,2,1,,,,40,36,40,,,,10,10,10,,,,100,100,100,,,,13350,0,13350,,, 1970,2,32,,100,4.54,1,0,5,100,2,4,1,11,9999999,1,2,1,,,,40,36,40,,,,10,10,10,,,,100,100,100,,,,13350,0,13350,,, 1970,2,33,,100,4.54,1,0,1,100,1,40,11,111,25050,1,,,2,,,40,,,32,,,11,,,10,,,111,,,100,,,25050,,,0,, 1970,2,33,,100,4.54,1,0,2,100,2,32,10,100,0,1,,,1,,,40,,,40,,,11,,,11,,,111,,,111,,,25050,,,25050,, 1970,2,33,,100,4.54,1,0,3,100,1,5,0,2,9999999,1,2,1,,,,40,32,40,,,,11,10,11,,,,111,100,111,,,,25050,0,25050,,, 1970,2,33,,100,4.54,1,0,4,100,1,3,0,2,9999999,1,2,1,,,,40,32,40,,,,11,10,11,,,,111,100,111,,,,25050,0,25050,,, 1970,2,34,,100,4.54,1,0,1,100,1,31,11,111,19350,1,,,2,,,31,,,31,,,11,,,10,,,111,,,100,,,19350,,,0,, 1970,2,34,,100,4.54,1,0,2,100,2,31,10,100,0,1,,,1,,,31,,,31,,,11,,,11,,,111,,,111,,,19350,,,19350,, 1970,2,34,,100,4.54,1,0,3,100,2,3,0,2,9999999,1,2,1,,,,31,31,31,,,,11,10,11,,,,111,100,111,,,,19350,0,19350,,, 1970,2,35,,100,4.54,1,0,1,100,1,64,11,111,17150,1,,,,,,64,,,,,,11,,,,,,111,,,,,,17150,,,,, 1970,2,36,,100,4.54,1,0,1,100,1,55,3,30,9050,1,,,2,,,55,,,51,,,3,,,6,,,30,,,60,,,9050,,,2950,, 1970,2,36,,100,4.54,1,0,2,100,2,51,6,60,2950,1,,,1,,,55,,,55,,,3,,,3,,,30,,,30,,,9050,,,9050,, 1970,2,37,,100,4.54,1,0,1,100,1,43,11,111,50000,1,,,2,,,43,,,40,,,11,,,10,,,111,,,100,,,50000,,,1150,, 1970,2,37,,100,4.54,1,0,2,100,2,40,10,100,1150,1,,,1,,,43,,,43,,,11,,,11,,,111,,,111,,,50000,,,50000,, 
1970,2,37,,100,4.54,1,0,3,100,1,16,4,40,250,1,2,1,,,,43,40,43,,,,11,10,11,,,,111,100,111,,,,50000,1150,50000,,, 1970,2,37,,100,4.54,1,0,4,100,2,15,2,26,50,1,2,1,,,,43,40,43,,,,11,10,11,,,,111,100,111,,,,50000,1150,50000,,, 1970,2,37,,100,4.54,1,0,5,100,1,12,2,22,9999999,1,2,1,,,,43,40,43,,,,11,10,11,,,,111,100,111,,,,50000,1150,50000,,, 1970,2,38,,100,4.54,1,0,1,100,1,32,10,100,22150,1,,,2,,,32,,,31,,,10,,,7,,,100,,,70,,,22150,,,0,, 1970,2,38,,100,4.54,1,0,2,100,2,31,7,70,0,1,,,1,,,32,,,32,,,10,,,10,,,100,,,100,,,22150,,,22150,, 1970,2,38,,100,4.54,1,0,3,100,2,5,0,2,9999999,1,2,1,,,,32,31,32,,,,10,7,10,,,,100,70,100,,,,22150,0,22150,,, 1970,2,38,,100,4.54,1,0,4,100,1,4,0,2,9999999,1,2,1,,,,32,31,32,,,,10,7,10,,,,100,70,100,,,,22150,0,22150,,, 1970,2,38,,100,4.54,1,0,5,100,2,2,0,1,9999999,1,2,1,,,,32,31,32,,,,10,7,10,,,,100,70,100,,,,22150,0,22150,,, 1970,2,39,,100,4.54,1,0,1,100,1,54,8,80,17850,1,,,2,,,54,,,47,,,8,,,6,,,80,,,60,,,17850,,,0,, 1970,2,39,,100,4.54,1,0,2,100,2,47,6,60,0,1,,,1,,,54,,,54,,,8,,,8,,,80,,,80,,,17850,,,17850,, 1970,2,39,,100,4.54,1,0,3,100,1,19,6,60,2750,1,2,1,,,,54,47,54,,,,8,6,8,,,,80,60,80,,,,17850,0,17850,,, 1970,2,39,,100,4.54,1,0,4,100,1,12,2,23,9999999,1,2,1,,,,54,47,54,,,,8,6,8,,,,80,60,80,,,,17850,0,17850,,, 1970,2,40,,100,4.54,1,0,1,100,1,46,6,60,17150,1,,,2,,,46,,,46,,,6,,,6,,,60,,,60,,,17150,,,5050,, 1970,2,40,,100,4.54,1,0,2,100,2,46,6,60,5050,1,,,1,,,46,,,46,,,6,,,6,,,60,,,60,,,17150,,,17150,, 1970,2,41,,100,4.54,1,0,1,100,1,70,6,60,11450,1,,,,,,70,,,,,,6,,,,,,60,,,,,,11450,,,,, 1970,2,41,,100,4.54,1,0,2,100,2,28,5,50,2150,1,,,,,,70,,,,,,6,,,,,,60,,,,,,11450,,,,, 1970,2,41,,100,4.54,1,0,3,100,1,33,8,80,9850,1,,1,,,,70,,70,,,,6,,6,,,,60,,60,,,,11450,,11450,,, 1970,2,41,,100,4.54,1,0,4,100,1,0,0,1,9999999,1,,1,,,,70,,33,,,,6,,8,,,,60,,80,,,,11450,,9850,,, 1970,2,41,,100,4.54,1,0,5,100,2,5,0,2,9999999,1,,1,,,,70,,33,,,,6,,8,,,,60,,80,,,,11450,,9850,,, 
1970,2,42,,100,4.54,1,0,1,100,1,53,3,30,10050,1,,,2,,,53,,,51,,,3,,,6,,,30,,,60,,,10050,,,0,, 1970,2,42,,100,4.54,1,0,2,100,2,51,6,60,0,1,,,1,,,53,,,53,,,3,,,3,,,30,,,30,,,10050,,,10050,, 1970,2,42,,100,4.54,1,0,3,100,2,14,2,26,0,1,2,1,,,,53,51,53,,,,3,6,3,,,,30,60,30,,,,10050,0,10050,,, 1970,2,43,,100,4.54,1,0,1,100,1,41,11,111,32050,1,,,2,,,41,,,40,,,11,,,11,,,111,,,111,,,32050,,,250,, 1970,2,43,,100,4.54,1,0,2,100,2,40,11,111,250,1,,,1,,,41,,,41,,,11,,,11,,,111,,,111,,,32050,,,32050,, 1970,2,43,,100,4.54,1,0,3,100,2,10,1,17,9999999,1,2,1,,,,41,40,41,,,,11,11,11,,,,111,111,111,,,,32050,250,32050,,, 1970,2,43,,100,4.54,1,0,4,100,2,6,1,12,9999999,1,2,1,,,,41,40,41,,,,11,11,11,,,,111,111,111,,,,32050,250,32050,,, 1970,2,44,,100,4.54,1,0,1,100,1,47,2,26,14050,1,,,2,,,47,,,44,,,2,,,2,,,26,,,26,,,14050,,,0,, 1970,2,44,,100,4.54,1,0,2,100,2,44,2,26,0,1,,,1,,,47,,,47,,,2,,,2,,,26,,,26,,,14050,,,14050,, 1970,2,44,,100,4.54,1,0,3,100,2,21,8,80,2050,1,2,1,,,,47,44,47,,,,2,2,2,,,,26,26,26,,,,14050,0,14050,,, 1970,2,44,,100,4.54,1,0,4,100,1,18,5,50,1750,1,2,1,,,,47,44,47,,,,2,2,2,,,,26,26,26,,,,14050,0,14050,,, 1970,2,44,,100,4.54,1,0,5,100,1,4,0,2,9999999,1,2,1,,,,47,44,47,,,,2,2,2,,,,26,26,26,,,,14050,0,14050,,, 1970,2,45,,100,4.54,1,0,1,100,1,60,8,80,16550,1,,,2,,,60,,,50,,,8,,,6,,,80,,,60,,,16550,,,2950,, 1970,2,45,,100,4.54,1,0,2,100,2,50,6,60,2950,1,,,1,,,60,,,60,,,8,,,8,,,80,,,80,,,16550,,,16550,, 1970,2,46,,100,4.54,1,0,1,100,1,47,10,100,16250,1,2,,,,,47,83,,,,,10,2,,,,,100,22,,,,,16250,1250,,,, 1970,2,46,,100,4.54,1,0,2,100,2,83,2,22,1250,1,,,,,,47,,,,,,10,,,,,,100,,,,,,16250,,,,, 1970,2,46,,100,4.54,1,0,3,100,2,83,2,22,1450,1,,,,,,47,,,,,,10,,,,,,100,,,,,,16250,,,,, 1970,2,48,,100,4.54,1,0,1,100,1,49,6,60,8850,1,2,,2,,,49,89,,48,,,6,2,,6,,,60,26,,60,,,8850,450,,6050,, 1970,2,48,,100,4.54,1,0,2,100,2,48,6,60,6050,1,,,1,,,49,,,49,,,6,,,6,,,60,,,60,,,8850,,,8850,, 1970,2,48,,100,4.54,1,0,3,100,2,89,2,26,450,1,,,,,,49,,,,,,6,,,,,,60,,,,,,8850,,,,, 
1970,2,49,,100,4.54,1,0,1,100,1,30,10,100,12050,1,,,2,,,30,,,30,,,10,,,10,,,100,,,100,,,12050,,,5650,, 1970,2,49,,100,4.54,1,0,2,100,2,30,10,100,5650,1,,,1,,,30,,,30,,,10,,,10,,,100,,,100,,,12050,,,12050,, 1970,2,49,,100,4.54,1,0,3,100,1,3,0,2,9999999,1,2,1,,,,30,30,30,,,,10,10,10,,,,100,100,100,,,,12050,5650,12050,,, 1970,2,49,,100,4.54,1,0,4,100,1,0,0,1,9999999,1,2,1,,,,30,30,30,,,,10,10,10,,,,100,100,100,,,,12050,5650,12050,,, 1970,2,50,,100,4.54,1,0,1,100,1,54,8,80,10150,1,,,,,,54,,,,,,8,,,,,,80,,,,,,10150,,,,, 1970,2,51,,100,4.54,1,0,1,100,2,64,10,100,8650,2,,,,,,64,,,,,,10,,,,,,100,,,,,,8650,,,,, 1970,2,52,,100,4.54,1,0,1,100,2,37,7,70,11350,2,,,,,,37,,,,,,7,,,,,,70,,,,,,11350,,,,, 1970,2,52,,100,4.54,1,0,2,100,2,6,1,12,9999999,2,2,,,,,37,37,,,,,7,7,,,,,70,70,,,,,11350,11350,,,, 1970,2,53,,100,4.54,1,0,1,100,2,34,6,60,8050,2,,,,,,34,,,,,,6,,,,,,60,,,,,,8050,,,,, 1970,2,53,,100,4.54,1,0,2,100,1,11,2,22,9999999,2,2,,,,,34,34,,,,,6,6,,,,,60,60,,,,,8050,8050,,,, 1970,2,53,,100,4.54,1,0,3,100,2,9,1,15,9999999,2,2,,,,,34,34,,,,,6,6,,,,,60,60,,,,,8050,8050,,,, 1970,2,54,,100,4.54,1,0,1,100,1,48,0,2,0,1,,,2,,,48,,,51,,,0,,,2,,,2,,,26,,,0,,,0,, 1970,2,54,,100,4.54,1,0,2,100,2,51,2,26,0,1,,,1,,,48,,,48,,,0,,,0,,,2,,,2,,,0,,,0,, 1970,2,55,,100,4.54,1,0,1,100,2,27,2,25,150,2,,,,,,27,,,,,,2,,,,,,25,,,,,,150,,,,, 1970,2,55,,100,4.54,1,0,2,100,2,12,2,23,9999999,2,2,,,,,27,27,,,,,2,2,,,,,25,25,,,,,150,150,,,, 1970,2,55,,100,4.54,1,0,3,100,2,7,1,14,9999999,2,2,,,,,27,27,,,,,2,2,,,,,25,25,,,,,150,150,,,, 1970,2,55,,100,4.54,1,0,4,100,2,6,1,12,9999999,2,2,,,,,27,27,,,,,2,2,,,,,25,25,,,,,150,150,,,, 1970,2,55,,100,4.54,1,0,5,100,2,5,1,11,9999999,2,2,,,,,27,27,,,,,2,2,,,,,25,25,,,,,150,150,,,, 1970,2,56,,100,4.54,1,0,1,100,1,58,2,22,8050,1,,,2,,,58,,,52,,,2,,,6,,,22,,,60,,,8050,,,0,, 1970,2,56,,100,4.54,1,0,2,100,2,52,6,60,0,1,,,1,,,58,,,58,,,2,,,2,,,22,,,22,,,8050,,,8050,, 1970,2,56,,100,4.54,1,0,3,100,2,23,9,90,0,1,2,1,,,,58,52,58,,,,2,6,2,,,,22,60,22,,,,8050,0,8050,,, 
1970,2,57,,100,4.54,1,0,1,100,1,32,2,26,7050,1,,,2,,,32,,,32,,,2,,,5,,,26,,,50,,,7050,,,5050,, 1970,2,57,,100,4.54,1,0,2,100,2,32,5,50,5050,1,,,1,,,32,,,32,,,2,,,2,,,26,,,26,,,7050,,,7050,, 1970,2,57,,100,4.54,1,0,3,100,2,5,0,2,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,4,100,2,3,0,2,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,5,100,2,13,2,23,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,6,100,1,10,1,17,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,7,100,2,9,1,16,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,8,100,2,8,1,15,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,57,,100,4.54,1,0,9,100,1,6,1,12,9999999,1,2,1,,,,32,32,32,,,,2,5,2,,,,26,50,26,,,,7050,5050,7050,,, 1970,2,58,,100,4.54,1,0,1,100,1,24,11,110,1350,1,,,2,,,24,,,25,,,11,,,11,,,110,,,110,,,1350,,,8150,, 1970,2,58,,100,4.54,1,0,2,100,2,25,11,110,8150,1,,,1,,,24,,,24,,,11,,,11,,,110,,,110,,,1350,,,1350,, 1970,2,58,,100,4.54,1,0,3,100,1,1,0,1,9999999,1,2,1,,,,24,25,24,,,,11,11,11,,,,110,110,110,,,,1350,8150,1350,,, 1970,2,59,,100,4.54,1,0,1,100,1,34,5,50,14150,1,,,2,,,34,,,31,,,5,,,6,,,50,,,60,,,14150,,,5050,, 1970,2,59,,100,4.54,1,0,2,100,2,31,6,60,5050,1,,,1,,,34,,,34,,,5,,,5,,,50,,,50,,,14150,,,14150,, 1970,2,59,,100,4.54,1,0,3,100,2,12,2,22,9999999,1,2,1,,,,34,31,34,,,,5,6,5,,,,50,60,50,,,,14150,5050,14150,,, 1970,2,59,,100,4.54,1,0,4,100,1,11,2,22,9999999,1,2,1,,,,34,31,34,,,,5,6,5,,,,50,60,50,,,,14150,5050,14150,,, 1970,2,59,,100,4.54,1,0,5,100,2,10,1,17,9999999,1,2,1,,,,34,31,34,,,,5,6,5,,,,50,60,50,,,,14150,5050,14150,,, 1970,2,59,,100,4.54,1,0,6,100,1,6,1,12,9999999,1,2,1,,,,34,31,34,,,,5,6,5,,,,50,60,50,,,,14150,5050,14150,,, 1970,2,60,,100,4.54,1,0,1,100,1,35,10,100,14050,1,,,,,,35,,,,,,10,,,,,,100,,,,,,14050,,,,, 
1970,2,62,,100,4.54,1,0,1,100,1,53,6,60,10650,1,,,2,,,53,,,49,,,6,,,6,,,60,,,60,,,10650,,,6650,, 1970,2,62,,100,4.54,1,0,2,100,2,49,6,60,6650,1,,,1,,,53,,,53,,,6,,,6,,,60,,,60,,,10650,,,10650,, 1970,2,63,,100,4.54,1,0,1,100,1,78,2,26,1250,1,,,,,,78,,,,,,2,,,,,,26,,,,,,1250,,,,, 1970,2,63,,100,4.54,1,0,2,100,2,38,7,70,7050,1,,1,,,,78,,78,,,,2,,2,,,,26,,26,,,,1250,,1250,,, 1970,2,64,,100,4.54,1,0,1,100,1,37,6,60,7050,1,,,2,,,37,,,36,,,6,,,6,,,60,,,60,,,7050,,,0,, 1970,2,64,,100,4.54,1,0,2,100,2,36,6,60,0,1,,,1,,,37,,,37,,,6,,,6,,,60,,,60,,,7050,,,7050,, 1970,2,64,,100,4.54,1,0,3,100,1,13,2,25,9999999,1,2,1,,,,37,36,37,,,,6,6,6,,,,60,60,60,,,,7050,0,7050,,, 1970,2,64,,100,4.54,1,0,4,100,1,12,2,22,9999999,1,2,1,,,,37,36,37,,,,6,6,6,,,,60,60,60,,,,7050,0,7050,,, 1970,2,64,,100,4.54,1,0,5,100,1,5,0,2,9999999,1,2,1,,,,37,36,37,,,,6,6,6,,,,60,60,60,,,,7050,0,7050,,, 1970,2,65,,100,4.54,1,0,1,100,2,33,8,80,8050,2,,,,,,33,,,,,,8,,,,,,80,,,,,,8050,,,,, 1970,2,66,,100,4.54,1,0,1,100,1,25,4,40,8050,1,,,2,,,25,,,23,,,4,,,6,,,40,,,60,,,8050,,,6050,, 1970,2,66,,100,4.54,1,0,2,100,2,23,6,60,6050,1,,,1,,,25,,,25,,,4,,,4,,,40,,,40,,,8050,,,8050,, 1970,2,66,,100,4.54,1,0,3,100,2,4,1,11,9999999,1,2,1,,,,25,23,25,,,,4,6,4,,,,40,60,40,,,,8050,6050,8050,,, 1970,2,66,,100,4.54,1,0,4,100,1,2,0,1,9999999,1,2,1,,,,25,23,25,,,,4,6,4,,,,40,60,40,,,,8050,6050,8050,,, 1970,2,68,,100,4.54,1,0,1,100,1,40,5,50,3850,1,,,2,,,40,,,34,,,5,,,10,,,50,,,100,,,3850,,,850,, 1970,2,68,,100,4.54,1,0,2,100,2,34,10,100,850,1,,,1,,,40,,,40,,,5,,,5,,,50,,,50,,,3850,,,3850,, 1970,2,68,,100,4.54,1,0,3,100,1,1,0,1,9999999,1,2,1,,,,40,34,40,,,,5,10,5,,,,50,100,50,,,,3850,850,3850,,, 1970,2,68,,100,4.54,1,0,4,100,2,0,0,1,9999999,1,2,1,,,,40,34,40,,,,5,10,5,,,,50,100,50,,,,3850,850,3850,,, 1970,2,68,,100,4.54,1,0,5,100,2,5,1,11,9999999,1,2,1,,,,40,34,40,,,,5,10,5,,,,50,100,50,,,,3850,850,3850,,, 1970,2,69,,100,4.54,1,0,1,100,2,35,4,40,3850,2,,,,,,35,,,,,,4,,,,,,40,,,,,,3850,,,,, 
1970,2,69,,100,4.54,1,0,2,100,2,8,1,15,9999999,2,2,,,,,35,35,,,,,4,4,,,,,40,40,,,,,3850,3850,,,, 1970,2,69,,100,4.54,1,0,3,100,2,7,1,15,9999999,2,2,,,,,35,35,,,,,4,4,,,,,40,40,,,,,3850,3850,,,, 1970,2,69,,100,4.54,1,0,4,100,2,6,1,14,9999999,2,2,,,,,35,35,,,,,4,4,,,,,40,40,,,,,3850,3850,,,, 1970,2,69,,100,4.54,1,0,5,100,2,2,0,1,9999999,2,2,,,,,35,35,,,,,4,4,,,,,40,40,,,,,3850,3850,,,, 1970,2,69,,100,4.54,1,0,6,100,1,13,2,23,9999999,2,2,,,,,35,35,,,,,4,4,,,,,40,40,,,,,3850,3850,,,, 1970,2,70,,100,4.54,1,0,1,100,1,29,2,26,1750,1,,,,,,29,,,,,,2,,,,,,26,,,,,,1750,,,,, 1970,2,71,,100,4.54,1,0,1,100,1,35,7,70,12050,1,,,,,,35,,,,,,7,,,,,,70,,,,,,12050,,,,, 1970,2,71,,100,4.54,1,0,2,100,2,31,10,100,5450,1,,,,,,35,,,,,,7,,,,,,70,,,,,,12050,,,,, 1970,2,72,,100,4.54,1,0,1,100,2,52,4,40,1550,2,,,,,,52,,,,,,4,,,,,,40,,,,,,1550,,,,, 1970,2,72,,100,4.54,1,0,2,100,2,26,5,50,2450,2,2,,,,,52,52,,,,,4,4,,,,,40,40,,,,,1550,1550,,,, 1970,2,72,,100,4.54,1,0,3,100,2,6,1,14,9999999,2,2,,,,,52,52,,,,,4,4,,,,,40,40,,,,,1550,1550,,,, 1970,2,72,,100,4.54,1,0,4,100,2,2,0,1,9999999,2,2,,,,,52,52,,,,,4,4,,,,,40,40,,,,,1550,1550,,,, 1970,2,72,,100,4.54,1,0,5,100,1,1,0,1,9999999,2,2,,,,,52,52,,,,,4,4,,,,,40,40,,,,,1550,1550,,,, 1970,2,74,,100,4.54,1,0,1,100,2,33,6,60,10050,2,,,,,,33,,,,,,6,,,,,,60,,,,,,10050,,,,, 1970,2,74,,100,4.54,1,0,2,100,1,13,2,22,9999999,2,2,,,,,33,33,,,,,6,6,,,,,60,60,,,,,10050,10050,,,, 1970,2,74,,100,4.54,1,0,3,100,1,11,1,17,9999999,2,2,,,,,33,33,,,,,6,6,,,,,60,60,,,,,10050,10050,,,, 1970,2,75,,100,4.54,1,0,1,100,2,26,9,90,10550,2,,,,,,26,,,,,,9,,,,,,90,,,,,,10550,,,,, 1970,2,76,,100,4.54,1,0,1,100,1,35,11,111,17250,1,,,,,,35,,,,,,11,,,,,,111,,,,,,17250,,,,, 1970,2,77,,100,4.54,1,0,1,100,1,49,6,60,11050,1,,,2,,,49,,,49,,,6,,,6,,,60,,,60,,,11050,,,0,, 1970,2,77,,100,4.54,1,0,2,100,2,49,6,60,0,1,,,1,,,49,,,49,,,6,,,6,,,60,,,60,,,11050,,,11050,, 1970,2,77,,100,4.54,1,0,3,100,2,16,4,40,3550,1,2,1,,,,49,49,49,,,,6,6,6,,,,60,60,60,,,,11050,0,11050,,, 
1970,2,77,,100,4.54,1,0,4,100,1,12,2,23,9999999,1,2,1,,,,49,49,49,,,,6,6,6,,,,60,60,60,,,,11050,0,11050,,, 1970,2,77,,100,4.54,1,0,5,100,2,20,6,60,6050,1,2,1,,,,49,49,49,,,,6,6,6,,,,60,60,60,,,,11050,0,11050,,, 1970,2,78,,100,4.54,1,0,1,100,1,72,2,26,2050,1,,,2,,,72,,,65,,,2,,,2,,,26,,,26,,,2050,,,750,, 1970,2,78,,100,4.54,1,0,2,100,2,65,2,26,750,1,,,1,,,72,,,72,,,2,,,2,,,26,,,26,,,2050,,,2050,, 1970,2,79,,100,4.54,1,0,1,100,1,37,6,60,7550,1,,,2,,,37,,,35,,,6,,,9,,,60,,,90,,,7550,,,5550,, 1970,2,79,,100,4.54,1,0,2,100,2,35,9,90,5550,1,,,1,,,37,,,37,,,6,,,6,,,60,,,60,,,7550,,,7550,, 1970,2,80,,100,4.54,1,0,1,100,1,44,6,60,6050,1,,,2,,,44,,,45,,,6,,,6,,,60,,,60,,,6050,,,0,, 1970,2,80,,100,4.54,1,0,2,100,2,45,6,60,0,1,,,1,,,44,,,44,,,6,,,6,,,60,,,60,,,6050,,,6050,, 1970,2,80,,100,4.54,1,0,3,100,1,25,6,60,5050,1,2,1,,,,44,45,44,,,,6,6,6,,,,60,60,60,,,,6050,0,6050,,, 1970,2,81,,100,4.54,1,0,1,100,1,35,2,22,4050,1,,,2,,,35,,,31,,,2,,,2,,,22,,,23,,,4050,,,0,, 1970,2,81,,100,4.54,1,0,2,100,2,31,2,23,0,1,,,1,,,35,,,35,,,2,,,2,,,22,,,22,,,4050,,,4050,, 1970,2,81,,100,4.54,1,0,3,100,2,13,2,23,9999999,1,2,1,,,,35,31,35,,,,2,2,2,,,,22,23,22,,,,4050,0,4050,,, 1970,2,81,,100,4.54,1,0,4,100,2,4,0,2,9999999,1,2,1,,,,35,31,35,,,,2,2,2,,,,22,23,22,,,,4050,0,4050,,, 1970,2,81,,100,4.54,1,0,5,100,2,1,0,1,9999999,1,2,1,,,,35,31,35,,,,2,2,2,,,,22,23,22,,,,4050,0,4050,,, 1970,2,82,,100,4.54,1,0,1,100,1,45,6,60,10050,1,,,2,,,45,,,46,,,6,,,7,,,60,,,70,,,10050,,,4050,, 1970,2,82,,100,4.54,1,0,2,100,2,46,7,70,4050,1,,,1,,,45,,,45,,,6,,,6,,,60,,,60,,,10050,,,10050,, 1970,2,82,,100,4.54,1,0,3,100,1,19,6,60,2050,1,2,1,,,,45,46,45,,,,6,7,6,,,,60,70,60,,,,10050,4050,10050,,, 1970,2,82,,100,4.54,1,0,4,100,2,13,2,23,9999999,1,2,1,,,,45,46,45,,,,6,7,6,,,,60,70,60,,,,10050,4050,10050,,, 1970,2,83,,100,4.54,1,0,1,100,1,58,2,26,9650,1,,,2,,,58,,,59,,,2,,,4,,,26,,,40,,,9650,,,0,, 1970,2,83,,100,4.54,1,0,2,100,2,59,4,40,0,1,,,1,,,58,,,58,,,2,,,2,,,26,,,26,,,9650,,,9650,, 
1970,2,83,,100,4.54,1,0,3,100,1,23,10,100,2050,1,2,1,,,,58,59,58,,,,2,4,2,,,,26,40,26,,,,9650,0,9650,,, 1970,2,84,,100,4.54,1,0,1,100,1,68,2,26,5050,1,,,,,,68,,,,,,2,,,,,,26,,,,,,5050,,,,, 1970,2,85,,100,4.54,1,0,1,100,1,40,2,22,8050,1,,,2,,,40,,,35,,,2,,,1,,,22,,,17,,,8050,,,0,, 1970,2,85,,100,4.54,1,0,2,100,2,35,1,17,0,1,,,1,,,40,,,40,,,2,,,2,,,22,,,22,,,8050,,,8050,, 1970,2,85,,100,4.54,1,0,3,100,2,1,0,1,9999999,1,2,1,,,,40,35,40,,,,2,1,2,,,,22,17,22,,,,8050,0,8050,,, 1970,2,85,,100,4.54,1,0,4,100,1,0,0,1,9999999,1,2,1,,,,40,35,40,,,,2,1,2,,,,22,17,22,,,,8050,0,8050,,, 1970,2,86,,100,4.54,1,0,1,100,1,56,2,23,7950,1,,,,,,56,,,,,,2,,,,,,23,,,,,,7950,,,,, 1970,2,86,,100,4.54,1,0,2,100,2,82,2,26,1150,1,,1,,,,56,,56,,,,2,,2,,,,23,,23,,,,7950,,7950,,, 1970,2,87,,100,4.54,1,0,1,100,1,28,6,60,12150,1,,,2,,,28,,,28,,,6,,,6,,,60,,,60,,,12150,,,0,, 1970,2,87,,100,4.54,1,0,2,100,2,28,6,60,0,1,,,1,,,28,,,28,,,6,,,6,,,60,,,60,,,12150,,,12150,, 1970,2,87,,100,4.54,1,0,3,100,1,1,0,1,9999999,1,2,1,,,,28,28,28,,,,6,6,6,,,,60,60,60,,,,12150,0,12150,,, 1970,2,88,,100,4.54,1,0,1,100,2,70,2,26,3550,2,,,,,,70,,,,,,2,,,,,,26,,,,,,3550,,,,, 1970,2,89,,100,4.54,1,0,1,100,1,54,2,26,7150,1,,,2,,,54,,,55,,,2,,,4,,,26,,,40,,,7150,,,0,, 1970,2,89,,100,4.54,1,0,2,100,2,55,4,40,0,1,,,1,,,54,,,54,,,2,,,2,,,26,,,26,,,7150,,,7150,, 1970,2,90,,100,4.54,1,0,1,100,2,42,6,60,0,2,,,,,,42,,,,,,6,,,,,,60,,,,,,0,,,,, 1970,2,90,,100,4.54,1,0,2,100,1,14,2,26,0,2,2,,,,,42,42,,,,,6,6,,,,,60,60,,,,,0,0,,,, 1970,2,90,,100,4.54,1,0,3,100,1,13,2,23,9999999,2,2,,,,,42,42,,,,,6,6,,,,,60,60,,,,,0,0,,,, 1970,2,90,,100,4.54,1,0,4,100,1,11,2,22,9999999,2,2,,,,,42,42,,,,,6,6,,,,,60,60,,,,,0,0,,,, 1970,2,90,,100,4.54,1,0,5,100,2,8,1,14,9999999,2,2,,,,,42,42,,,,,6,6,,,,,60,60,,,,,0,0,,,, 1970,2,91,,100,4.54,1,0,1,100,2,36,6,60,3250,2,,,,,,36,,,,,,6,,,,,,60,,,,,,3250,,,,, 1970,2,92,,100,4.54,1,0,1,100,1,42,4,40,6250,1,,,2,,,42,,,33,,,4,,,4,,,40,,,40,,,6250,,,750,, 
1970,2,92,,100,4.54,1,0,2,100,2,33,4,40,750,1,,,1,,,42,,,42,,,4,,,4,,,40,,,40,,,6250,,,6250,, 1970,2,92,,100,4.54,1,0,3,100,2,13,2,25,9999999,1,2,1,,,,42,33,42,,,,4,4,4,,,,40,40,40,,,,6250,750,6250,,, 1970,2,92,,100,4.54,1,0,4,100,1,12,1,17,9999999,1,2,1,,,,42,33,42,,,,4,4,4,,,,40,40,40,,,,6250,750,6250,,, 1970,2,92,,100,4.54,1,0,5,100,1,10,1,15,9999999,1,2,1,,,,42,33,42,,,,4,4,4,,,,40,40,40,,,,6250,750,6250,,, 1970,2,92,,100,4.54,1,0,6,100,2,6,1,12,9999999,1,2,1,,,,42,33,42,,,,4,4,4,,,,40,40,40,,,,6250,750,6250,,, 1970,2,92,,100,4.54,1,0,7,100,2,95,2,23,1750,1,2,1,,,,42,33,42,,,,4,4,4,,,,40,40,40,,,,6250,750,6250,,, 1970,2,93,,100,4.54,1,0,1,100,1,35,10,100,12050,1,,,2,,,35,,,31,,,10,,,6,,,100,,,60,,,12050,,,3250,, 1970,2,93,,100,4.54,1,0,2,100,2,31,6,60,3250,1,,,1,,,35,,,35,,,10,,,10,,,100,,,100,,,12050,,,12050,, 1970,2,93,,100,4.54,1,0,3,100,1,14,2,26,0,1,2,1,,,,35,31,35,,,,10,6,10,,,,100,60,100,,,,12050,3250,12050,,, 1970,2,93,,100,4.54,1,0,4,100,2,12,2,23,9999999,1,2,1,,,,35,31,35,,,,10,6,10,,,,100,60,100,,,,12050,3250,12050,,, 1970,2,93,,100,4.54,1,0,5,100,2,10,1,16,9999999,1,2,1,,,,35,31,35,,,,10,6,10,,,,100,60,100,,,,12050,3250,12050,,, 1970,2,94,,100,4.54,1,0,1,100,2,72,9,90,6350,2,,,,,,72,,,,,,9,,,,,,90,,,,,,6350,,,,, 1970,2,94,,100,4.54,1,0,2,100,2,32,11,111,8650,2,2,,,,,72,72,,,,,9,9,,,,,90,90,,,,,6350,6350,,,, 1970,2,94,,100,4.54,1,0,3,100,2,42,6,60,7050,2,2,,,,,72,72,,,,,9,9,,,,,90,90,,,,,6350,6350,,,, 1970,2,95,,100,4.54,1,0,1,100,1,50,5,50,16150,1,,,2,,,50,,,48,,,5,,,6,,,50,,,60,,,16150,,,50,, 1970,2,95,,100,4.54,1,0,2,100,2,48,6,60,50,1,,,1,,,50,,,50,,,5,,,5,,,50,,,50,,,16150,,,16150,, 1970,2,95,,100,4.54,1,0,3,100,2,15,3,30,0,1,2,1,,,,50,48,50,,,,5,6,5,,,,50,60,50,,,,16150,50,16150,,, 1970,2,95,,100,4.54,1,0,4,100,1,13,2,23,9999999,1,2,1,,,,50,48,50,,,,5,6,5,,,,50,60,50,,,,16150,50,16150,,, 1970,2,96,,100,4.54,1,0,1,100,1,21,4,40,12050,1,,,2,,,21,,,19,,,4,,,6,,,40,,,60,,,12050,,,12050,, 
1970,2,96,,100,4.54,1,0,2,100,2,19,6,60,12050,1,,,1,,,21,,,21,,,4,,,4,,,40,,,40,,,12050,,,12050,, 1970,2,97,,100,4.54,1,0,1,100,1,66,4,40,7150,1,,,2,,,66,,,64,,,4,,,2,,,40,,,23,,,7150,,,550,, 1970,2,97,,100,4.54,1,0,2,100,2,64,2,23,550,1,,,1,,,66,,,66,,,4,,,4,,,40,,,40,,,7150,,,7150,, 1970,2,98,,100,4.54,1,0,1,100,1,56,6,60,11050,1,,,2,,,56,,,53,,,6,,,6,,,60,,,60,,,11050,,,0,, 1970,2,98,,100,4.54,1,0,2,100,2,53,6,60,0,1,,,1,,,56,,,56,,,6,,,6,,,60,,,60,,,11050,,,11050,, 1970,2,98,,100,4.54,1,0,3,100,1,29,7,70,5050,1,2,1,,,,56,53,56,,,,6,6,6,,,,60,60,60,,,,11050,0,11050,,, 1970,2,98,,100,4.54,1,0,4,100,1,18,5,50,0,1,2,1,,,,56,53,56,,,,6,6,6,,,,60,60,60,,,,11050,0,11050,,, 1970,2,99,,100,4.54,1,0,1,100,1,51,8,80,12050,1,,,2,,,51,,,55,,,8,,,4,,,80,,,40,,,12050,,,0,, 1970,2,99,,100,4.54,1,0,2,100,2,55,4,40,0,1,,,1,,,51,,,51,,,8,,,8,,,80,,,80,,,12050,,,12050,, 1970,2,99,,100,4.54,1,0,3,100,2,11,2,22,9999999,1,2,1,,,,51,55,51,,,,8,4,8,,,,80,40,80,,,,12050,0,12050,,, 1970,2,100,,100,4.54,1,0,1,100,2,56,4,40,6250,2,,,,,,56,,,,,,4,,,,,,40,,,,,,6250,,,,, 1970,2,101,,100,4.54,1,0,1,100,1,42,9,90,21850,1,,,2,,,42,,,39,,,9,,,6,,,90,,,60,,,21850,,,650,, 1970,2,101,,100,4.54,1,0,2,100,2,39,6,60,650,1,,,1,,,42,,,42,,,9,,,9,,,90,,,90,,,21850,,,21850,, 1970,2,101,,100,4.54,1,0,3,100,1,18,5,50,650,1,2,1,,,,42,39,42,,,,9,6,9,,,,90,60,90,,,,21850,650,21850,,, 1970,2,101,,100,4.54,1,0,4,100,2,11,2,22,9999999,1,2,1,,,,42,39,42,,,,9,6,9,,,,90,60,90,,,,21850,650,21850,,, 1970,2,102,,100,4.54,1,0,1,100,1,49,6,60,19150,1,,,2,,,49,,,46,,,6,,,6,,,60,,,60,,,19150,,,0,, 1970,2,102,,100,4.54,1,0,2,100,2,46,6,60,0,1,,,1,,,49,,,49,,,6,,,6,,,60,,,60,,,19150,,,19150,, 1970,2,102,,100,4.54,1,0,3,100,2,9,1,16,9999999,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 1970,2,102,,100,4.54,1,0,4,100,1,6,1,12,9999999,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 1970,2,102,,100,4.54,1,0,5,100,1,17,5,50,2050,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 
1970,2,102,,100,4.54,1,0,6,100,1,16,3,30,1450,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 1970,2,102,,100,4.54,1,0,7,100,1,16,3,30,1450,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 1970,2,102,,100,4.54,1,0,8,100,2,11,2,22,9999999,1,2,1,,,,49,46,49,,,,6,6,6,,,,60,60,60,,,,19150,0,19150,,, 1970,2,103,,100,4.54,1,0,1,100,1,59,7,70,8850,1,,,,,,59,,,,,,7,,,,,,70,,,,,,8850,,,,, 1970,2,103,,100,4.54,1,0,2,100,2,69,6,60,450,1,,,,,,59,,,,,,7,,,,,,70,,,,,,8850,,,,, 1970,2,104,,100,4.54,1,0,1,100,1,59,5,50,6750,1,,,2,,,59,,,59,,,5,,,2,,,50,,,26,,,6750,,,4850,, 1970,2,104,,100,4.54,1,0,2,100,2,59,2,26,4850,1,,,1,,,59,,,59,,,5,,,5,,,50,,,50,,,6750,,,6750,, 1970,2,104,,100,4.54,1,0,3,100,1,20,6,60,1650,1,2,1,,,,59,59,59,,,,5,2,5,,,,50,26,50,,,,6750,4850,6750,,, 1970,2,105,,100,4.54,1,0,1,100,2,55,7,70,9450,2,,,,,,55,,,,,,7,,,,,,70,,,,,,9450,,,,, 1970,2,106,,100,4.54,1,0,1,100,1,63,2,23,7950,1,,,2,,,63,,,57,,,2,,,4,,,23,,,40,,,7950,,,4650,, 1970,2,106,,100,4.54,1,0,2,100,2,57,4,40,4650,1,,,1,,,63,,,63,,,2,,,2,,,23,,,23,,,7950,,,7950,, 1970,2,106,,100,4.54,1,0,3,100,1,21,8,80,1450,1,2,1,,,,63,57,63,,,,2,4,2,,,,23,40,23,,,,7950,4650,7950,,, 1970,2,107,,100,4.54,1,0,1,100,1,40,3,30,18650,1,,,2,,,40,,,37,,,3,,,6,,,30,,,60,,,18650,,,0,, 1970,2,107,,100,4.54,1,0,2,100,2,37,6,60,0,1,,,1,,,40,,,40,,,3,,,3,,,30,,,30,,,18650,,,18650,, 1970,2,107,,100,4.54,1,0,3,100,1,14,2,26,0,1,2,1,,,,40,37,40,,,,3,6,3,,,,30,60,30,,,,18650,0,18650,,, 1970,2,107,,100,4.54,1,0,4,100,1,8,1,15,9999999,1,2,1,,,,40,37,40,,,,3,6,3,,,,30,60,30,,,,18650,0,18650,,, 1970,2,108,,100,4.54,1,0,1,100,1,36,6,65,4050,1,,,2,,,36,,,36,,,6,,,2,,,65,,,26,,,4050,,,1550,, 1970,2,108,,100,4.54,1,0,2,100,2,36,2,26,1550,1,,,1,,,36,,,36,,,6,,,6,,,65,,,65,,,4050,,,4050,, 1970,2,108,,100,4.54,1,0,3,100,2,17,5,50,950,1,2,1,,,,36,36,36,,,,6,2,6,,,,65,26,65,,,,4050,1550,4050,,, 1970,2,108,,100,4.54,1,0,4,100,2,14,2,26,0,1,2,1,,,,36,36,36,,,,6,2,6,,,,65,26,65,,,,4050,1550,4050,,, 
1970,2,108,,100,4.54,1,0,5,100,1,10,1,17,9999999,1,2,1,,,,36,36,36,,,,6,2,6,,,,65,26,65,,,,4050,1550,4050,,, 1970,2,108,,100,4.54,1,0,6,100,2,8,1,15,9999999,1,2,1,,,,36,36,36,,,,6,2,6,,,,65,26,65,,,,4050,1550,4050,,, 1970,2,109,,100,4.54,1,0,1,100,1,37,10,100,1550,1,,,,,,37,,,,,,10,,,,,,100,,,,,,1550,,,,, 1970,2,109,,100,4.54,1,0,2,100,1,48,10,100,11050,1,,,,,,37,,,,,,10,,,,,,100,,,,,,1550,,,,, 1970,2,110,,100,4.54,1,0,1,100,2,78,6,60,3950,2,,,,,,78,,,,,,6,,,,,,60,,,,,,3950,,,,, 1970,2,111,,100,4.54,1,0,1,100,1,32,4,40,6050,1,,,,,,32,,,,,,4,,,,,,40,,,,,,6050,,,,, 1970,2,112,,100,4.54,1,0,1,100,2,63,2,25,250,2,,,,,,63,,,,,,2,,,,,,25,,,,,,250,,,,, 1970,2,113,,100,4.54,1,0,1,100,1,42,5,50,10050,1,,,2,,,42,,,34,,,5,,,6,,,50,,,60,,,10050,,,550,, 1970,2,113,,100,4.54,1,0,2,100,2,34,6,60,550,1,,,1,,,42,,,42,,,5,,,5,,,50,,,50,,,10050,,,10050,, 1970,2,113,,100,4.54,1,0,3,100,1,13,2,23,9999999,1,2,1,,,,42,34,42,,,,5,6,5,,,,50,60,50,,,,10050,550,10050,,, 1970,2,113,,100,4.54,1,0,4,100,1,12,2,22,9999999,1,2,1,,,,42,34,42,,,,5,6,5,,,,50,60,50,,,,10050,550,10050,,, 1970,2,113,,100,4.54,1,0,5,100,2,11,1,17,9999999,1,2,1,,,,42,34,42,,,,5,6,5,,,,50,60,50,,,,10050,550,10050,,, 1970,2,114,,100,4.54,1,0,1,100,1,82,2,26,0,1,,,2,,,82,,,75,,,2,,,2,,,26,,,26,,,0,,,1450,, 1970,2,114,,100,4.54,1,0,2,100,2,75,2,26,1450,1,,,1,,,82,,,82,,,2,,,2,,,26,,,26,,,0,,,0,, 1970,2,114,,100,4.54,1,0,3,100,1,47,0,2,0,1,2,1,,,,82,75,82,,,,2,2,2,,,,26,26,26,,,,0,1450,0,,, 1970,2,115,,100,4.54,1,0,1,100,2,70,2,23,1250,2,,,,,,70,,,,,,2,,,,,,23,,,,,,1250,,,,, 1970,2,116,,100,4.54,1,0,1,100,1,65,3,30,3450,1,,,2,,,65,,,65,,,3,,,6,,,30,,,60,,,3450,,,350,, 1970,2,116,,100,4.54,1,0,2,100,2,65,6,60,350,1,,,1,,,65,,,65,,,3,,,3,,,30,,,30,,,3450,,,3450,, 1970,2,116,,100,4.54,1,0,3,100,1,42,7,70,6750,1,2,1,,,,65,65,65,,,,3,6,3,,,,30,60,30,,,,3450,350,3450,,, 1970,2,117,,100,4.54,1,0,1,100,1,34,6,60,14050,1,,,2,,,34,,,31,,,6,,,7,,,60,,,70,,,14050,,,0,, 
1970,2,117,,100,4.54,1,0,2,100,2,31,7,70,0,1,,,1,,,34,,,34,,,6,,,6,,,60,,,60,,,14050,,,14050,, 1970,2,117,,100,4.54,1,0,3,100,1,8,1,15,9999999,1,2,1,,,,34,31,34,,,,6,7,6,,,,60,70,60,,,,14050,0,14050,,, 1970,2,117,,100,4.54,1,0,4,100,2,7,1,14,9999999,1,2,1,,,,34,31,34,,,,6,7,6,,,,60,70,60,,,,14050,0,14050,,, 1970,2,117,,100,4.54,1,0,5,100,2,5,1,11,9999999,1,2,1,,,,34,31,34,,,,6,7,6,,,,60,70,60,,,,14050,0,14050,,, 1970,2,117,,100,4.54,1,0,6,100,2,1,0,1,9999999,1,2,1,,,,34,31,34,,,,6,7,6,,,,60,70,60,,,,14050,0,14050,,, 1970,2,118,,100,4.54,1,0,1,100,1,23,6,60,6050,1,,,2,,,23,,,20,,,6,,,7,,,60,,,70,,,6050,,,6050,, 1970,2,118,,100,4.54,1,0,2,100,2,20,7,70,6050,1,,,1,,,23,,,23,,,6,,,6,,,60,,,60,,,6050,,,6050,, 1970,2,119,,100,4.54,1,0,1,100,1,24,11,110,8650,1,,,2,,,24,,,25,,,11,,,6,,,110,,,60,,,8650,,,4050,, 1970,2,119,,100,4.54,1,0,2,100,2,25,6,60,4050,1,,,1,,,24,,,24,,,11,,,11,,,110,,,110,,,8650,,,8650,, 1970,2,119,,100,4.54,1,0,3,100,2,0,0,1,9999999,1,2,1,,,,24,25,24,,,,11,6,11,,,,110,60,110,,,,8650,4050,8650,,, 1970,2,120,,100,4.54,1,0,1,100,1,35,2,26,10150,1,,,2,,,35,,,29,,,2,,,6,,,26,,,60,,,10150,,,2550,, 1970,2,120,,100,4.54,1,0,2,100,2,29,6,60,2550,1,,,1,,,35,,,35,,,2,,,2,,,26,,,26,,,10150,,,10150,, 1970,2,120,,100,4.54,1,0,3,100,2,2,0,1,9999999,1,2,1,,,,35,29,35,,,,2,6,2,,,,26,60,26,,,,10150,2550,10150,,, 1970,2,120,,100,4.54,1,0,4,100,1,4,0,2,9999999,1,2,1,,,,35,29,35,,,,2,6,2,,,,26,60,26,,,,10150,2550,10150,,, 1970,2,121,,100,4.54,1,0,1,100,1,45,4,40,12550,1,,,,,,45,,,,,,4,,,,,,40,,,,,,12550,,,,, 1970,2,122,,100,4.54,1,0,1,100,1,64,6,60,5550,1,,,2,,,64,,,62,,,6,,,2,,,60,,,26,,,5550,,,0,, 1970,2,122,,100,4.54,1,0,2,100,2,62,2,26,0,1,,,1,,,64,,,64,,,6,,,6,,,60,,,60,,,5550,,,5550,, 1970,2,123,,100,4.54,1,0,1,100,1,26,1,12,1950,1,,,2,,,26,,,27,,,1,,,8,,,12,,,80,,,1950,,,550,, 1970,2,123,,100,4.54,1,0,2,100,2,27,8,80,550,1,,,1,,,26,,,26,,,1,,,1,,,12,,,12,,,1950,,,1950,, 
1970,2,123,,100,4.54,1,0,3,100,1,4,0,2,9999999,1,2,1,,,,26,27,26,,,,1,8,1,,,,12,80,12,,,,1950,550,1950,,, 1970,2,123,,100,4.54,1,0,4,100,1,4,0,2,9999999,1,2,1,,,,26,27,26,,,,1,8,1,,,,12,80,12,,,,1950,550,1950,,, 1970,2,123,,100,4.54,1,0,5,100,2,2,0,1,9999999,1,2,1,,,,26,27,26,,,,1,8,1,,,,12,80,12,,,,1950,550,1950,,, 1970,2,124,,100,4.54,1,0,1,100,1,44,2,23,10750,1,,,2,,,44,,,48,,,2,,,2,,,23,,,26,,,10750,,,0,, 1970,2,124,,100,4.54,1,0,2,100,2,48,2,26,0,1,,,1,,,44,,,44,,,2,,,2,,,23,,,23,,,10750,,,10750,, 1970,2,124,,100,4.54,1,0,3,100,2,10,1,17,9999999,1,2,1,,,,44,48,44,,,,2,2,2,,,,23,26,23,,,,10750,0,10750,,, 1970,2,125,,100,4.54,1,0,1,100,1,55,2,26,11450,1,,,2,,,55,,,50,,,2,,,7,,,26,,,70,,,11450,,,7050,, 1970,2,125,,100,4.54,1,0,2,100,2,50,7,70,7050,1,,,1,,,55,,,55,,,2,,,2,,,26,,,26,,,11450,,,11450,, 1970,2,126,,100,4.54,1,0,1,100,1,53,2,26,11150,1,,,2,,,53,,,49,,,2,,,2,,,26,,,26,,,11150,,,0,, 1970,2,126,,100,4.54,1,0,2,100,2,49,2,26,0,1,,,1,,,53,,,53,,,2,,,2,,,26,,,26,,,11150,,,11150,, 1970,2,126,,100,4.54,1,0,3,100,1,21,5,50,750,1,2,1,,,,53,49,53,,,,2,2,2,,,,26,26,26,,,,11150,0,11150,,, 1970,2,126,,100,4.54,1,0,4,100,2,15,3,30,0,1,2,1,,,,53,49,53,,,,2,2,2,,,,26,26,26,,,,11150,0,11150,,, 1970,2,127,,100,4.54,1,0,1,100,1,40,8,80,11550,1,,,2,,,40,,,39,,,8,,,6,,,80,,,65,,,11550,,,2950,, 1970,2,127,,100,4.54,1,0,2,100,2,39,6,65,2950,1,,,1,,,40,,,40,,,8,,,8,,,80,,,80,,,11550,,,11550,, 1970,2,127,,100,4.54,1,0,3,100,2,17,5,50,250,1,2,1,,,,40,39,40,,,,8,6,8,,,,80,65,80,,,,11550,2950,11550,,, 1970,2,127,,100,4.54,1,0,4,100,1,14,2,26,350,1,2,1,,,,40,39,40,,,,8,6,8,,,,80,65,80,,,,11550,2950,11550,,, 1970,2,127,,100,4.54,1,0,5,100,2,8,1,15,9999999,1,2,1,,,,40,39,40,,,,8,6,8,,,,80,65,80,,,,11550,2950,11550,,, 1970,2,128,,100,4.54,1,0,1,100,1,48,3,30,9050,1,,,2,,,48,,,44,,,3,,,3,,,30,,,30,,,9050,,,0,, 1970,2,128,,100,4.54,1,0,2,100,2,44,3,30,0,1,,,1,,,48,,,48,,,3,,,3,,,30,,,30,,,9050,,,9050,, 
1970,2,128,,100,4.54,1,0,3,100,1,21,6,60,6850,1,2,1,,,,48,44,48,,,,3,3,3,,,,30,30,30,,,,9050,0,9050,,, 1970,2,128,,100,4.54,1,0,4,100,2,16,3,30,0,1,2,1,,,,48,44,48,,,,3,3,3,,,,30,30,30,,,,9050,0,9050,,, 1970,2,128,,100,4.54,1,0,5,100,2,15,3,30,0,1,2,1,,,,48,44,48,,,,3,3,3,,,,30,30,30,,,,9050,0,9050,,, 1970,2,128,,100,4.54,1,0,6,100,2,11,2,22,9999999,1,2,1,,,,48,44,48,,,,3,3,3,,,,30,30,30,,,,9050,0,9050,,, 1970,2,130,,100,4.54,1,0,1,100,1,61,2,22,10250,1,,,2,,,61,,,58,,,2,,,2,,,22,,,25,,,10250,,,3050,, 1970,2,130,,100,4.54,1,0,2,100,2,58,2,25,3050,1,,,1,,,61,,,61,,,2,,,2,,,22,,,22,,,10250,,,10250,, 1970,2,131,,100,4.54,1,0,1,100,1,59,4,40,6550,1,,,,,,59,,,,,,4,,,,,,40,,,,,,6550,,,,, 1970,2,132,,100,4.54,1,0,1,100,1,44,2,26,8850,1,,,2,,,44,,,39,,,2,,,1,,,26,,,17,,,8850,,,0,, 1970,2,132,,100,4.54,1,0,2,100,2,39,1,17,0,1,,,1,,,44,,,44,,,2,,,2,,,26,,,26,,,8850,,,8850,, 1970,2,132,,100,4.54,1,0,3,100,1,15,4,40,0,1,2,1,,,,44,39,44,,,,2,1,2,,,,26,17,26,,,,8850,0,8850,,, 1970,2,132,,100,4.54,1,0,4,100,2,10,1,17,9999999,1,2,1,,,,44,39,44,,,,2,1,2,,,,26,17,26,,,,8850,0,8850,,, 1970,2,134,,100,4.54,1,0,1,100,1,22,1,16,4850,1,,,,,,22,,,,,,1,,,,,,16,,,,,,4850,,,,, 1970,2,134,,100,4.54,1,0,2,100,1,20,2,23,4850,1,,,,,,22,,,,,,1,,,,,,16,,,,,,4850,,,,, 1970,2,135,,100,4.54,1,0,1,100,1,72,2,25,3350,1,,,2,,,72,,,75,,,2,,,2,,,25,,,26,,,3350,,,0,, 1970,2,135,,100,4.54,1,0,2,100,2,75,2,26,0,1,,,1,,,72,,,72,,,2,,,2,,,25,,,25,,,3350,,,3350,, 1970,2,136,,100,4.54,3,0,1,100,1,14,2,23,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,137,,100,4.54,1,0,1,100,1,26,6,60,650,1,,,2,,,26,,,23,,,6,,,6,,,60,,,60,,,650,,,0,, 1970,2,137,,100,4.54,1,0,2,100,2,23,6,60,0,1,,,1,,,26,,,26,,,6,,,6,,,60,,,60,,,650,,,650,, 1970,2,137,,100,4.54,1,0,3,100,1,3,0,2,9999999,1,2,1,,,,26,23,26,,,,6,6,6,,,,60,60,60,,,,650,0,650,,, 1970,2,137,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,26,23,26,,,,6,6,6,,,,60,60,60,,,,650,0,650,,, 
1970,2,137,,100,4.54,1,0,5,100,1,0,0,1,9999999,1,2,1,,,,26,23,26,,,,6,6,6,,,,60,60,60,,,,650,0,650,,, 1970,2,138,,100,4.54,1,0,1,100,1,32,6,60,-850,1,,,,,,32,,,,,,6,,,,,,60,,,,,,-850,,,,, 1970,2,138,,100,4.54,1,0,2,100,2,38,6,60,2650,1,,,,,,32,,,,,,6,,,,,,60,,,,,,-850,,,,, 1970,2,140,,100,4.54,1,0,1,100,1,57,1,15,3050,1,,,2,,,57,,,52,,,1,,,2,,,15,,,25,,,3050,,,0,, 1970,2,140,,100,4.54,1,0,2,100,2,52,2,25,0,1,,,1,,,57,,,57,,,1,,,1,,,15,,,15,,,3050,,,3050,, 1970,2,140,,100,4.54,1,0,3,100,1,30,6,60,4050,1,2,1,,,,57,52,57,,,,1,2,1,,,,15,25,15,,,,3050,0,3050,,, 1970,2,141,,100,4.54,1,0,1,100,2,75,2,23,5550,2,,,,,,75,,,,,,2,,,,,,23,,,,,,5550,,,,, 1970,2,142,,100,4.54,1,0,1,100,1,76,2,22,750,1,,,,,,76,,,,,,2,,,,,,22,,,,,,750,,,,, 1970,2,142,,100,4.54,1,0,2,100,2,72,2,26,250,1,,,,,,76,,,,,,2,,,,,,22,,,,,,750,,,,, 1970,2,143,,100,4.54,1,0,1,100,1,58,6,60,7050,1,,,2,,,58,,,57,,,6,,,6,,,60,,,60,,,7050,,,6550,, 1970,2,143,,100,4.54,1,0,2,100,2,57,6,60,6550,1,,,1,,,58,,,58,,,6,,,6,,,60,,,60,,,7050,,,7050,, 1970,2,144,,100,4.54,1,0,1,100,1,30,11,110,15050,1,,,2,,,30,,,29,,,11,,,9,,,110,,,90,,,15050,,,0,, 1970,2,144,,100,4.54,1,0,2,100,2,29,9,90,0,1,,,1,,,30,,,30,,,11,,,11,,,110,,,110,,,15050,,,15050,, 1970,2,144,,100,4.54,1,0,3,100,2,3,0,2,9999999,1,2,1,,,,30,29,30,,,,11,9,11,,,,110,90,110,,,,15050,0,15050,,, 1970,2,145,,100,4.54,1,0,1,100,1,30,2,26,5150,1,,,2,,,30,,,29,,,2,,,4,,,26,,,40,,,5150,,,3850,, 1970,2,145,,100,4.54,1,0,2,100,2,29,4,40,3850,1,,,1,,,30,,,30,,,2,,,2,,,26,,,26,,,5150,,,5150,, 1970,2,145,,100,4.54,1,0,3,100,2,12,2,22,9999999,1,2,1,,,,30,29,30,,,,2,4,2,,,,26,40,26,,,,5150,3850,5150,,, 1970,2,145,,100,4.54,1,0,4,100,1,10,1,17,9999999,1,2,1,,,,30,29,30,,,,2,4,2,,,,26,40,26,,,,5150,3850,5150,,, 1970,2,145,,100,4.54,1,0,5,100,2,8,1,14,9999999,1,2,1,,,,30,29,30,,,,2,4,2,,,,26,40,26,,,,5150,3850,5150,,, 1970,2,145,,100,4.54,1,0,6,100,2,3,0,2,9999999,1,2,1,,,,30,29,30,,,,2,4,2,,,,26,40,26,,,,5150,3850,5150,,, 
1970,2,146,,100,4.54,1,0,1,100,2,70,6,60,1950,2,,,,,,70,,,,,,6,,,,,,60,,,,,,1950,,,,, 1970,2,146,,100,4.54,1,0,2,100,1,37,2,26,7550,2,2,,,,,70,70,,,,,6,6,,,,,60,60,,,,,1950,1950,,,, 1970,2,147,,100,4.54,1,0,1,100,1,34,9,90,13050,1,,,2,,,34,,,34,,,9,,,11,,,90,,,110,,,13050,,,0,, 1970,2,147,,100,4.54,1,0,2,100,2,34,11,110,0,1,,,1,,,34,,,34,,,9,,,9,,,90,,,90,,,13050,,,13050,, 1970,2,147,,100,4.54,1,0,3,100,2,9,1,16,9999999,1,2,1,,,,34,34,34,,,,9,11,9,,,,90,110,90,,,,13050,0,13050,,, 1970,2,147,,100,4.54,1,0,4,100,1,7,1,14,9999999,1,2,1,,,,34,34,34,,,,9,11,9,,,,90,110,90,,,,13050,0,13050,,, 1970,2,148,,100,4.54,1,0,1,100,2,79,6,60,1150,2,,,,,,79,,,,,,6,,,,,,60,,,,,,1150,,,,, 1970,2,149,,100,4.54,1,0,1,100,1,70,2,26,11050,1,,,2,,,70,,,68,,,2,,,2,,,26,,,26,,,11050,,,0,, 1970,2,149,,100,4.54,1,0,2,100,2,68,2,26,0,1,,,1,,,70,,,70,,,2,,,2,,,26,,,26,,,11050,,,11050,, 1970,2,150,,100,4.54,1,0,1,100,1,35,6,60,15050,1,,,2,,,35,,,31,,,6,,,6,,,60,,,60,,,15050,,,1550,, 1970,2,150,,100,4.54,1,0,2,100,2,31,6,60,1550,1,,,1,,,35,,,35,,,6,,,6,,,60,,,60,,,15050,,,15050,, 1970,2,150,,100,4.54,1,0,3,100,1,8,1,14,9999999,1,2,1,,,,35,31,35,,,,6,6,6,,,,60,60,60,,,,15050,1550,15050,,, 1970,2,150,,100,4.54,1,0,4,100,2,5,1,11,9999999,1,2,1,,,,35,31,35,,,,6,6,6,,,,60,60,60,,,,15050,1550,15050,,, 1970,2,152,,100,4.54,1,0,1,100,1,58,6,60,12650,1,,,2,,,58,,,55,,,6,,,2,,,60,,,26,,,12650,,,0,, 1970,2,152,,100,4.54,1,0,2,100,2,55,2,26,0,1,,,1,,,58,,,58,,,6,,,6,,,60,,,60,,,12650,,,12650,, 1970,2,153,,100,4.54,1,0,1,100,2,49,4,40,4550,2,,,,,,49,,,,,,4,,,,,,40,,,,,,4550,,,,, 1970,2,153,,100,4.54,1,0,2,100,1,39,5,50,6050,2,2,,,,,49,49,,,,,4,4,,,,,40,40,,,,,4550,4550,,,, 1970,2,154,,100,4.54,3,0,1,100,1,73,2,26,4250,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,155,,100,4.54,1,0,1,100,1,88,4,40,1550,1,,,2,,,88,,,84,,,4,,,2,,,40,,,26,,,1550,,,850,, 1970,2,155,,100,4.54,1,0,2,100,2,84,2,26,850,1,,,1,,,88,,,88,,,4,,,4,,,40,,,40,,,1550,,,1550,, 
1970,2,156,,100,4.54,1,0,1,100,1,25,5,50,10050,1,,,2,,,25,,,24,,,5,,,6,,,50,,,60,,,10050,,,750,, 1970,2,156,,100,4.54,1,0,2,100,2,24,6,60,750,1,,,1,,,25,,,25,,,5,,,5,,,50,,,50,,,10050,,,10050,, 1970,2,156,,100,4.54,1,0,3,100,2,6,1,12,9999999,1,2,1,,,,25,24,25,,,,5,6,5,,,,50,60,50,,,,10050,750,10050,,, 1970,2,156,,100,4.54,1,0,4,100,1,3,0,2,9999999,1,2,1,,,,25,24,25,,,,5,6,5,,,,50,60,50,,,,10050,750,10050,,, 1970,2,157,,100,4.54,1,0,1,100,2,56,0,2,3450,2,,,,,,56,,,,,,0,,,,,,2,,,,,,3450,,,,, 1970,2,158,,100,4.54,3,0,1,100,2,79,6,60,2250,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,159,,100,4.54,1,0,1,100,1,25,8,80,1850,1,,,2,,,25,,,36,,,8,,,6,,,80,,,60,,,1850,,,3250,, 1970,2,159,,100,4.54,1,0,2,100,2,36,6,60,3250,1,,,1,,,25,,,25,,,8,,,8,,,80,,,80,,,1850,,,1850,, 1970,2,160,,100,4.54,1,0,1,100,2,28,6,60,3550,2,,,,,,28,,,,,,6,,,,,,60,,,,,,3550,,,,, 1970,2,160,,100,4.54,1,0,2,100,1,7,1,12,9999999,2,2,,,,,28,28,,,,,6,6,,,,,60,60,,,,,3550,3550,,,, 1970,2,161,,100,4.54,1,0,1,100,2,21,8,80,1350,2,,,,,,21,,,,,,8,,,,,,80,,,,,,1350,,,,, 1970,2,161,,100,4.54,1,0,2,100,2,23,10,100,2750,2,,,,,,21,,,,,,8,,,,,,80,,,,,,1350,,,,, 1970,2,162,,100,4.54,1,0,1,100,1,22,10,100,2050,1,,,2,,,22,,,22,,,10,,,10,,,100,,,100,,,2050,,,1250,, 1970,2,162,,100,4.54,1,0,2,100,2,22,10,100,1250,1,,,1,,,22,,,22,,,10,,,10,,,100,,,100,,,2050,,,2050,, 1970,2,163,,100,4.54,1,0,1,100,2,60,6,60,4350,2,,,,,,60,,,,,,6,,,,,,60,,,,,,4350,,,,, 1970,2,164,,100,4.54,1,0,1,100,1,20,8,80,2250,1,,,,,,20,,,,,,8,,,,,,80,,,,,,2250,,,,, 1970,2,165,,100,4.54,1,0,1,100,1,78,2,23,3250,1,,,,,,78,,,,,,2,,,,,,23,,,,,,3250,,,,, 1970,2,166,,100,4.54,1,0,1,100,1,24,10,100,9050,1,,,2,,,24,,,24,,,10,,,8,,,100,,,80,,,9050,,,3250,, 1970,2,166,,100,4.54,1,0,2,100,2,24,8,80,3250,1,,,1,,,24,,,24,,,10,,,10,,,100,,,100,,,9050,,,9050,, 1970,2,166,,100,4.54,1,0,3,100,1,0,0,1,9999999,1,2,1,,,,24,24,24,,,,10,8,10,,,,100,80,100,,,,9050,3250,9050,,, 1970,2,167,,100,4.54,1,0,1,100,1,34,10,100,17050,1,,,2,,,34,,,33,,,10,,,10,,,100,,,100,,,17050,,,0,, 
1970,2,167,,100,4.54,1,0,2,100,2,33,10,100,0,1,,,1,,,34,,,34,,,10,,,10,,,100,,,100,,,17050,,,17050,, 1970,2,167,,100,4.54,1,0,3,100,1,9,1,16,9999999,1,2,1,,,,34,33,34,,,,10,10,10,,,,100,100,100,,,,17050,0,17050,,, 1970,2,167,,100,4.54,1,0,4,100,2,7,1,12,9999999,1,2,1,,,,34,33,34,,,,10,10,10,,,,100,100,100,,,,17050,0,17050,,, 1970,2,168,,100,4.54,1,0,1,100,1,27,6,60,16650,1,,,2,,,27,,,26,,,6,,,6,,,60,,,60,,,16650,,,0,, 1970,2,168,,100,4.54,1,0,2,100,2,26,6,60,0,1,,,1,,,27,,,27,,,6,,,6,,,60,,,60,,,16650,,,16650,, 1970,2,168,,100,4.54,1,0,3,100,2,5,1,11,9999999,1,2,1,,,,27,26,27,,,,6,6,6,,,,60,60,60,,,,16650,0,16650,,, 1970,2,168,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,27,26,27,,,,6,6,6,,,,60,60,60,,,,16650,0,16650,,, 1970,2,168,,100,4.54,1,0,5,100,2,1,0,1,9999999,1,2,1,,,,27,26,27,,,,6,6,6,,,,60,60,60,,,,16650,0,16650,,, 1970,2,169,,100,4.54,1,0,1,100,1,28,6,65,11250,1,,,2,,,28,,,23,,,6,,,7,,,65,,,70,,,11250,,,6550,, 1970,2,169,,100,4.54,1,0,2,100,2,23,7,70,6550,1,,,1,,,28,,,28,,,6,,,6,,,65,,,65,,,11250,,,11250,, 1970,2,170,,100,4.54,1,0,1,100,1,45,6,60,12950,1,,,2,,,45,,,46,,,6,,,4,,,60,,,40,,,12950,,,0,, 1970,2,170,,100,4.54,1,0,2,100,2,46,4,40,0,1,,,1,,,45,,,45,,,6,,,6,,,60,,,60,,,12950,,,12950,, 1970,2,170,,100,4.54,1,0,3,100,1,18,6,65,1650,1,2,1,,,,45,46,45,,,,6,4,6,,,,60,40,60,,,,12950,0,12950,,, 1970,2,170,,100,4.54,1,0,4,100,2,23,6,60,5550,1,2,1,,,,45,46,45,,,,6,4,6,,,,60,40,60,,,,12950,0,12950,,, 1970,2,170,,100,4.54,1,0,5,100,2,0,0,1,9999999,1,2,,,,,45,23,,,,,6,6,,,,,60,60,,,,,12950,5550,,,, 1970,2,171,,100,4.54,1,0,1,100,1,26,6,60,11450,1,,,2,,,26,,,27,,,6,,,6,,,60,,,60,,,11450,,,0,, 1970,2,171,,100,4.54,1,0,2,100,2,27,6,60,0,1,,,1,,,26,,,26,,,6,,,6,,,60,,,60,,,11450,,,11450,, 1970,2,171,,100,4.54,1,0,3,100,1,4,0,2,9999999,1,2,1,,,,26,27,26,,,,6,6,6,,,,60,60,60,,,,11450,0,11450,,, 1970,2,171,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,26,27,26,,,,6,6,6,,,,60,60,60,,,,11450,0,11450,,, 
1970,2,171,,100,4.54,1,0,5,100,2,0,0,1,9999999,1,2,1,,,,26,27,26,,,,6,6,6,,,,60,60,60,,,,11450,0,11450,,, 1970,2,172,,100,4.54,1,0,1,100,1,33,6,60,1450,1,,,2,,,33,,,37,,,6,,,2,,,60,,,26,,,1450,,,3450,, 1970,2,172,,100,4.54,1,0,2,100,2,37,2,26,3450,1,,,1,,,33,,,33,,,6,,,6,,,60,,,60,,,1450,,,1450,, 1970,2,172,,100,4.54,1,0,3,100,2,6,1,12,9999999,1,2,1,,,,33,37,33,,,,6,2,6,,,,60,26,60,,,,1450,3450,1450,,, 1970,2,172,,100,4.54,1,0,4,100,1,5,1,11,9999999,1,2,1,,,,33,37,33,,,,6,2,6,,,,60,26,60,,,,1450,3450,1450,,, 1970,2,174,,100,4.54,1,0,1,100,1,60,4,40,13850,1,,,2,,,60,,,48,,,4,,,6,,,40,,,60,,,13850,,,0,, 1970,2,174,,100,4.54,1,0,2,100,2,48,6,60,0,1,,,1,,,60,,,60,,,4,,,4,,,40,,,40,,,13850,,,13850,, 1970,2,175,,100,4.54,1,0,1,100,1,37,1,15,3550,1,,,,,,37,,,,,,1,,,,,,15,,,,,,3550,,,,, 1970,2,175,,100,4.54,1,0,2,100,1,29,1,17,1550,1,,,,,,37,,,,,,1,,,,,,15,,,,,,3550,,,,, 1970,2,175,,100,4.54,1,0,3,100,1,28,1,17,2050,1,,,,,,37,,,,,,1,,,,,,15,,,,,,3550,,,,, 1970,2,175,,100,4.54,1,0,4,100,1,29,2,23,1350,1,,,,,,37,,,,,,1,,,,,,15,,,,,,3550,,,,, 1970,2,176,,100,4.54,4,0,1,100,1,34,11,111,11550,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,177,,100,4.54,1,0,1,100,2,75,8,80,50000,2,,,,,,75,,,,,,8,,,,,,80,,,,,,50000,,,,, 1970,2,177,,100,4.54,1,0,2,100,2,44,6,60,0,2,2,,,,,75,75,,,,,8,8,,,,,80,80,,,,,50000,50000,,,, 1970,2,178,,100,4.54,1,0,1,100,2,28,8,80,6050,2,,,,,,28,,,,,,8,,,,,,80,,,,,,6050,,,,, 1970,2,178,,100,4.54,1,0,2,100,1,5,1,11,9999999,2,2,,,,,28,28,,,,,8,8,,,,,80,80,,,,,6050,6050,,,, 1970,2,179,,100,4.54,1,0,1,100,1,35,11,111,19150,1,,,2,,,35,,,30,,,11,,,11,,,111,,,110,,,19150,,,4050,, 1970,2,179,,100,4.54,1,0,2,100,2,30,11,110,4050,1,,,1,,,35,,,35,,,11,,,11,,,111,,,111,,,19150,,,19150,, 1970,2,179,,100,4.54,1,0,3,100,1,4,1,11,9999999,1,2,1,,,,35,30,35,,,,11,11,11,,,,111,110,111,,,,19150,4050,19150,,, 1970,2,180,,100,4.54,1,0,1,100,1,28,10,100,2350,1,,,2,,,28,,,26,,,10,,,11,,,100,,,110,,,2350,,,7250,, 
1970,2,180,,100,4.54,1,0,2,100,2,26,11,110,7250,1,,,1,,,28,,,28,,,10,,,10,,,100,,,100,,,2350,,,2350,, 1970,2,182,,100,4.54,1,0,1,100,1,73,2,25,1050,1,,,,,,73,,,,,,2,,,,,,25,,,,,,1050,,,,, 1970,2,183,,100,4.54,1,0,1,100,1,27,6,60,15050,1,,,2,,,27,,,27,,,6,,,6,,,60,,,60,,,15050,,,0,, 1970,2,183,,100,4.54,1,0,2,100,2,27,6,60,0,1,,,1,,,27,,,27,,,6,,,6,,,60,,,60,,,15050,,,15050,, 1970,2,183,,100,4.54,1,0,3,100,2,4,0,2,9999999,1,2,1,,,,27,27,27,,,,6,6,6,,,,60,60,60,,,,15050,0,15050,,, 1970,2,183,,100,4.54,1,0,4,100,1,0,0,1,9999999,1,2,1,,,,27,27,27,,,,6,6,6,,,,60,60,60,,,,15050,0,15050,,, 1970,2,184,,100,4.54,1,0,1,100,2,49,2,25,2550,2,,,,,,49,,,,,,2,,,,,,25,,,,,,2550,,,,, 1970,2,184,,100,4.54,1,0,2,100,2,48,2,25,0,2,,,,,,49,,,,,,2,,,,,,25,,,,,,2550,,,,, 1970,2,185,,100,4.54,1,0,1,100,1,29,6,60,8250,1,,,2,,,29,,,22,,,6,,,6,,,60,,,60,,,8250,,,5050,, 1970,2,185,,100,4.54,1,0,2,100,2,22,6,60,5050,1,,,1,,,29,,,29,,,6,,,6,,,60,,,60,,,8250,,,8250,, 1970,2,187,,100,4.54,1,0,1,100,1,51,8,80,10050,1,,,2,,,51,,,44,,,8,,,6,,,80,,,60,,,10050,,,1450,, 1970,2,187,,100,4.54,1,0,2,100,2,44,6,60,1450,1,,,1,,,51,,,51,,,8,,,8,,,80,,,80,,,10050,,,10050,, 1970,2,187,,100,4.54,1,0,3,100,2,16,4,40,0,1,2,1,,,,51,44,51,,,,8,6,8,,,,80,60,80,,,,10050,1450,10050,,, 1970,2,187,,100,4.54,1,0,4,100,2,10,1,17,9999999,1,2,1,,,,51,44,51,,,,8,6,8,,,,80,60,80,,,,10050,1450,10050,,, 1970,2,187,,100,4.54,1,0,5,100,2,9,1,15,9999999,1,2,1,,,,51,44,51,,,,8,6,8,,,,80,60,80,,,,10050,1450,10050,,, 1970,2,187,,100,4.54,1,0,6,100,2,19,6,60,0,1,2,1,,,,51,44,51,,,,8,6,8,,,,80,60,80,,,,10050,1450,10050,,, 1970,2,187,,100,4.54,1,0,7,100,2,0,0,1,9999999,1,2,,,,,51,19,,,,,8,6,,,,,80,60,,,,,10050,0,,,, 1970,2,188,,100,4.54,1,0,1,100,1,55,6,60,11250,1,,,2,,,55,,,50,,,6,,,2,,,60,,,26,,,11250,,,650,, 1970,2,188,,100,4.54,1,0,2,100,2,50,2,26,650,1,,,1,,,55,,,55,,,6,,,6,,,60,,,60,,,11250,,,11250,, 1970,2,188,,100,4.54,1,0,3,100,2,10,2,22,9999999,1,2,1,,,,55,50,55,,,,6,2,6,,,,60,26,60,,,,11250,650,11250,,, 
1970,2,189,,100,4.54,1,0,1,100,1,51,6,60,8050,1,,,2,,,51,,,51,,,6,,,4,,,60,,,40,,,8050,,,0,, 1970,2,189,,100,4.54,1,0,2,100,2,51,4,40,0,1,,,1,,,51,,,51,,,6,,,6,,,60,,,60,,,8050,,,8050,, 1970,2,189,,100,4.54,1,0,3,100,2,22,8,80,0,1,2,1,,,,51,51,51,,,,6,4,6,,,,60,40,60,,,,8050,0,8050,,, 1970,2,189,,100,4.54,1,0,4,100,1,17,5,50,1250,1,2,1,,,,51,51,51,,,,6,4,6,,,,60,40,60,,,,8050,0,8050,,, 1970,2,190,,100,4.54,1,0,1,100,1,43,11,111,21150,1,,,2,,,43,,,39,,,11,,,6,,,111,,,60,,,21150,,,0,, 1970,2,190,,100,4.54,1,0,2,100,2,39,6,60,0,1,,,1,,,43,,,43,,,11,,,11,,,111,,,111,,,21150,,,21150,, 1970,2,190,,100,4.54,1,0,3,100,1,15,2,26,0,1,2,1,,,,43,39,43,,,,11,6,11,,,,111,60,111,,,,21150,0,21150,,, 1970,2,190,,100,4.54,1,0,4,100,1,12,2,22,9999999,1,2,1,,,,43,39,43,,,,11,6,11,,,,111,60,111,,,,21150,0,21150,,, 1970,2,190,,100,4.54,1,0,5,100,1,9,1,16,9999999,1,2,1,,,,43,39,43,,,,11,6,11,,,,111,60,111,,,,21150,0,21150,,, 1970,2,191,,100,4.54,1,0,1,100,1,54,3,30,7550,1,,,2,,,54,,,50,,,3,,,2,,,30,,,26,,,7550,,,5050,, 1970,2,191,,100,4.54,1,0,2,100,2,50,2,26,5050,1,,,1,,,54,,,54,,,3,,,3,,,30,,,30,,,7550,,,7550,, 1970,2,192,,100,4.54,1,0,1,100,2,62,2,26,3550,2,,,,,,62,,,,,,2,,,,,,26,,,,,,3550,,,,, 1970,2,193,,100,4.54,1,0,1,100,1,26,4,40,11050,1,,,2,,,26,,,21,,,4,,,6,,,40,,,60,,,11050,,,0,, 1970,2,193,,100,4.54,1,0,2,100,2,21,6,60,0,1,,,1,,,26,,,26,,,4,,,4,,,40,,,40,,,11050,,,11050,, 1970,2,193,,100,4.54,1,0,3,100,1,0,0,1,9999999,1,2,1,,,,26,21,26,,,,4,6,4,,,,40,60,40,,,,11050,0,11050,,, 1970,2,194,,100,4.54,1,0,1,100,1,28,1,16,2550,1,,,2,,,28,,,23,,,1,,,0,,,16,,,2,,,2550,,,1550,, 1970,2,194,,100,4.54,1,0,2,100,2,23,0,2,1550,1,,,1,,,28,,,28,,,1,,,1,,,16,,,16,,,2550,,,2550,, 1970,2,194,,100,4.54,1,0,3,100,1,4,0,2,9999999,1,2,1,,,,28,23,28,,,,1,0,1,,,,16,2,16,,,,2550,1550,2550,,, 1970,2,194,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,28,23,28,,,,1,0,1,,,,16,2,16,,,,2550,1550,2550,,, 1970,2,195,,100,4.54,1,0,1,100,1,48,6,60,9450,1,,,2,,,48,,,45,,,6,,,4,,,60,,,40,,,9450,,,3650,, 
1970,2,195,,100,4.54,1,0,2,100,2,45,4,40,3650,1,,,1,,,48,,,48,,,6,,,6,,,60,,,60,,,9450,,,9450,, 1970,2,196,,100,4.54,1,0,1,100,1,43,3,30,18050,1,,,2,,,43,,,37,,,3,,,6,,,30,,,60,,,18050,,,2550,, 1970,2,196,,100,4.54,1,0,2,100,2,37,6,60,2550,1,,,1,,,43,,,43,,,3,,,3,,,30,,,30,,,18050,,,18050,, 1970,2,196,,100,4.54,1,0,3,100,2,12,2,22,9999999,1,2,1,,,,43,37,43,,,,3,6,3,,,,30,60,30,,,,18050,2550,18050,,, 1970,2,196,,100,4.54,1,0,4,100,2,8,1,14,9999999,1,2,1,,,,43,37,43,,,,3,6,3,,,,30,60,30,,,,18050,2550,18050,,, 1970,2,196,,100,4.54,1,0,5,100,2,14,2,26,0,1,2,1,,,,43,37,43,,,,3,6,3,,,,30,60,30,,,,18050,2550,18050,,, 1970,2,197,,100,4.54,1,0,1,100,1,56,2,23,7650,1,,,,,,56,,,,,,2,,,,,,23,,,,,,7650,,,,, 1970,2,198,,100,4.54,1,0,1,100,1,62,2,22,12050,1,,,2,,,62,,,59,,,2,,,7,,,22,,,70,,,12050,,,2650,, 1970,2,198,,100,4.54,1,0,2,100,2,59,7,70,2650,1,,,1,,,62,,,62,,,2,,,2,,,22,,,22,,,12050,,,12050,, 1970,2,198,,100,4.54,1,0,3,100,2,22,7,70,4350,1,2,1,,,,62,59,62,,,,2,7,2,,,,22,70,22,,,,12050,2650,12050,,, 1970,2,198,,100,4.54,1,0,4,100,2,20,1,14,1050,1,2,1,,,,62,59,62,,,,2,7,2,,,,22,70,22,,,,12050,2650,12050,,, 1970,2,199,,100,4.54,1,0,1,100,1,50,2,26,12050,1,,,2,,,50,,,51,,,2,,,2,,,26,,,25,,,12050,,,0,, 1970,2,199,,100,4.54,1,0,2,100,2,51,2,25,0,1,,,1,,,50,,,50,,,2,,,2,,,26,,,26,,,12050,,,12050,, 1970,2,199,,100,4.54,1,0,3,100,2,18,5,50,750,1,2,1,,,,50,51,50,,,,2,2,2,,,,26,25,26,,,,12050,0,12050,,, 1970,2,200,,100,4.54,1,0,1,100,1,63,4,40,12350,1,,,2,,,63,,,59,,,4,,,6,,,40,,,60,,,12350,,,0,, 1970,2,200,,100,4.54,1,0,2,100,2,59,6,60,0,1,,,1,,,63,,,63,,,4,,,4,,,40,,,40,,,12350,,,12350,, 1970,2,201,,100,4.54,1,0,1,100,1,67,4,40,6350,1,,,2,,,67,,,61,,,4,,,2,,,40,,,26,,,6350,,,0,, 1970,2,201,,100,4.54,1,0,2,100,2,61,2,26,0,1,,,1,,,67,,,67,,,4,,,4,,,40,,,40,,,6350,,,6350,, 1970,2,202,,100,4.54,1,0,1,100,1,51,6,60,10050,1,,,2,,,51,,,45,,,6,,,6,,,60,,,60,,,10050,,,0,, 1970,2,202,,100,4.54,1,0,2,100,2,45,6,60,0,1,,,1,,,51,,,51,,,6,,,6,,,60,,,60,,,10050,,,10050,, 
1970,2,202,,100,4.54,1,0,3,100,2,17,4,40,150,1,2,1,,,,51,45,51,,,,6,6,6,,,,60,60,60,,,,10050,0,10050,,, 1970,2,203,,100,4.54,1,0,1,100,1,42,6,60,13750,1,,,2,,,42,,,43,,,6,,,3,,,60,,,30,,,13750,,,2750,, 1970,2,203,,100,4.54,1,0,2,100,2,43,3,30,2750,1,,,1,,,42,,,42,,,6,,,6,,,60,,,60,,,13750,,,13750,, 1970,2,203,,100,4.54,1,0,3,100,1,19,5,50,1550,1,2,1,,,,42,43,42,,,,6,3,6,,,,60,30,60,,,,13750,2750,13750,,, 1970,2,203,,100,4.54,1,0,4,100,1,15,3,30,50,1,2,1,,,,42,43,42,,,,6,3,6,,,,60,30,60,,,,13750,2750,13750,,, 1970,2,203,,100,4.54,1,0,5,100,2,11,1,17,9999999,1,2,1,,,,42,43,42,,,,6,3,6,,,,60,30,60,,,,13750,2750,13750,,, 1970,2,204,,100,4.54,1,0,1,100,1,46,2,23,5250,1,,,2,,,46,,,42,,,2,,,1,,,23,,,17,,,5250,,,0,, 1970,2,204,,100,4.54,1,0,2,100,2,42,1,17,0,1,,,1,,,46,,,46,,,2,,,2,,,23,,,23,,,5250,,,5250,, 1970,2,205,,100,4.54,1,0,1,100,2,47,2,26,3550,2,,,,,,47,,,,,,2,,,,,,26,,,,,,3550,,,,, 1970,2,205,,100,4.54,1,0,2,100,1,8,1,14,9999999,2,2,,,,,47,28,,,,,2,5,,,,,26,50,,,,,3550,6050,,,, 1970,2,205,,100,4.54,1,0,3,100,2,28,5,50,6050,2,2,,,,,47,47,,,,,2,2,,,,,26,26,,,,,3550,3550,,,, 1970,2,205,,100,4.54,1,0,4,100,1,1,0,1,9999999,2,2,,,,,47,28,,,,,2,5,,,,,26,50,,,,,3550,6050,,,, 1970,2,206,,100,4.54,1,0,1,100,1,29,2,23,7250,1,,,2,,,29,,,24,,,2,,,2,,,23,,,26,,,7250,,,0,, 1970,2,206,,100,4.54,1,0,2,100,2,24,2,26,0,1,,,1,,,29,,,29,,,2,,,2,,,23,,,23,,,7250,,,7250,, 1970,2,206,,100,4.54,1,0,3,100,1,2,0,1,9999999,1,2,1,,,,29,24,29,,,,2,2,2,,,,23,26,23,,,,7250,0,7250,,, 1970,2,206,,100,4.54,1,0,4,100,1,0,0,1,9999999,1,2,1,,,,29,24,29,,,,2,2,2,,,,23,26,23,,,,7250,0,7250,,, 1970,2,207,,100,4.54,1,0,1,100,2,49,6,60,7450,2,,,,,,49,,,,,,6,,,,,,60,,,,,,7450,,,,, 1970,2,208,,100,4.54,1,0,1,100,1,35,4,40,7250,1,,,2,,,35,,,29,,,4,,,2,,,40,,,22,,,7250,,,0,, 1970,2,208,,100,4.54,1,0,2,100,2,29,2,22,0,1,,,1,,,35,,,35,,,4,,,4,,,40,,,40,,,7250,,,7250,, 1970,2,208,,100,4.54,1,0,3,100,2,7,1,16,9999999,1,2,1,,,,35,29,35,,,,4,2,4,,,,40,22,40,,,,7250,0,7250,,, 
1970,2,208,,100,4.54,1,0,4,100,1,8,1,14,9999999,1,2,1,,,,35,29,35,,,,4,2,4,,,,40,22,40,,,,7250,0,7250,,, 1970,2,208,,100,4.54,1,0,5,100,1,5,0,2,9999999,1,2,1,,,,35,29,35,,,,4,2,4,,,,40,22,40,,,,7250,0,7250,,, 1970,2,208,,100,4.54,1,0,6,100,1,3,0,2,9999999,1,2,1,,,,35,29,35,,,,4,2,4,,,,40,22,40,,,,7250,0,7250,,, 1970,2,208,,100,4.54,1,0,7,100,1,0,0,1,9999999,1,2,1,,,,35,29,35,,,,4,2,4,,,,40,22,40,,,,7250,0,7250,,, 1970,2,209,,100,4.54,1,0,1,100,1,29,4,40,7050,1,,,2,,,29,,,40,,,4,,,0,,,40,,,2,,,7050,,,0,, 1970,2,209,,100,4.54,1,0,2,100,2,40,0,2,0,1,,,1,,,29,,,29,,,4,,,4,,,40,,,40,,,7050,,,7050,, 1970,2,209,,100,4.54,1,0,3,100,1,7,1,14,9999999,1,2,1,,,,29,40,29,,,,4,0,4,,,,40,2,40,,,,7050,0,7050,,, 1970,2,210,,100,4.54,1,0,1,100,1,77,1,15,1750,1,,,2,,,77,,,64,,,1,,,2,,,15,,,26,,,1750,,,1050,, 1970,2,210,,100,4.54,1,0,2,100,2,64,2,26,1050,1,,,1,,,77,,,77,,,1,,,1,,,15,,,15,,,1750,,,1750,, 1970,2,210,,100,4.54,1,0,3,100,1,26,6,65,5350,1,,,,,,77,,,,,,1,,,,,,15,,,,,,1750,,,,, 1970,2,211,,100,4.54,1,0,1,100,1,92,0,2,2250,1,,,2,,,92,,,76,,,0,,,1,,,2,,,16,,,2250,,,1150,, 1970,2,211,,100,4.54,1,0,2,100,2,76,1,16,1150,1,,,1,,,92,,,92,,,0,,,0,,,2,,,2,,,2250,,,2250,, 1970,2,212,,100,4.54,1,0,1,100,1,41,6,60,7350,1,,,2,,,41,,,38,,,6,,,4,,,60,,,40,,,7350,,,0,, 1970,2,212,,100,4.54,1,0,2,100,2,38,4,40,0,1,,,1,,,41,,,41,,,6,,,6,,,60,,,60,,,7350,,,7350,, 1970,2,212,,100,4.54,1,0,3,100,1,14,2,26,650,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 1970,2,212,,100,4.54,1,0,4,100,2,18,6,60,0,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 1970,2,212,,100,4.54,1,0,5,100,1,17,4,40,0,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 1970,2,212,,100,4.54,1,0,6,100,2,12,2,23,9999999,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 1970,2,212,,100,4.54,1,0,7,100,1,10,1,16,9999999,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 1970,2,212,,100,4.54,1,0,8,100,2,15,2,25,2150,1,2,1,,,,41,38,41,,,,6,4,6,,,,60,40,60,,,,7350,0,7350,,, 
1970,2,213,,100,4.54,1,0,1,100,2,55,2,22,1650,2,,,,,,55,,,,,,2,,,,,,22,,,,,,1650,,,,, 1970,2,213,,100,4.54,1,0,2,100,1,23,6,60,0,2,2,,,,,55,55,,,,,2,2,,,,,22,22,,,,,1650,1650,,,, 1970,2,213,,100,4.54,1,0,3,100,1,21,6,65,0,2,2,,,,,55,55,,,,,2,2,,,,,22,22,,,,,1650,1650,,,, 1970,2,213,,100,4.54,1,0,4,100,1,16,4,40,0,2,2,,,,,55,55,,,,,2,2,,,,,22,22,,,,,1650,1650,,,, 1970,2,213,,100,4.54,1,0,5,100,1,13,2,26,9999999,2,2,,,,,55,55,,,,,2,2,,,,,22,22,,,,,1650,1650,,,, 1970,2,213,,100,4.54,1,0,6,100,1,12,2,23,9999999,2,2,,,,,55,55,,,,,2,2,,,,,22,22,,,,,1650,1650,,,, 1970,2,214,,100,4.54,1,0,1,100,1,37,2,25,9550,1,,,2,,,37,,,47,,,2,,,6,,,25,,,60,,,9550,,,10650,, 1970,2,214,,100,4.54,1,0,2,100,2,47,6,60,10650,1,,,1,,,37,,,37,,,2,,,2,,,25,,,25,,,9550,,,9550,, 1970,2,214,,100,4.54,1,0,3,100,1,19,6,65,1950,1,2,1,,,,37,47,37,,,,2,6,2,,,,25,60,25,,,,9550,10650,9550,,, 1970,2,215,,100,4.54,1,0,1,100,1,52,6,60,15350,1,,,2,,,52,,,51,,,6,,,6,,,60,,,60,,,15350,,,0,, 1970,2,215,,100,4.54,1,0,2,100,2,51,6,60,0,1,,,1,,,52,,,52,,,6,,,6,,,60,,,60,,,15350,,,15350,, 1970,2,215,,100,4.54,1,0,3,100,1,26,6,60,5750,1,2,1,,,,52,51,52,,,,6,6,6,,,,60,60,60,,,,15350,0,15350,,, 1970,2,216,,100,4.54,1,0,1,100,1,50,10,100,6450,1,,,2,,,50,,,38,,,10,,,6,,,100,,,60,,,6450,,,2450,, 1970,2,216,,100,4.54,1,0,2,100,2,38,6,60,2450,1,,,1,,,50,,,50,,,10,,,10,,,100,,,100,,,6450,,,6450,, 1970,2,216,,100,4.54,1,0,3,100,1,17,4,40,650,1,2,1,,,,50,38,50,,,,10,6,10,,,,100,60,100,,,,6450,2450,6450,,, 1970,2,216,,100,4.54,1,0,4,100,1,16,4,40,950,1,2,1,,,,50,38,50,,,,10,6,10,,,,100,60,100,,,,6450,2450,6450,,, 1970,2,216,,100,4.54,1,0,5,100,1,14,2,26,150,1,2,1,,,,50,38,50,,,,10,6,10,,,,100,60,100,,,,6450,2450,6450,,, 1970,2,217,,100,4.54,1,0,1,100,1,39,6,60,11050,1,,,2,,,39,,,41,,,6,,,6,,,60,,,60,,,11050,,,1250,, 1970,2,217,,100,4.54,1,0,2,100,2,41,6,60,1250,1,,,1,,,39,,,39,,,6,,,6,,,60,,,60,,,11050,,,11050,, 1970,2,217,,100,4.54,1,0,3,100,1,17,4,40,0,1,2,1,,,,39,41,39,,,,6,6,6,,,,60,60,60,,,,11050,1250,11050,,, 
1970,2,218,,100,4.54,1,0,1,100,1,29,6,60,17750,1,,,2,,,29,,,29,,,6,,,6,,,60,,,60,,,17750,,,0,, 1970,2,218,,100,4.54,1,0,2,100,2,29,6,60,0,1,,,1,,,29,,,29,,,6,,,6,,,60,,,60,,,17750,,,17750,, 1970,2,218,,100,4.54,1,0,3,100,1,6,1,12,9999999,1,2,1,,,,29,29,29,,,,6,6,6,,,,60,60,60,,,,17750,0,17750,,, 1970,2,218,,100,4.54,1,0,4,100,2,3,0,2,9999999,1,2,1,,,,29,29,29,,,,6,6,6,,,,60,60,60,,,,17750,0,17750,,, 1970,2,219,,100,4.54,1,0,1,100,1,30,9,90,10050,1,,,2,,,30,,,22,,,9,,,6,,,90,,,60,,,10050,,,6050,, 1970,2,219,,100,4.54,1,0,2,100,2,22,6,60,6050,1,,,1,,,30,,,30,,,9,,,9,,,90,,,90,,,10050,,,10050,, 1970,2,220,,100,4.54,1,0,1,100,1,34,10,100,12050,1,,,2,,,34,,,30,,,10,,,10,,,100,,,100,,,12050,,,0,, 1970,2,220,,100,4.54,1,0,2,100,2,30,10,100,0,1,,,1,,,34,,,34,,,10,,,10,,,100,,,100,,,12050,,,12050,, 1970,2,221,,100,4.54,1,0,1,100,1,28,10,100,35050,1,,,,,,28,,,,,,10,,,,,,100,,,,,,35050,,,,, 1970,2,222,,100,4.54,1,0,1,100,1,31,6,60,9850,1,,,2,,,31,,,26,,,6,,,7,,,60,,,70,,,9850,,,0,, 1970,2,222,,100,4.54,1,0,2,100,2,26,7,70,0,1,,,1,,,31,,,31,,,6,,,6,,,60,,,60,,,9850,,,9850,, 1970,2,222,,100,4.54,1,0,3,100,1,2,0,1,9999999,1,2,1,,,,31,26,31,,,,6,7,6,,,,60,70,60,,,,9850,0,9850,,, 1970,2,222,,100,4.54,1,0,4,100,2,0,0,1,9999999,1,2,1,,,,31,26,31,,,,6,7,6,,,,60,70,60,,,,9850,0,9850,,, 1970,2,223,,100,4.54,1,0,1,100,1,34,6,60,13250,1,,,2,,,34,,,30,,,6,,,6,,,60,,,60,,,13250,,,0,, 1970,2,223,,100,4.54,1,0,2,100,2,30,6,60,0,1,,,1,,,34,,,34,,,6,,,6,,,60,,,60,,,13250,,,13250,, 1970,2,223,,100,4.54,1,0,3,100,1,7,1,14,9999999,1,2,1,,,,34,30,34,,,,6,6,6,,,,60,60,60,,,,13250,0,13250,,, 1970,2,223,,100,4.54,1,0,4,100,1,5,0,2,9999999,1,2,1,,,,34,30,34,,,,6,6,6,,,,60,60,60,,,,13250,0,13250,,, 1970,2,223,,100,4.54,1,0,5,100,1,2,0,1,9999999,1,2,1,,,,34,30,34,,,,6,6,6,,,,60,60,60,,,,13250,0,13250,,, 1970,2,224,,100,4.54,1,0,1,100,1,52,8,80,15250,1,,,2,,,52,,,56,,,8,,,7,,,80,,,70,,,15250,,,0,, 1970,2,224,,100,4.54,1,0,2,100,2,56,7,70,0,1,,,1,,,52,,,52,,,8,,,8,,,80,,,80,,,15250,,,15250,, 
1970,2,224,,100,4.54,1,0,3,100,2,19,6,60,3950,1,2,1,,,,52,56,52,,,,8,7,8,,,,80,70,80,,,,15250,0,15250,,, 1970,2,225,,100,4.54,1,0,1,100,1,34,7,70,14050,1,,,2,,,34,,,32,,,7,,,8,,,70,,,80,,,14050,,,0,, 1970,2,225,,100,4.54,1,0,2,100,2,32,8,80,0,1,,,1,,,34,,,34,,,7,,,7,,,70,,,70,,,14050,,,14050,, 1970,2,225,,100,4.54,1,0,3,100,2,8,1,15,9999999,1,2,1,,,,34,32,34,,,,7,8,7,,,,70,80,70,,,,14050,0,14050,,, 1970,2,225,,100,4.54,1,0,4,100,1,2,0,1,9999999,1,2,1,,,,34,32,34,,,,7,8,7,,,,70,80,70,,,,14050,0,14050,,, 1970,2,226,,100,4.54,1,0,1,100,1,39,7,70,15650,1,,,2,,,39,,,38,,,7,,,3,,,70,,,30,,,15650,,,0,, 1970,2,226,,100,4.54,1,0,2,100,2,38,3,30,0,1,,,1,,,39,,,39,,,7,,,7,,,70,,,70,,,15650,,,15650,, 1970,2,226,,100,4.54,1,0,3,100,2,11,1,17,9999999,1,2,1,,,,39,38,39,,,,7,3,7,,,,70,30,70,,,,15650,0,15650,,, 1970,2,226,,100,4.54,1,0,4,100,2,9,1,15,9999999,1,2,1,,,,39,38,39,,,,7,3,7,,,,70,30,70,,,,15650,0,15650,,, 1970,2,226,,100,4.54,1,0,5,100,2,5,0,2,9999999,1,2,1,,,,39,38,39,,,,7,3,7,,,,70,30,70,,,,15650,0,15650,,, 1970,2,226,,100,4.54,1,0,6,100,1,2,0,1,9999999,1,2,1,,,,39,38,39,,,,7,3,7,,,,70,30,70,,,,15650,0,15650,,, 1970,2,226,,100,4.54,1,0,7,100,1,1,0,1,9999999,1,2,1,,,,39,38,39,,,,7,3,7,,,,70,30,70,,,,15650,0,15650,,, 1970,2,227,,100,4.54,1,0,1,100,2,45,6,60,8050,2,,,,,,45,,,,,,6,,,,,,60,,,,,,8050,,,,, 1970,2,227,,100,4.54,1,0,2,100,2,17,5,50,1350,2,2,,,,,45,45,,,,,6,6,,,,,60,60,,,,,8050,8050,,,, 1970,2,227,,100,4.54,1,0,3,100,2,20,8,80,2050,2,2,,,,,45,45,,,,,6,6,,,,,60,60,,,,,8050,8050,,,, 1970,2,228,,100,4.54,1,0,1,100,1,60,8,80,18150,1,,,2,,,60,,,56,,,8,,,6,,,80,,,60,,,18150,,,250,, 1970,2,228,,100,4.54,1,0,2,100,2,56,6,60,250,1,,,1,,,60,,,60,,,8,,,8,,,80,,,80,,,18150,,,18150,, 1970,2,228,,100,4.54,1,0,3,100,2,62,2,26,0,1,,,,,,60,,,,,,8,,,,,,80,,,,,,18150,,,,, 1970,2,230,,100,4.54,1,0,1,100,2,63,7,70,4250,2,,,,,,63,,,,,,7,,,,,,70,,,,,,4250,,,,, 1970,2,230,,100,4.54,1,0,2,100,2,57,6,60,4350,2,,,,,,63,,,,,,7,,,,,,70,,,,,,4250,,,,, 
1970,2,231,,100,4.54,1,0,1,100,2,65,5,50,2250,2,,,,,,65,,,,,,5,,,,,,50,,,,,,2250,,,,, 1970,2,232,,100,4.54,1,0,1,100,1,75,2,26,23850,1,,,2,,,75,,,58,,,2,,,6,,,26,,,60,,,23850,,,1850,, 1970,2,232,,100,4.54,1,0,2,100,2,58,6,60,1850,1,,,1,,,75,,,75,,,2,,,2,,,26,,,26,,,23850,,,23850,, 1970,2,233,,100,4.54,4,4,1,100,1,42,2,22,150,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,234,,100,4.54,1,0,1,100,1,50,1,17,3850,1,,,,,,50,,,,,,1,,,,,,17,,,,,,3850,,,,, 1970,2,235,,100,4.54,1,0,1,100,2,22,5,50,1050,2,,,,,,22,,,,,,5,,,,,,50,,,,,,1050,,,,, 1970,2,235,,100,4.54,1,0,2,100,1,3,0,2,9999999,2,2,,,,,22,22,,,,,5,5,,,,,50,50,,,,,1050,1050,,,, 1970,2,235,,100,4.54,1,0,3,100,2,20,6,60,3450,2,,,,,,22,,,,,,5,,,,,,50,,,,,,1050,,,,, 1970,2,235,,100,4.54,1,0,4,100,1,1,0,1,9999999,2,2,,,,,22,20,,,,,5,6,,,,,50,60,,,,,1050,3450,,,, 1970,2,236,,100,4.54,1,0,1,100,2,79,2,26,2150,2,,,,,,79,,,,,,2,,,,,,26,,,,,,2150,,,,, 1970,2,237,,100,4.54,1,0,1,100,2,45,3,30,2150,2,,,,,,45,,,,,,3,,,,,,30,,,,,,2150,,,,, 1970,2,237,,100,4.54,1,0,2,100,2,12,2,22,9999999,2,,,,,,45,,,,,,3,,,,,,30,,,,,,2150,,,,, 1970,2,238,,100,4.54,1,0,1,100,2,40,6,60,2750,2,,,,,,40,,,,,,6,,,,,,60,,,,,,2750,,,,, 1970,2,238,,100,4.54,1,0,2,100,1,18,6,60,650,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,238,,100,4.54,1,0,3,100,1,17,3,30,350,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,238,,100,4.54,1,0,4,100,1,13,2,25,9999999,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,238,,100,4.54,1,0,5,100,1,11,1,17,9999999,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,238,,100,4.54,1,0,6,100,2,10,1,17,9999999,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,238,,100,4.54,1,0,7,100,2,9,1,15,9999999,2,2,,,,,40,40,,,,,6,6,,,,,60,60,,,,,2750,2750,,,, 1970,2,240,,100,4.54,1,0,1,100,1,49,4,40,350,1,,,2,,,49,,,54,,,4,,,5,,,40,,,50,,,350,,,750,, 1970,2,240,,100,4.54,1,0,2,100,2,54,5,50,750,1,,,1,,,49,,,49,,,4,,,4,,,40,,,40,,,350,,,350,, 
1970,2,240,,100,4.54,1,0,3,100,2,2,0,1,9999999,1,2,1,,,,49,54,49,,,,4,5,4,,,,40,50,40,,,,350,750,350,,, 1970,2,241,,100,4.54,1,0,1,100,1,65,2,26,5550,1,,,2,,,65,,,49,,,2,,,4,,,26,,,40,,,5550,,,5350,, 1970,2,241,,100,4.54,1,0,2,100,2,49,4,40,5350,1,,,1,,,65,,,65,,,2,,,2,,,26,,,26,,,5550,,,5550,, 1970,2,241,,100,4.54,1,0,3,100,1,21,6,65,7550,1,2,1,,,,65,49,65,,,,2,4,2,,,,26,40,26,,,,5550,5350,5550,,, 1970,2,241,,100,4.54,1,0,4,100,2,19,5,50,550,1,2,1,,,,65,49,65,,,,2,4,2,,,,26,40,26,,,,5550,5350,5550,,, 1970,2,241,,100,4.54,1,0,5,100,2,74,6,60,650,1,,,,,,65,,,,,,2,,,,,,26,,,,,,5550,,,,, 1970,2,242,,100,4.54,1,0,1,100,1,72,7,70,1250,1,,,2,,,72,,,62,,,7,,,2,,,70,,,23,,,1250,,,350,, 1970,2,242,,100,4.54,1,0,2,100,2,62,2,23,350,1,,,1,,,72,,,72,,,7,,,7,,,70,,,70,,,1250,,,1250,, 1970,2,243,,100,4.54,1,0,1,100,1,28,6,60,12050,1,,,2,,,28,,,28,,,6,,,6,,,60,,,60,,,12050,,,1550,, 1970,2,243,,100,4.54,1,0,2,100,2,28,6,60,1550,1,,,1,,,28,,,28,,,6,,,6,,,60,,,60,,,12050,,,12050,, 1970,2,243,,100,4.54,1,0,3,100,1,6,1,11,9999999,1,2,1,,,,28,28,28,,,,6,6,6,,,,60,60,60,,,,12050,1550,12050,,, 1970,2,243,,100,4.54,1,0,4,100,2,4,1,11,9999999,1,2,1,,,,28,28,28,,,,6,6,6,,,,60,60,60,,,,12050,1550,12050,,, 1970,2,243,,100,4.54,1,0,5,100,2,0,0,1,9999999,1,2,1,,,,28,28,28,,,,6,6,6,,,,60,60,60,,,,12050,1550,12050,,, 1970,2,244,,100,4.54,1,0,1,100,1,28,6,60,8050,1,,,,,,28,,,,,,6,,,,,,60,,,,,,8050,,,,, 1970,2,244,,100,4.54,1,0,2,100,2,25,6,60,6550,1,,,,,,28,,,,,,6,,,,,,60,,,,,,8050,,,,, 1970,2,245,,100,4.54,1,0,1,100,1,44,4,40,10050,1,,,2,,,44,,,43,,,4,,,4,,,40,,,40,,,10050,,,1450,, 1970,2,245,,100,4.54,1,0,2,100,2,43,4,40,1450,1,,,1,,,44,,,44,,,4,,,4,,,40,,,40,,,10050,,,10050,, 1970,2,245,,100,4.54,1,0,3,100,2,18,6,65,2750,1,2,1,,,,44,43,44,,,,4,4,4,,,,40,40,40,,,,10050,1450,10050,,, 1970,2,245,,100,4.54,1,0,4,100,1,11,1,17,9999999,1,2,1,,,,44,43,44,,,,4,4,4,,,,40,40,40,,,,10050,1450,10050,,, 1970,2,246,,100,4.54,1,0,1,100,1,27,10,100,5050,1,,,,,,27,,,,,,10,,,,,,100,,,,,,5050,,,,, 
1970,2,246,,100,4.54,1,0,2,100,1,23,9,90,7350,1,,,,,,27,,,,,,10,,,,,,100,,,,,,5050,,,,, 1970,2,247,,100,4.54,1,0,1,100,1,59,4,40,350,1,,,,,,59,,,,,,4,,,,,,40,,,,,,350,,,,, 1970,2,247,,100,4.54,1,0,2,100,2,52,4,40,7350,1,,,,,,59,,,,,,4,,,,,,40,,,,,,350,,,,, 1970,2,248,,100,4.54,1,0,1,100,1,57,4,40,10750,1,,,2,,,57,,,38,,,4,,,6,,,40,,,60,,,10750,,,0,, 1970,2,248,,100,4.54,1,0,2,100,2,38,6,60,0,1,,,1,,,57,,,57,,,4,,,4,,,40,,,40,,,10750,,,10750,, 1970,2,248,,100,4.54,1,0,3,100,2,16,2,23,0,1,2,1,,,,57,38,57,,,,4,6,4,,,,40,60,40,,,,10750,0,10750,,, 1970,2,248,,100,4.54,1,0,4,100,2,4,0,2,9999999,1,2,1,,,,57,38,57,,,,4,6,4,,,,40,60,40,,,,10750,0,10750,,, 1970,2,248,,100,4.54,1,0,5,100,2,8,1,14,9999999,1,2,1,,,,57,38,57,,,,4,6,4,,,,40,60,40,,,,10750,0,10750,,, 1970,2,250,,100,4.54,1,0,1,100,1,59,6,60,7050,1,,,,,,59,,,,,,6,,,,,,60,,,,,,7050,,,,, 1970,2,251,,100,4.54,1,0,1,100,1,41,6,60,10050,1,,,2,,,41,,,33,,,6,,,6,,,60,,,60,,,10050,,,0,, 1970,2,251,,100,4.54,1,0,2,100,2,33,6,60,0,1,,,1,,,41,,,41,,,6,,,6,,,60,,,60,,,10050,,,10050,, 1970,2,251,,100,4.54,1,0,3,100,1,10,2,22,9999999,1,2,1,,,,41,33,41,,,,6,6,6,,,,60,60,60,,,,10050,0,10050,,, 1970,2,251,,100,4.54,1,0,4,100,1,6,1,11,9999999,1,2,1,,,,41,33,41,,,,6,6,6,,,,60,60,60,,,,10050,0,10050,,, 1970,2,252,,100,4.54,1,0,1,100,1,51,2,26,10050,1,,,2,,,51,,,46,,,2,,,2,,,26,,,26,,,10050,,,0,, 1970,2,252,,100,4.54,1,0,2,100,2,46,2,26,0,1,,,1,,,51,,,51,,,2,,,2,,,26,,,26,,,10050,,,10050,, 1970,2,252,,100,4.54,1,0,3,100,1,15,2,26,0,1,2,1,,,,51,46,51,,,,2,2,2,,,,26,26,26,,,,10050,0,10050,,, 1970,2,253,,100,4.54,1,0,1,100,2,47,6,60,8050,2,,,,,,47,,,,,,6,,,,,,60,,,,,,8050,,,,, 1970,2,254,,100,4.54,1,0,1,100,1,33,8,80,6050,1,,,2,,,33,,,41,,,8,,,6,,,80,,,60,,,6050,,,5050,, 1970,2,254,,100,4.54,1,0,2,100,2,41,6,60,5050,1,,,1,,,33,,,33,,,8,,,8,,,80,,,80,,,6050,,,6050,, 1970,2,254,,100,4.54,1,0,3,100,1,15,3,30,150,1,2,1,,,,33,41,33,,,,8,6,8,,,,80,60,80,,,,6050,5050,6050,,, 
1970,2,254,,100,4.54,1,0,4,100,1,14,2,26,0,1,2,1,,,,33,41,33,,,,8,6,8,,,,80,60,80,,,,6050,5050,6050,,, 1970,2,254,,100,4.54,1,0,5,100,2,12,2,23,9999999,1,2,1,,,,33,41,33,,,,8,6,8,,,,80,60,80,,,,6050,5050,6050,,, 1970,2,254,,100,4.54,1,0,6,100,2,47,8,80,6050,1,,,,,,33,,,,,,8,,,,,,80,,,,,,6050,,,,, 1970,2,255,,100,4.54,1,0,1,100,1,42,6,60,20050,1,,,2,,,42,,,36,,,6,,,10,,,60,,,100,,,20050,,,13150,, 1970,2,255,,100,4.54,1,0,2,100,2,36,10,100,13150,1,,,1,,,42,,,42,,,6,,,6,,,60,,,60,,,20050,,,20050,, 1970,2,256,,100,4.54,1,0,1,100,1,62,11,111,16950,1,,,2,,,62,,,57,,,11,,,10,,,111,,,100,,,16950,,,50,, 1970,2,256,,100,4.54,1,0,2,100,2,57,10,100,50,1,,,1,,,62,,,62,,,11,,,11,,,111,,,111,,,16950,,,16950,, 1970,2,256,,100,4.54,1,0,3,100,2,19,7,70,850,1,2,1,,,,62,57,62,,,,11,10,11,,,,111,100,111,,,,16950,50,16950,,, 1970,2,257,,100,4.54,1,0,1,100,1,38,11,111,23050,1,,,2,,,38,,,37,,,11,,,9,,,111,,,90,,,23050,,,0,, 1970,2,257,,100,4.54,1,0,2,100,2,37,9,90,0,1,,,1,,,38,,,38,,,11,,,11,,,111,,,111,,,23050,,,23050,, 1970,2,257,,100,4.54,1,0,3,100,2,14,2,26,0,1,2,1,,,,38,37,38,,,,11,9,11,,,,111,90,111,,,,23050,0,23050,,, 1970,2,257,,100,4.54,1,0,4,100,2,11,2,22,9999999,1,2,1,,,,38,37,38,,,,11,9,11,,,,111,90,111,,,,23050,0,23050,,, 1970,2,257,,100,4.54,1,0,5,100,1,9,1,16,9999999,1,2,1,,,,38,37,38,,,,11,9,11,,,,111,90,111,,,,23050,0,23050,,, 1970,2,258,,100,4.54,1,0,1,100,1,38,6,65,3050,1,,,,,,38,,,,,,6,,,,,,65,,,,,,3050,,,,, 1970,2,259,,100,4.54,1,0,1,100,2,69,3,30,4150,2,,,,,,69,,,,,,3,,,,,,30,,,,,,4150,,,,, 1970,2,259,,100,4.54,1,0,2,100,1,26,2,23,1650,2,,,,,,69,,,,,,3,,,,,,30,,,,,,4150,,,,, 1970,2,260,,100,4.54,3,0,1,100,2,89,6,60,850,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,261,,100,4.54,1,0,1,100,1,36,6,60,11450,1,,,2,,,36,,,35,,,6,,,6,,,60,,,60,,,11450,,,250,, 1970,2,261,,100,4.54,1,0,2,100,2,35,6,60,250,1,,,1,,,36,,,36,,,6,,,6,,,60,,,60,,,11450,,,11450,, 1970,2,261,,100,4.54,1,0,3,100,2,8,1,14,9999999,1,2,1,,,,36,35,36,,,,6,6,6,,,,60,60,60,,,,11450,250,11450,,, 
1970,2,261,,100,4.54,1,0,4,100,2,5,0,2,9999999,1,2,1,,,,36,35,36,,,,6,6,6,,,,60,60,60,,,,11450,250,11450,,, 1970,2,262,,100,4.54,1,0,1,100,1,59,6,60,18250,1,,,2,,,59,,,51,,,6,,,6,,,60,,,60,,,18250,,,0,, 1970,2,262,,100,4.54,1,0,2,100,2,51,6,60,0,1,,,1,,,59,,,59,,,6,,,6,,,60,,,60,,,18250,,,18250,, 1970,2,263,,100,4.54,1,0,1,100,1,48,6,60,10050,1,,,2,,,48,,,47,,,6,,,3,,,60,,,30,,,10050,,,0,, 1970,2,263,,100,4.54,1,0,2,100,2,47,3,30,0,1,,,1,,,48,,,48,,,6,,,6,,,60,,,60,,,10050,,,10050,, 1970,2,263,,100,4.54,1,0,3,100,1,20,6,60,0,1,2,1,,,,48,47,48,,,,6,3,6,,,,60,30,60,,,,10050,0,10050,,, 1970,2,263,,100,4.54,1,0,4,100,1,14,2,23,0,1,2,1,,,,48,47,48,,,,6,3,6,,,,60,30,60,,,,10050,0,10050,,, 1970,2,263,,100,4.54,1,0,5,100,2,11,1,16,9999999,1,2,1,,,,48,47,48,,,,6,3,6,,,,60,30,60,,,,10050,0,10050,,, 1970,2,264,,100,4.54,1,0,1,100,1,34,11,110,10850,1,,,2,,,34,,,34,,,11,,,11,,,110,,,110,,,10850,,,150,, 1970,2,264,,100,4.54,1,0,2,100,2,34,11,110,150,1,,,1,,,34,,,34,,,11,,,11,,,110,,,110,,,10850,,,10850,, 1970,2,264,,100,4.54,1,0,3,100,2,5,1,11,9999999,1,2,1,,,,34,34,34,,,,11,11,11,,,,110,110,110,,,,10850,150,10850,,, 1970,2,264,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,34,34,34,,,,11,11,11,,,,110,110,110,,,,10850,150,10850,,, 1970,2,265,,100,4.54,1,0,1,100,1,67,8,80,5450,1,,,2,,,67,,,62,,,8,,,7,,,80,,,70,,,5450,,,11550,, 1970,2,265,,100,4.54,1,0,2,100,2,62,7,70,11550,1,,,1,,,67,,,67,,,8,,,8,,,80,,,80,,,5450,,,5450,, 1970,2,265,,100,4.54,1,0,3,100,1,21,8,80,0,1,2,1,,,,67,62,67,,,,8,7,8,,,,80,70,80,,,,5450,11550,5450,,, 1970,2,266,,100,4.54,1,0,1,100,1,77,4,40,15750,1,,,2,,,77,,,74,,,4,,,6,,,40,,,60,,,15750,,,2250,, 1970,2,266,,100,4.54,1,0,2,100,2,74,6,60,2250,1,,,1,,,77,,,77,,,4,,,4,,,40,,,40,,,15750,,,15750,, 1970,2,267,,100,4.54,1,0,1,100,1,53,6,60,12050,1,,,2,,,53,,,52,,,6,,,6,,,60,,,60,,,12050,,,3550,, 1970,2,267,,100,4.54,1,0,2,100,2,52,6,60,3550,1,,,1,,,53,,,53,,,6,,,6,,,60,,,60,,,12050,,,12050,, 
1970,2,267,,100,4.54,1,0,3,100,2,20,8,80,2550,1,2,1,,,,53,52,53,,,,6,6,6,,,,60,60,60,,,,12050,3550,12050,,, 1970,2,268,,100,4.54,1,0,1,100,1,39,11,111,1650,1,,,2,,,39,,,37,,,11,,,10,,,111,,,100,,,1650,,,3350,, 1970,2,268,,100,4.54,1,0,2,100,2,37,10,100,3350,1,,,1,,,39,,,39,,,11,,,11,,,111,,,111,,,1650,,,1650,, 1970,2,268,,100,4.54,1,0,3,100,1,6,1,12,9999999,1,2,1,,,,39,37,39,,,,11,10,11,,,,111,100,111,,,,1650,3350,1650,,, 1970,2,269,,100,4.54,1,0,1,100,2,50,2,26,2850,2,,,,,,50,,,,,,2,,,,,,26,,,,,,2850,,,,, 1970,2,269,,100,4.54,1,0,2,100,2,17,3,30,2750,2,2,,,,,50,50,,,,,2,2,,,,,26,26,,,,,2850,2850,,,, 1970,2,269,,100,4.54,1,0,3,100,1,26,2,25,0,2,,,,,,50,,,,,,2,,,,,,26,,,,,,2850,,,,, 1970,2,269,,100,4.54,1,0,4,100,1,26,1,17,13250,2,,,,,,50,,,,,,2,,,,,,26,,,,,,2850,,,,, 1970,2,269,,100,4.54,1,0,5,100,1,26,1,15,13250,2,,,,,,50,,,,,,2,,,,,,26,,,,,,2850,,,,, 1970,2,270,,100,4.54,1,0,1,100,1,44,7,70,7150,1,,,2,,,44,,,48,,,7,,,8,,,70,,,80,,,7150,,,2650,, 1970,2,270,,100,4.54,1,0,2,100,2,48,8,80,2650,1,,,1,,,44,,,44,,,7,,,7,,,70,,,70,,,7150,,,7150,, 1970,2,270,,100,4.54,1,0,3,100,2,12,2,22,9999999,1,2,1,,,,44,48,44,,,,7,8,7,,,,70,80,70,,,,7150,2650,7150,,, 1970,2,270,,100,4.54,1,0,4,100,2,10,1,17,9999999,1,2,1,,,,44,48,44,,,,7,8,7,,,,70,80,70,,,,7150,2650,7150,,, 1970,2,271,,100,4.54,1,0,1,100,1,34,11,111,5550,1,,,2,,,34,,,29,,,11,,,11,,,111,,,111,,,5550,,,8250,, 1970,2,271,,100,4.54,1,0,2,100,2,29,11,111,8250,1,,,1,,,34,,,34,,,11,,,11,,,111,,,111,,,5550,,,5550,, 1970,2,271,,100,4.54,1,0,3,100,2,2,0,1,9999999,1,2,1,,,,34,29,34,,,,11,11,11,,,,111,111,111,,,,5550,8250,5550,,, 1970,2,272,,100,4.54,1,0,1,100,1,43,11,110,10850,1,,,2,,,43,,,34,,,11,,,11,,,110,,,111,,,10850,,,11550,, 1970,2,272,,100,4.54,1,0,2,100,2,34,11,111,11550,1,,,1,,,43,,,43,,,11,,,11,,,110,,,110,,,10850,,,10850,, 1970,2,272,,100,4.54,1,0,3,100,2,9,1,17,9999999,1,2,1,,,,43,34,43,,,,11,11,11,,,,110,111,110,,,,10850,11550,10850,,, 
1970,2,273,,100,4.54,1,0,1,100,2,28,2,26,4150,2,2,,,,,28,59,,,,,2,2,,,,,26,23,,,,,4150,0,,,, 1970,2,273,,100,4.54,1,0,2,100,1,7,1,14,9999999,2,2,,,,,28,28,,,,,2,2,,,,,26,26,,,,,4150,4150,,,, 1970,2,273,,100,4.54,1,0,3,100,2,59,2,23,0,2,,,,,,28,,,,,,2,,,,,,26,,,,,,4150,,,,, 1970,2,274,,100,4.54,1,0,1,100,1,38,2,26,5050,1,,,2,,,38,,,35,,,2,,,6,,,26,,,60,,,5050,,,4050,, 1970,2,274,,100,4.54,1,0,2,100,2,35,6,60,4050,1,,,1,,,38,,,38,,,2,,,2,,,26,,,26,,,5050,,,5050,, 1970,2,274,,100,4.54,1,0,3,100,2,15,3,30,350,1,2,1,,,,38,35,38,,,,2,6,2,,,,26,60,26,,,,5050,4050,5050,,, 1970,2,274,,100,4.54,1,0,4,100,2,14,2,25,0,1,2,1,,,,38,35,38,,,,2,6,2,,,,26,60,26,,,,5050,4050,5050,,, 1970,2,274,,100,4.54,1,0,5,100,1,9,1,15,9999999,1,2,1,,,,38,35,38,,,,2,6,2,,,,26,60,26,,,,5050,4050,5050,,, 1970,2,274,,100,4.54,1,0,6,100,1,8,1,14,9999999,1,2,1,,,,38,35,38,,,,2,6,2,,,,26,60,26,,,,5050,4050,5050,,, 1970,2,275,,100,4.54,1,0,1,100,1,72,1,14,0,1,,,,,,72,,,,,,1,,,,,,14,,,,,,0,,,,, 1970,2,276,,100,4.54,1,0,1,100,2,27,3,30,3250,2,,,,,,27,,,,,,3,,,,,,30,,,,,,3250,,,,, 1970,2,276,,100,4.54,1,0,2,100,2,9,1,16,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,276,,100,4.54,1,0,3,100,1,8,1,15,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,276,,100,4.54,1,0,4,100,1,7,1,14,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,276,,100,4.54,1,0,5,100,1,6,1,11,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,276,,100,4.54,1,0,6,100,1,4,0,2,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,276,,100,4.54,1,0,7,100,2,3,0,2,9999999,2,2,,,,,27,27,,,,,3,3,,,,,30,30,,,,,3250,3250,,,, 1970,2,277,,100,4.54,1,0,1,100,1,74,2,26,4450,1,,,2,,,74,,,58,,,2,,,2,,,26,,,26,,,4450,,,4450,, 1970,2,277,,100,4.54,1,0,2,100,2,58,2,26,4450,1,,,1,,,74,,,74,,,2,,,2,,,26,,,26,,,4450,,,4450,, 1970,2,278,,100,4.54,1,0,1,100,1,34,3,30,9350,1,,,2,,,34,,,24,,,3,,,6,,,30,,,60,,,9350,,,0,, 
1970,2,278,,100,4.54,1,0,2,100,2,24,6,60,0,1,,,1,,,34,,,34,,,3,,,3,,,30,,,30,,,9350,,,9350,, 1970,2,279,,100,4.54,1,0,1,100,2,64,6,60,7550,2,,,,,,64,,,,,,6,,,,,,60,,,,,,7550,,,,, 1970,2,280,,100,4.54,1,0,1,100,1,48,6,60,7550,1,,,,,,48,,,,,,6,,,,,,60,,,,,,7550,,,,, 1970,2,281,,100,4.54,1,0,1,100,1,47,11,110,9250,1,,,2,,,47,,,40,,,11,,,6,,,110,,,60,,,9250,,,4450,, 1970,2,281,,100,4.54,1,0,2,100,2,40,6,60,4450,1,,,1,,,47,,,47,,,11,,,11,,,110,,,110,,,9250,,,9250,, 1970,2,281,,100,4.54,1,0,3,100,1,16,4,40,0,1,2,1,,,,47,40,47,,,,11,6,11,,,,110,60,110,,,,9250,4450,9250,,, 1970,2,281,,100,4.54,1,0,4,100,1,12,2,22,9999999,1,2,1,,,,47,40,47,,,,11,6,11,,,,110,60,110,,,,9250,4450,9250,,, 1970,2,282,,100,4.54,1,0,1,100,1,36,2,26,5250,1,,,,,,36,,,,,,2,,,,,,26,,,,,,5250,,,,, 1970,2,282,,100,4.54,1,0,2,100,1,39,2,26,5050,1,,,,,,36,,,,,,2,,,,,,26,,,,,,5250,,,,, 1970,2,282,,100,4.54,1,0,3,100,2,77,2,23,650,1,,,,,,36,,,,,,2,,,,,,26,,,,,,5250,,,,, 1970,2,282,,100,4.54,1,0,4,100,1,47,4,40,0,1,,,,,,36,,,,,,2,,,,,,26,,,,,,5250,,,,, 1970,2,282,,100,4.54,1,0,5,100,2,14,2,26,2450,1,,1,,,,36,,47,,,,2,,4,,,,26,,40,,,,5250,,0,,, 1970,2,282,,100,4.54,1,0,6,100,2,10,1,16,9999999,1,,1,,,,36,,47,,,,2,,4,,,,26,,40,,,,5250,,0,,, 1970,2,282,,100,4.54,1,0,7,100,1,8,1,14,9999999,1,,1,,,,36,,47,,,,2,,4,,,,26,,40,,,,5250,,0,,, 1970,2,283,,100,4.54,1,0,1,100,2,23,8,80,7850,2,,,,,,23,,,,,,8,,,,,,80,,,,,,7850,,,,, 1970,2,283,,100,4.54,1,0,2,100,2,18,4,40,0,2,,,,,,23,,,,,,8,,,,,,80,,,,,,7850,,,,, 1970,2,284,,100,4.54,1,0,1,100,2,31,1,16,6250,2,,,,,,31,,,,,,1,,,,,,16,,,,,,6250,,,,, 1970,2,284,,100,4.54,1,0,2,100,1,5,0,2,9999999,2,2,,,,,31,31,,,,,1,1,,,,,16,16,,,,,6250,6250,,,, 1970,2,284,,100,4.54,1,0,3,100,2,4,0,2,9999999,2,2,,,,,31,31,,,,,1,1,,,,,16,16,,,,,6250,6250,,,, 1970,2,285,,100,4.54,1,0,1,100,1,49,2,26,9050,1,,,2,,,49,,,50,,,2,,,2,,,26,,,26,,,9050,,,1550,, 1970,2,285,,100,4.54,1,0,2,100,2,50,2,26,1550,1,,,1,,,49,,,49,,,2,,,2,,,26,,,26,,,9050,,,9050,, 
1970,2,286,,100,4.54,1,0,1,100,2,61,2,26,8450,2,,,,,,61,,,,,,2,,,,,,26,,,,,,8450,,,,, 1970,2,287,,100,4.54,1,0,1,100,2,14,2,26,0,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,287,,100,4.54,1,0,2,100,1,10,1,17,9999999,2,2,,,,,14,14,,,,,2,2,,,,,26,26,,,,,0,0,,,, 1970,2,287,,100,4.54,1,0,3,100,1,12,1,17,9999999,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,287,,100,4.54,1,0,4,100,1,9,1,16,9999999,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,287,,100,4.54,1,0,5,100,1,5,1,12,9999999,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,287,,100,4.54,1,0,6,100,1,48,2,25,6450,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,287,,100,4.54,1,0,7,100,2,2,0,1,9999999,2,,,,,,14,,,,,,2,,,,,,26,,,,,,0,,,,, 1970,2,288,,100,4.54,1,0,1,100,1,58,4,40,8050,1,,,2,,,58,,,53,,,4,,,6,,,40,,,60,,,8050,,,7050,, 1970,2,288,,100,4.54,1,0,2,100,2,53,6,60,7050,1,,,1,,,58,,,58,,,4,,,4,,,40,,,40,,,8050,,,8050,, 1970,2,289,,100,4.54,1,0,1,100,1,67,3,30,14750,1,,,,,,67,,,,,,3,,,,,,30,,,,,,14750,,,,, 1970,2,290,,100,4.54,1,0,1,100,1,73,2,25,1950,1,,,2,,,73,,,68,,,2,,,2,,,25,,,25,,,1950,,,650,, 1970,2,290,,100,4.54,1,0,2,100,2,68,2,25,650,1,2,,1,,,73,89,,73,,,2,2,,2,,,25,25,,25,,,1950,850,,1950,, 1970,2,290,,100,4.54,1,0,3,100,2,89,2,25,850,1,,,,,,73,,,,,,2,,,,,,25,,,,,,1950,,,,, 1970,2,291,,100,4.54,1,0,1,100,1,68,2,26,3450,1,,,2,,,68,,,65,,,2,,,4,,,26,,,40,,,3450,,,1750,, 1970,2,291,,100,4.54,1,0,2,100,2,65,4,40,1750,1,,,1,,,68,,,68,,,2,,,2,,,26,,,26,,,3450,,,3450,, 1970,2,292,,100,4.54,1,0,1,100,1,29,6,60,12050,1,,,2,,,29,,,28,,,6,,,6,,,60,,,60,,,12050,,,4050,, 1970,2,292,,100,4.54,1,0,2,100,2,28,6,60,4050,1,,,1,,,29,,,29,,,6,,,6,,,60,,,60,,,12050,,,12050,, 1970,2,292,,100,4.54,1,0,3,100,2,7,1,12,9999999,1,2,1,,,,29,28,29,,,,6,6,6,,,,60,60,60,,,,12050,4050,12050,,, 1970,2,292,,100,4.54,1,0,4,100,1,6,1,11,9999999,1,2,1,,,,29,28,29,,,,6,6,6,,,,60,60,60,,,,12050,4050,12050,,, 1970,2,292,,100,4.54,1,0,5,100,1,3,0,2,9999999,1,2,1,,,,29,28,29,,,,6,6,6,,,,60,60,60,,,,12050,4050,12050,,, 
1970,2,293,,100,4.54,1,0,1,100,2,63,6,60,9950,2,,,,,,63,,,,,,6,,,,,,60,,,,,,9950,,,,, 1970,2,294,,100,4.54,1,0,1,100,1,42,6,60,30150,1,,,2,,,42,,,39,,,6,,,7,,,60,,,70,,,30150,,,1050,, 1970,2,294,,100,4.54,1,0,2,100,2,39,7,70,1050,1,,,1,,,42,,,42,,,6,,,6,,,60,,,60,,,30150,,,30150,, 1970,2,294,,100,4.54,1,0,3,100,2,16,4,40,450,1,2,1,,,,42,39,42,,,,6,7,6,,,,60,70,60,,,,30150,1050,30150,,, 1970,2,294,,100,4.54,1,0,4,100,1,15,3,30,450,1,2,1,,,,42,39,42,,,,6,7,6,,,,60,70,60,,,,30150,1050,30150,,, 1970,2,294,,100,4.54,1,0,5,100,1,14,2,26,0,1,2,1,,,,42,39,42,,,,6,7,6,,,,60,70,60,,,,30150,1050,30150,,, 1970,2,294,,100,4.54,1,0,6,100,1,11,2,22,9999999,1,2,1,,,,42,39,42,,,,6,7,6,,,,60,70,60,,,,30150,1050,30150,,, 1970,2,294,,100,4.54,1,0,7,100,2,9,1,16,9999999,1,2,1,,,,42,39,42,,,,6,7,6,,,,60,70,60,,,,30150,1050,30150,,, 1970,2,295,,100,4.54,1,0,1,100,1,70,5,50,450,1,,,2,,,70,,,70,,,5,,,5,,,50,,,50,,,450,,,150,, 1970,2,295,,100,4.54,1,0,2,100,2,70,5,50,150,1,,,1,,,70,,,70,,,5,,,5,,,50,,,50,,,450,,,450,, 1970,2,296,,100,4.54,1,0,1,100,1,47,5,50,7050,1,,,2,,,47,,,44,,,5,,,5,,,50,,,50,,,7050,,,4550,, 1970,2,296,,100,4.54,1,0,2,100,2,44,5,50,4550,1,2,,1,,,47,71,,47,,,5,4,,5,,,50,40,,50,,,7050,650,,7050,, 1970,2,296,,100,4.54,1,0,3,100,2,71,4,40,650,1,,,,,,47,,,,,,5,,,,,,50,,,,,,7050,,,,, 1970,2,296,,100,4.54,1,0,4,100,1,17,5,50,550,1,2,1,,,,47,44,47,,,,5,5,5,,,,50,50,50,,,,7050,4550,7050,,, 1970,2,297,,100,4.54,1,0,1,100,1,33,10,100,17650,1,,,2,,,33,,,33,,,10,,,10,,,100,,,100,,,17650,,,0,, 1970,2,297,,100,4.54,1,0,2,100,2,33,10,100,0,1,,,1,,,33,,,33,,,10,,,10,,,100,,,100,,,17650,,,17650,, 1970,2,297,,100,4.54,1,0,3,100,1,8,1,15,9999999,1,2,1,,,,33,33,33,,,,10,10,10,,,,100,100,100,,,,17650,0,17650,,, 1970,2,297,,100,4.54,1,0,4,100,2,1,0,1,9999999,1,2,1,,,,33,33,33,,,,10,10,10,,,,100,100,100,,,,17650,0,17650,,, 1970,2,297,,100,4.54,1,0,5,100,2,4,1,11,9999999,1,2,1,,,,33,33,33,,,,10,10,10,,,,100,100,100,,,,17650,0,17650,,, 
1970,2,298,,100,4.54,1,0,1,100,1,49,2,26,14050,1,,,2,,,49,,,46,,,2,,,6,,,26,,,60,,,14050,,,7450,, 1970,2,298,,100,4.54,1,0,2,100,2,46,6,60,7450,1,,,1,,,49,,,49,,,2,,,2,,,26,,,26,,,14050,,,14050,, 1970,2,298,,100,4.54,1,0,3,100,2,13,2,23,9999999,1,2,1,,,,49,46,49,,,,2,6,2,,,,26,60,26,,,,14050,7450,14050,,, 1970,2,299,,100,4.54,1,0,1,100,1,47,6,60,15750,1,,,2,,,47,,,46,,,6,,,6,,,60,,,60,,,15750,,,4850,, 1970,2,299,,100,4.54,1,0,2,100,2,46,6,60,4850,1,,,1,,,47,,,47,,,6,,,6,,,60,,,60,,,15750,,,15750,, 1970,2,299,,100,4.54,1,0,3,100,2,18,6,65,2550,1,2,1,,,,47,46,47,,,,6,6,6,,,,60,60,60,,,,15750,4850,15750,,, 1970,2,300,,100,4.54,1,0,1,100,1,64,5,50,2450,1,,,2,,,64,,,63,,,5,,,2,,,50,,,22,,,2450,,,0,, 1970,2,300,,100,4.54,1,0,2,100,2,63,2,22,0,1,,,1,,,64,,,64,,,5,,,5,,,50,,,50,,,2450,,,2450,, 1970,2,301,,100,4.54,1,0,1,100,1,20,7,70,8050,1,,,2,,,20,,,21,,,7,,,7,,,70,,,70,,,8050,,,3050,, 1970,2,301,,100,4.54,1,0,2,100,2,21,7,70,3050,1,,,1,,,20,,,20,,,7,,,7,,,70,,,70,,,8050,,,8050,, 1970,2,301,,100,4.54,1,0,3,100,1,0,0,1,9999999,1,2,1,,,,20,21,20,,,,7,7,7,,,,70,70,70,,,,8050,3050,8050,,, 1970,2,302,,100,4.54,1,0,1,100,2,23,2,26,4450,2,,,,,,23,,,,,,2,,,,,,26,,,,,,4450,,,,, 1970,2,303,,100,4.54,1,0,1,100,1,45,6,60,11750,1,,,2,,,45,,,41,,,6,,,6,,,60,,,60,,,11750,,,0,, 1970,2,303,,100,4.54,1,0,2,100,2,41,6,60,0,1,2,1,1,,,45,67,71,45,,,6,2,2,6,,,60,26,23,60,,,11750,1450,3350,11750,, 1970,2,303,,100,4.54,1,0,3,100,2,21,9,90,550,1,2,1,,,,45,41,45,,,,6,6,6,,,,60,60,60,,,,11750,0,11750,,, 1970,2,303,,100,4.54,1,0,4,100,2,17,5,50,550,1,2,1,,,,45,41,45,,,,6,6,6,,,,60,60,60,,,,11750,0,11750,,, 1970,2,303,,100,4.54,1,0,5,100,1,15,3,30,0,1,2,1,,,,45,41,45,,,,6,6,6,,,,60,60,60,,,,11750,0,11750,,, 1970,2,303,,100,4.54,1,0,6,100,1,71,2,23,3350,1,,,2,,,45,,,67,,,6,,,2,,,60,,,26,,,11750,,,1450,, 1970,2,303,,100,4.54,1,0,7,100,2,67,2,26,1450,1,,,1,,,45,,,71,,,6,,,2,,,60,,,23,,,11750,,,3350,, 1970,2,304,,100,4.54,1,0,1,100,1,75,2,26,3650,1,,,2,,,75,,,75,,,2,,,2,,,26,,,26,,,3650,,,850,, 
1970,2,304,,100,4.54,1,0,2,100,2,75,2,26,850,1,,,1,,,75,,,75,,,2,,,2,,,26,,,26,,,3650,,,3650,, 1970,2,305,,100,4.54,1,0,1,100,1,38,6,60,10050,1,,,2,,,38,,,38,,,6,,,6,,,60,,,60,,,10050,,,0,, 1970,2,305,,100,4.54,1,0,2,100,2,38,6,60,0,1,,,1,,,38,,,38,,,6,,,6,,,60,,,60,,,10050,,,10050,, 1970,2,305,,100,4.54,1,0,3,100,1,11,2,22,9999999,1,2,1,,,,38,38,38,,,,6,6,6,,,,60,60,60,,,,10050,0,10050,,, 1970,2,305,,100,4.54,1,0,4,100,1,9,1,16,9999999,1,2,1,,,,38,38,38,,,,6,6,6,,,,60,60,60,,,,10050,0,10050,,, 1970,2,306,,100,4.54,1,0,1,100,1,47,3,30,11050,1,,,2,,,47,,,40,,,3,,,6,,,30,,,60,,,11050,,,0,, 1970,2,306,,100,4.54,1,0,2,100,2,40,6,60,0,1,,,1,,,47,,,47,,,3,,,3,,,30,,,30,,,11050,,,11050,, 1970,2,306,,100,4.54,1,0,3,100,1,16,3,30,0,1,2,1,,,,47,40,47,,,,3,6,3,,,,30,60,30,,,,11050,0,11050,,, 1970,2,306,,100,4.54,1,0,4,100,2,9,1,16,9999999,1,2,1,,,,47,40,47,,,,3,6,3,,,,30,60,30,,,,11050,0,11050,,, 1970,2,306,,100,4.54,1,0,5,100,2,7,1,14,9999999,1,2,1,,,,47,40,47,,,,3,6,3,,,,30,60,30,,,,11050,0,11050,,, 1970,2,307,,100,4.54,1,0,1,100,1,35,6,60,150,1,2,,,,,35,73,,,,,6,2,,,,,60,26,,,,,150,1250,,,, 1970,2,307,,100,4.54,1,0,2,100,2,73,2,26,1250,1,,,,,,35,,,,,,6,,,,,,60,,,,,,150,,,,, 1970,2,308,,100,4.54,1,0,1,100,1,77,2,22,4050,1,,,,,,77,,,,,,2,,,,,,22,,,,,,4050,,,,, 1970,2,308,,100,4.54,1,0,2,100,1,50,4,40,650,1,,1,,,,77,,77,,,,2,,2,,,,22,,22,,,,4050,,4050,,, 1970,2,308,,100,4.54,1,0,3,100,1,41,2,26,6550,1,,1,,,,77,,77,,,,2,,2,,,,22,,22,,,,4050,,4050,,, 1970,2,309,,100,4.54,1,0,1,100,1,61,5,50,3550,1,,,,,,61,,,,,,5,,,,,,50,,,,,,3550,,,,, 1970,2,309,,100,4.54,1,0,2,100,1,17,5,50,550,1,,1,,,,61,,61,,,,5,,5,,,,50,,50,,,,3550,,3550,,, 1970,2,310,,100,4.54,1,0,1,100,1,56,6,60,7050,1,,,2,,,56,,,61,,,6,,,6,,,60,,,60,,,7050,,,0,, 1970,2,310,,100,4.54,1,0,2,100,2,61,6,60,0,1,,,1,,,56,,,56,,,6,,,6,,,60,,,60,,,7050,,,7050,, 1970,2,311,,100,4.54,1,0,1,100,1,43,11,110,11850,1,,,2,,,43,,,42,,,11,,,10,,,110,,,100,,,11850,,,250,, 
1970,2,311,,100,4.54,1,0,2,100,2,42,10,100,250,1,,,1,,,43,,,43,,,11,,,11,,,110,,,110,,,11850,,,11850,, 1970,2,312,,100,4.54,1,0,1,100,2,26,6,60,4050,2,,,,,,26,,,,,,6,,,,,,60,,,,,,4050,,,,, 1970,2,312,,100,4.54,1,0,2,100,1,9,1,15,9999999,2,2,,,,,26,26,,,,,6,6,,,,,60,60,,,,,4050,4050,,,, 1970,2,312,,100,4.54,1,0,3,100,1,7,1,12,9999999,2,2,,,,,26,26,,,,,6,6,,,,,60,60,,,,,4050,4050,,,, 1970,2,313,,100,4.54,1,0,1,100,1,22,10,100,0,1,,,,,,22,,,,,,10,,,,,,100,,,,,,0,,,,, 1970,2,314,,100,4.54,1,0,1,100,1,27,6,60,7050,1,,,,,,27,,,,,,6,,,,,,60,,,,,,7050,,,,, 1970,2,315,,100,4.54,3,0,1,100,2,61,8,80,150,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1970,2,316,,100,4.54,1,0,1,100,1,62,4,40,12050,1,,,2,,,62,,,61,,,4,,,4,,,40,,,40,,,12050,,,1350,, 1970,2,316,,100,4.54,1,0,2,100,2,61,4,40,1350,1,,,1,,,62,,,62,,,4,,,4,,,40,,,40,,,12050,,,12050,, 1970,2,317,,100,4.54,1,0,1,100,2,45,8,80,5050,2,,,,,,45,,,,,,8,,,,,,80,,,,,,5050,,,,, 1970,2,318,,100,4.54,1,0,1,100,2,23,8,80,3050,2,,,,,,23,,,,,,8,,,,,,80,,,,,,3050,,,,, 1970,2,318,,100,4.54,1,0,2,100,1,2,0,1,9999999,2,2,,,,,23,23,,,,,8,8,,,,,80,80,,,,,3050,3050,,,, 1970,2,319,,100,4.54,1,0,1,100,1,43,10,100,22050,1,,,2,,,43,,,37,,,10,,,10,,,100,,,100,,,22050,,,150,, 1970,2,319,,100,4.54,1,0,2,100,2,37,10,100,150,1,,,1,,,43,,,43,,,10,,,10,,,100,,,100,,,22050,,,22050,, 1970,2,319,,100,4.54,1,0,3,100,2,14,2,26,0,1,2,1,,,,43,37,43,,,,10,10,10,,,,100,100,100,,,,22050,150,22050,,, 1970,2,319,,100,4.54,1,0,4,100,1,11,1,17,9999999,1,2,1,,,,43,37,43,,,,10,10,10,,,,100,100,100,,,,22050,150,22050,,, 1970,2,320,,100,4.54,1,0,1,100,2,79,2,26,750,2,,,,,,79,,,,,,2,,,,,,26,,,,,,750,,,,, 1970,2,321,,100,4.54,1,0,1,100,1,24,6,60,5050,1,,,2,,,24,,,21,,,6,,,6,,,60,,,60,,,5050,,,4050,, 1970,2,321,,100,4.54,1,0,2,100,2,21,6,60,4050,1,,,1,,,24,,,24,,,6,,,6,,,60,,,60,,,5050,,,5050,, 1970,2,321,,100,4.54,1,0,3,100,1,0,0,1,9999999,1,2,1,,,,24,21,24,,,,6,6,6,,,,60,60,60,,,,5050,4050,5050,,, 1970,2,322,,100,4.54,1,0,1,100,2,60,6,60,6550,2,,,,,,60,,,,,,6,,,,,,60,,,,,,6550,,,,, 
1970,2,322,,100,4.54,1,0,2,100,1,24,6,60,1350,2,2,,,,,60,60,,,,,6,6,,,,,60,60,,,,,6550,6550,,,, 1970,2,323,,100,4.54,1,0,1,100,1,77,7,70,10050,1,,,2,,,77,,,71,,,7,,,3,,,70,,,30,,,10050,,,0,, 1970,2,323,,100,4.54,1,0,2,100,2,71,3,30,0,1,,,1,,,77,,,77,,,7,,,7,,,70,,,70,,,10050,,,10050,, 1970,2,324,,100,4.54,1,0,1,100,1,66,3,30,8050,1,,,2,,,66,,,60,,,3,,,5,,,30,,,50,,,8050,,,5050,, 1970,2,324,,100,4.54,1,0,2,100,2,60,5,50,5050,1,,,1,,,66,,,66,,,3,,,3,,,30,,,30,,,8050,,,8050,, 1970,2,325,,100,4.54,1,0,1,100,1,40,2,25,7050,1,,,2,,,40,,,34,,,2,,,4,,,25,,,40,,,7050,,,6050,, 1970,2,325,,100,4.54,1,0,2,100,2,34,4,40,6050,1,,,1,,,40,,,40,,,2,,,2,,,25,,,25,,,7050,,,7050,, 1970,2,325,,100,4.54,1,0,3,100,2,16,3,30,3850,1,2,1,,,,40,34,40,,,,2,4,2,,,,25,40,25,,,,7050,6050,7050,,, 1970,2,325,,100,4.54,1,0,4,100,1,15,2,25,0,1,2,1,,,,40,34,40,,,,2,4,2,,,,25,40,25,,,,7050,6050,7050,,, 1970,2,325,,100,4.54,1,0,5,100,2,13,2,25,9999999,1,2,1,,,,40,34,40,,,,2,4,2,,,,25,40,25,,,,7050,6050,7050,,, 1970,2,325,,100,4.54,1,0,6,100,2,13,2,22,9999999,1,2,1,,,,40,34,40,,,,2,4,2,,,,25,40,25,,,,7050,6050,7050,,, 1970,2,325,,100,4.54,1,0,7,100,2,0,0,1,9999999,1,2,1,,,,40,34,40,,,,2,4,2,,,,25,40,25,,,,7050,6050,7050,,, 1970,2,326,,100,4.54,1,0,1,100,1,37,6,60,4850,1,,,2,,,37,,,40,,,6,,,5,,,60,,,50,,,4850,,,0,, 1970,2,326,,100,4.54,1,0,2,100,2,40,5,50,0,1,,,1,,,37,,,37,,,6,,,6,,,60,,,60,,,4850,,,4850,, 1970,2,326,,100,4.54,1,0,3,100,2,19,6,60,2650,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 1970,2,326,,100,4.54,1,0,4,100,2,17,5,50,0,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 1970,2,326,,100,4.54,1,0,5,100,2,15,2,26,0,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 1970,2,326,,100,4.54,1,0,6,100,1,14,2,25,0,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 1970,2,326,,100,4.54,1,0,7,100,2,13,2,23,9999999,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 
1970,2,326,,100,4.54,1,0,8,100,2,12,2,22,9999999,1,2,1,,,,37,40,37,,,,6,5,6,,,,60,50,60,,,,4850,0,4850,,, 1970,2,326,,100,4.54,1,0,9,100,1,1,0,1,9999999,1,2,,,,,37,19,,,,,6,6,,,,,60,60,,,,,4850,2650,,,, 1970,2,327,,100,4.54,1,0,1,100,2,53,2,23,550,2,,,,,,53,,,,,,2,,,,,,23,,,,,,550,,,,, 1970,2,329,,100,4.54,1,0,1,100,1,33,1,17,7050,1,,,2,,,33,,,27,,,1,,,2,,,17,,,23,,,7050,,,2050,, 1970,2,329,,100,4.54,1,0,2,100,2,27,2,23,2050,1,,,1,,,33,,,33,,,1,,,1,,,17,,,17,,,7050,,,7050,, 1970,2,329,,100,4.54,1,0,3,100,2,0,0,1,9999999,1,2,1,,,,33,27,33,,,,1,2,1,,,,17,23,17,,,,7050,2050,7050,,, 1970,2,330,,100,4.54,1,0,1,100,1,29,3,30,9550,1,,,2,,,29,,,24,,,3,,,3,,,30,,,30,,,9550,,,0,, 1970,2,330,,100,4.54,1,0,2,100,2,24,3,30,0,1,,,1,,,29,,,29,,,3,,,3,,,30,,,30,,,9550,,,9550,, 1970,2,330,,100,4.54,1,0,3,100,2,3,0,2,9999999,1,2,1,,,,29,24,29,,,,3,3,3,,,,30,30,30,,,,9550,0,9550,,, 1970,2,330,,100,4.54,1,0,4,100,2,2,0,1,9999999,1,2,1,,,,29,24,29,,,,3,3,3,,,,30,30,30,,,,9550,0,9550,,, 1970,2,330,,100,4.54,1,0,5,100,1,0,0,1,9999999,1,2,1,,,,29,24,29,,,,3,3,3,,,,30,30,30,,,,9550,0,9550,,, 1970,2,330,,100,4.54,1,0,6,100,1,0,0,1,9999999,1,2,1,,,,29,24,29,,,,3,3,3,,,,30,30,30,,,,9550,0,9550,,, 1970,2,331,,100,4.54,1,0,1,100,2,45,1,17,0,2,,,,,,45,,,,,,1,,,,,,17,,,,,,0,,,,, 1970,2,331,,100,4.54,1,0,2,100,1,45,1,16,6150,2,,,,,,45,,,,,,1,,,,,,17,,,,,,0,,,,, 1970,2,331,,100,4.54,1,0,3,100,2,20,2,22,4150,2,2,,,,,45,45,,,,,1,1,,,,,17,17,,,,,0,0,,,, 1970,2,332,,100,4.54,1,0,1,100,1,39,6,60,6550,1,,,,,,39,,,,,,6,,,,,,60,,,,,,6550,,,,, 1970,2,333,,100,4.54,1,0,1,100,1,19,6,65,1750,1,,,,,,19,,,,,,6,,,,,,65,,,,,,1750,,,,, 1970,2,333,,100,4.54,1,0,2,100,1,18,7,70,2050,1,,,,,,19,,,,,,6,,,,,,65,,,,,,1750,,,,, 1970,2,333,,100,4.54,1,0,3,100,1,21,9,90,1950,1,,,,,,19,,,,,,6,,,,,,65,,,,,,1750,,,,, 1970,2,334,,100,4.54,1,0,1,100,1,40,3,30,9450,1,,,2,,,40,,,42,,,3,,,2,,,30,,,26,,,9450,,,0,, 1970,2,334,,100,4.54,1,0,2,100,2,42,2,26,0,1,,,1,,,40,,,40,,,3,,,3,,,30,,,30,,,9450,,,9450,, 
1970,2,336,,100,4.54,1,0,1,100,2,29,5,50,3750,2,,,,,,29,,,,,,5,,,,,,50,,,,,,3750,,,,, 1970,2,336,,100,4.54,1,0,2,100,2,5,0,2,9999999,2,2,,,,,29,29,,,,,5,5,,,,,50,50,,,,,3750,3750,,,, 1970,2,336,,100,4.54,1,0,3,100,2,4,0,2,9999999,2,2,,,,,29,29,,,,,5,5,,,,,50,50,,,,,3750,3750,,,, 1970,2,337,,100,4.54,1,0,1,100,1,48,11,110,15050,1,,,2,,,48,,,49,,,11,,,2,,,110,,,23,,,15050,,,0,, 1970,2,337,,100,4.54,1,0,2,100,2,49,2,23,0,1,,,1,,,48,,,48,,,11,,,11,,,110,,,110,,,15050,,,15050,, 1970,2,337,,100,4.54,1,0,3,100,1,18,6,60,1750,1,2,1,,,,48,49,48,,,,11,2,11,,,,110,23,110,,,,15050,0,15050,,, 1970,2,337,,100,4.54,1,0,4,100,1,17,5,50,1050,1,2,1,,,,48,49,48,,,,11,2,11,,,,110,23,110,,,,15050,0,15050,,, 1970,2,337,,100,4.54,1,0,5,100,1,14,2,25,50,1,2,1,,,,48,49,48,,,,11,2,11,,,,110,23,110,,,,15050,0,15050,,, 1970,2,337,,100,4.54,1,0,6,100,2,11,2,22,9999999,1,2,1,,,,48,49,48,,,,11,2,11,,,,110,23,110,,,,15050,0,15050,,, 1970,2,337,,100,4.54,1,0,7,100,1,10,1,17,9999999,1,2,1,,,,48,49,48,,,,11,2,11,,,,110,23,110,,,,15050,0,15050,,, 1970,2,338,,100,4.54,1,0,1,100,1,57,9,90,10050,1,,,2,,,57,,,49,,,9,,,6,,,90,,,60,,,10050,,,250,, 1970,2,338,,100,4.54,1,0,2,100,2,49,6,60,250,1,,,1,,,57,,,57,,,9,,,9,,,90,,,90,,,10050,,,10050,, 1970,2,338,,100,4.54,1,0,3,100,1,16,4,40,1450,1,2,1,,,,57,49,57,,,,9,6,9,,,,90,60,90,,,,10050,250,10050,,, 1970,2,338,,100,4.54,1,0,4,100,2,14,2,26,0,1,2,1,,,,57,49,57,,,,9,6,9,,,,90,60,90,,,,10050,250,10050,,, 1970,2,338,,100,4.54,1,0,5,100,1,11,2,22,9999999,1,2,1,,,,57,49,57,,,,9,6,9,,,,90,60,90,,,,10050,250,10050,,, 1970,2,338,,100,4.54,1,0,6,100,1,6,1,12,9999999,1,2,1,,,,57,49,57,,,,9,6,9,,,,90,60,90,,,,10050,250,10050,,, 1970,2,339,,100,4.54,1,0,1,100,1,30,6,60,12050,1,,,2,,,30,,,28,,,6,,,6,,,60,,,60,,,12050,,,0,, 1970,2,339,,100,4.54,1,0,2,100,2,28,6,60,0,1,,,1,,,30,,,30,,,6,,,6,,,60,,,60,,,12050,,,12050,, 1970,2,339,,100,4.54,1,0,3,100,1,7,1,14,9999999,1,2,1,,,,30,28,30,,,,6,6,6,,,,60,60,60,,,,12050,0,12050,,, 
1970,2,339,,100,4.54,1,0,4,100,1,5,1,11,9999999,1,2,1,,,,30,28,30,,,,6,6,6,,,,60,60,60,,,,12050,0,12050,,, 1970,2,339,,100,4.54,1,0,5,100,2,0,0,1,9999999,1,2,1,,,,30,28,30,,,,6,6,6,,,,60,60,60,,,,12050,0,12050,,, 1970,2,340,,100,4.54,1,0,1,100,1,57,3,30,5250,1,,,2,,,57,,,62,,,3,,,2,,,30,,,26,,,5250,,,0,, 1970,2,340,,100,4.54,1,0,2,100,2,62,2,26,0,1,,,1,,,57,,,57,,,3,,,3,,,30,,,30,,,5250,,,5250,, 1970,2,341,,100,4.54,1,0,1,100,1,73,1,16,1750,1,,,2,,,73,,,67,,,1,,,1,,,16,,,16,,,1750,,,750,, 1970,2,341,,100,4.54,1,0,2,100,2,67,1,16,750,1,,,1,,,73,,,73,,,1,,,1,,,16,,,16,,,1750,,,1750,, 1970,2,342,,100,4.54,1,0,1,100,1,29,6,60,7050,1,,,2,,,29,,,21,,,6,,,5,,,60,,,50,,,7050,,,3350,, 1970,2,342,,100,4.54,1,0,2,100,2,21,5,50,3350,1,,,1,,,29,,,29,,,6,,,6,,,60,,,60,,,7050,,,7050,, 1970,2,342,,100,4.54,1,0,3,100,1,0,0,1,9999999,1,2,1,,,,29,21,29,,,,6,5,6,,,,60,50,60,,,,7050,3350,7050,,, 1970,2,343,,100,4.54,1,0,1,100,1,36,11,111,24250,1,,,2,,,36,,,30,,,11,,,10,,,111,,,100,,,24250,,,7050,, 1970,2,343,,100,4.54,1,0,2,100,2,30,10,100,7050,1,,,1,,,36,,,36,,,11,,,11,,,111,,,111,,,24250,,,24250,, 1970,2,343,,100,4.54,1,0,3,100,2,7,1,14,9999999,1,2,1,,,,36,30,36,,,,11,10,11,,,,111,100,111,,,,24250,7050,24250,,, 1970,2,343,,100,4.54,1,0,4,100,2,3,0,2,9999999,1,2,1,,,,36,30,36,,,,11,10,11,,,,111,100,111,,,,24250,7050,24250,,, 1970,2,344,,100,4.54,1,0,1,100,1,24,11,111,250,1,,,,,,24,,,,,,11,,,,,,111,,,,,,250,,,,, 1970,2,344,,100,4.54,1,0,2,100,1,6,1,12,9999999,1,,1,,,,24,,24,,,,11,,11,,,,111,,111,,,,250,,250,,, 1970,2,344,,100,4.54,1,0,3,100,1,8,1,15,9999999,1,,1,,,,24,,24,,,,11,,11,,,,111,,111,,,,250,,250,,, 1970,2,345,,100,4.54,1,0,1,100,1,58,8,80,11850,1,,,2,,,58,,,57,,,8,,,6,,,80,,,60,,,11850,,,6450,, 1970,2,345,,100,4.54,1,0,2,100,2,57,6,60,6450,1,,,1,,,58,,,58,,,8,,,8,,,80,,,80,,,11850,,,11850,, 1970,2,345,,100,4.54,1,0,3,100,2,28,7,70,6150,1,2,1,,,,58,57,58,,,,8,6,8,,,,80,60,80,,,,11850,6450,11850,,, 
1970,2,346,,100,4.54,1,0,1,100,1,39,6,60,12050,1,,,2,,,39,,,33,,,6,,,6,,,60,,,60,,,12050,,,3050,, 1970,2,346,,100,4.54,1,0,2,100,2,33,6,60,3050,1,,,1,,,39,,,39,,,6,,,6,,,60,,,60,,,12050,,,12050,, 1970,2,346,,100,4.54,1,0,3,100,1,16,3,30,0,1,2,1,,,,39,33,39,,,,6,6,6,,,,60,60,60,,,,12050,3050,12050,,, 1970,2,346,,100,4.54,1,0,4,100,1,14,2,25,0,1,2,1,,,,39,33,39,,,,6,6,6,,,,60,60,60,,,,12050,3050,12050,,, 1970,2,346,,100,4.54,1,0,5,100,1,4,1,11,9999999,1,2,1,,,,39,33,39,,,,6,6,6,,,,60,60,60,,,,12050,3050,12050,,, 1970,2,347,,100,4.54,1,0,1,100,1,38,11,111,46550,1,,,2,,,38,,,33,,,11,,,6,,,111,,,60,,,46550,,,0,, 1970,2,347,,100,4.54,1,0,2,100,2,33,6,60,0,1,,,1,,,38,,,38,,,11,,,11,,,111,,,111,,,46550,,,46550,, 1970,2,347,,100,4.54,1,0,3,100,1,10,1,16,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,347,,100,4.54,1,0,4,100,1,9,1,15,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,347,,100,4.54,1,0,5,100,2,7,1,14,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,347,,100,4.54,1,0,6,100,1,5,1,11,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,347,,100,4.54,1,0,7,100,2,4,0,2,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,347,,100,4.54,1,0,8,100,1,1,0,1,9999999,1,2,1,,,,38,33,38,,,,11,6,11,,,,111,60,111,,,,46550,0,46550,,, 1970,2,348,,100,4.54,1,0,1,100,2,57,8,80,9050,2,,,,,,57,,,,,,8,,,,,,80,,,,,,9050,,,,, ================================================ FILE: examples/data/nyc-taxi_1k.csv ================================================ 1460000001,2,2017-12-15 00:00:28,2017-12-15 00:15:43,N,1,,,,,2,1.50,11,0.5,0.5,1.25,0,,0.3,13.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000002,2,2017-12-15 00:33:12,2017-12-15 00:51:04,N,1,,,,,3,2.53,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000003,2,2017-12-15 00:56:59,2017-12-15 
00:59:51,N,1,,,,,3,0.06,3.5,0.5,0.5,0,0,,0.3,4.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000004,2,2017-12-15 00:09:19,2017-12-15 00:18:54,N,1,,,,,5,1.47,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000005,2,2017-12-15 00:21:02,2017-12-15 00:25:30,N,1,,,,,5,0.23,4.5,0.5,0.5,1.16,0,,0.3,6.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000006,2,2017-12-15 00:35:38,2017-12-15 01:26:19,N,1,,,,,5,18.48,58,0.5,0.5,9.76,5.76,,0.3,76.77,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000007,1,2017-12-15 00:48:29,2017-12-15 01:01:42,N,1,,,,,1,2.60,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000008,2,2017-12-15 00:09:01,2017-12-15 00:11:57,N,1,,,,,1,0.85,4.5,0.5,0.5,0,0,,0.3,5.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000009,2,2017-12-15 00:32:00,2017-12-15 00:35:42,N,1,,,,,1,0.48,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000010,2,2017-12-15 00:42:13,2017-12-15 00:54:48,N,1,,,,,1,2.09,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000011,2,2017-12-15 00:05:46,2017-12-15 00:09:49,N,1,,,,,3,0.60,4.5,0.5,0.5,0.87,0,,0.3,6.67,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000012,2,2017-12-15 00:10:48,2017-12-15 00:20:24,N,1,,,,,3,0.97,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000013,2,2017-12-15 00:26:39,2017-12-15 00:39:18,N,1,,,,,3,1.82,9.5,0.5,0.5,3,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000014,2,2017-12-15 00:40:49,2017-12-15 01:16:48,N,1,,,,,3,8.99,31,0.5,0.5,6.46,0,,0.3,38.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000015,2,2017-12-15 00:24:04,2017-12-15 00:34:56,N,1,,,,,1,5.04,16,0.5,0.5,5.19,0,,0.3,22.49,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000016,1,2017-12-15 00:06:12,2017-12-15 
00:35:33,N,1,,,,,1,5.30,21.5,0.5,0.5,1.5,0,,0.3,24.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000017,1,2017-12-15 00:38:38,2017-12-15 00:54:18,N,1,,,,,1,0.90,10.5,0.5,0.5,1,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000018,1,2017-12-15 00:33:57,2017-12-15 00:47:58,N,1,,,,,1,2.10,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000019,1,2017-12-15 00:00:23,2017-12-15 00:13:38,N,1,,,,,1,1.40,9.5,0.5,0.5,1.5,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000020,1,2017-12-15 00:21:30,2017-12-15 00:31:23,N,1,,,,,1,1.50,8,0.5,0.5,2.3,0,,0.3,11.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000021,1,2017-12-15 00:47:17,2017-12-15 01:02:26,N,1,,,,,1,2.80,12,0.5,0.5,1,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000022,1,2017-12-15 00:22:56,2017-12-15 00:29:31,N,1,,,,,1,1.50,7,0.5,0.5,3.5,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000023,1,2017-12-15 00:44:16,2017-12-15 00:50:13,N,1,,,,,1,0.90,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000024,2,2017-12-15 00:21:58,2017-12-15 00:44:00,N,1,,,,,1,13.41,37,0.5,0.5,0,0,,0.3,38.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000025,2,2017-12-15 00:18:22,2017-12-15 00:33:48,N,1,,,,,1,1.71,11,0.5,0.5,0,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000026,2,2017-12-15 00:39:06,2017-12-15 00:52:25,N,1,,,,,1,1.01,9,0.5,0.5,1.54,0,,0.3,11.84,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000027,2,2017-12-15 01:00:32,2017-12-15 01:20:29,N,1,,,,,1,3.69,16,0.5,0.5,2.6,0,,0.3,19.9,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000028,1,2017-12-15 00:09:13,2017-12-15 00:25:51,N,1,,,,,1,1.90,12,0.5,0.5,1,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000029,1,2017-12-15 00:27:17,2017-12-15 
00:35:39,N,1,,,,,1,1.00,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000030,1,2017-12-15 00:37:01,2017-12-15 00:41:47,N,1,,,,,1,0.60,5,0.5,0.5,1.25,0,,0.3,7.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000031,1,2017-12-15 00:44:51,2017-12-15 00:52:31,N,1,,,,,1,1.80,8,0.5,0.5,1,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000032,1,2017-12-15 00:58:04,2017-12-15 01:42:27,N,1,,,,,1,12.10,41,0.5,0.5,2,0,,0.3,44.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000033,1,2017-12-15 00:04:01,2017-12-15 00:20:26,N,1,,,,,1,1.70,12,0.5,0.5,0,0,,0.3,13.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000034,1,2017-12-15 00:21:36,2017-12-15 00:23:09,N,1,,,,,1,0.20,3,0.5,0.5,0,0,,0.3,4.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000035,1,2017-12-15 00:24:05,2017-12-15 00:38:33,N,1,,,,,1,3.10,12.5,0.5,0.5,0,0,,0.3,13.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000036,1,2017-12-15 00:49:14,2017-12-15 01:05:23,N,1,,,,,1,3.90,15,0.5,0.5,3.25,0,,0.3,19.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000037,1,2017-12-15 00:19:03,2017-12-15 00:44:02,N,1,,,,,1,3.50,17,0.5,0.5,2.75,0,,0.3,21.05,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000038,1,2017-12-15 00:47:45,2017-12-15 01:01:16,N,1,,,,,1,4.90,16.5,0.5,0.5,0,0,,0.3,17.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000039,2,2017-12-15 00:23:06,2017-12-15 00:24:06,N,1,,,,,1,0.36,3.5,0.5,0.5,0,0,,0.3,4.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000040,2,2017-12-15 00:27:28,2017-12-15 00:30:57,N,1,,,,,1,0.99,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000041,2,2017-12-15 00:18:20,2017-12-15 00:41:29,N,1,,,,,1,3.74,18,0.5,0.5,3.86,0,,0.3,23.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000042,1,2017-12-15 00:08:31,2017-12-15 
00:11:07,N,1,,,,,1,0.50,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000043,1,2017-12-15 00:13:03,2017-12-15 00:33:47,N,1,,,,,1,7.20,22.5,0.5,0.5,0,5.76,,0.3,29.56,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000044,2,2017-12-15 00:30:10,2017-12-15 00:45:02,N,1,,,,,1,3.02,12.5,0.5,0.5,0,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000045,2,2017-12-15 00:49:42,2017-12-15 01:13:51,N,1,,,,,1,5.27,20,0.5,0.5,2,0,,0.3,23.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000046,2,2017-12-15 00:05:58,2017-12-15 00:25:43,N,1,,,,,1,4.15,17,0.5,0.5,4.58,0,,0.3,22.88,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000047,2,2017-12-15 00:37:28,2017-12-15 00:42:26,N,1,,,,,1,1.00,5.5,0.5,0.5,1.36,0,,0.3,8.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000048,1,2017-12-15 00:15:17,2017-12-15 00:37:01,N,1,,,,,1,4.00,16.5,0.5,0.5,3.55,0,,0.3,21.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000049,1,2017-12-15 00:42:53,2017-12-15 00:59:39,N,1,,,,,1,3.30,14.5,0.5,0.5,3.15,0,,0.3,18.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000050,2,2017-12-15 00:33:52,2017-12-15 00:40:49,N,1,,,,,1,0.40,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000051,2,2017-12-15 00:46:35,2017-12-15 00:56:49,N,1,,,,,1,1.26,8.5,0.5,0.5,2,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000052,1,2017-12-15 00:06:04,2017-12-15 00:31:04,N,1,,,,,1,4.50,20,0.5,0.5,4.26,0,,0.3,25.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000053,2,2017-12-15 00:30:11,2017-12-15 00:34:09,N,1,,,,,3,1.14,5.5,0.5,0.5,2.04,0,,0.3,8.84,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000054,2,2017-12-15 00:45:10,2017-12-15 01:11:33,N,1,,,,,5,10.50,31.5,0.5,0.5,0,5.76,,0.3,38.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000055,2,2017-12-15 00:31:27,2017-12-15 
00:53:23,N,1,,,,,2,4.40,18,0.5,0.5,4.82,0,,0.3,24.12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000056,2,2017-12-15 00:23:15,2017-12-15 00:33:59,N,1,,,,,1,1.26,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000057,1,2017-12-15 00:18:49,2017-12-15 00:25:53,N,1,,,,,2,1.10,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000058,1,2017-12-15 00:28:53,2017-12-15 00:35:26,N,1,,,,,1,0.80,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000059,1,2017-12-15 00:36:04,2017-12-15 00:58:54,N,1,,,,,1,4.00,17,0.5,0.5,3.65,0,,0.3,21.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000060,1,2017-12-15 00:52:09,2017-12-15 00:59:47,N,1,,,,,1,1.00,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000061,2,2017-12-15 00:07:57,2017-12-15 00:25:09,N,1,,,,,1,2.79,13,0.5,0.5,0,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000062,2,2017-12-15 00:26:09,2017-12-15 00:29:52,N,1,,,,,1,0.68,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000063,2,2017-12-15 00:31:59,2017-12-15 00:40:18,N,1,,,,,1,1.31,7.5,0.5,0.5,1.25,0,,0.3,10.05,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000064,2,2017-12-15 00:49:06,2017-12-15 01:07:13,N,1,,,,,1,4.21,15.5,0.5,0.5,3.36,0,,0.3,20.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000065,2,2017-12-15 00:56:03,2017-12-15 01:37:49,N,1,,,,,1,22.40,62.5,0.5,0.5,17.39,5.76,,0.3,86.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000066,1,2017-12-15 00:01:19,2017-12-15 00:12:58,N,1,,,,,1,1.80,10,0.5,0.5,1,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000067,1,2017-12-15 00:18:24,2017-12-15 01:04:16,N,5,,,,,1,5.00,0,0,0,0,10.5,,0.3,10.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000068,2,2017-12-15 00:08:27,2017-12-15 
00:38:37,N,1,,,,,1,4.88,21.5,0.5,0.5,2.28,0,,0.3,25.08,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000069,2,2017-12-15 00:42:27,2017-12-15 00:53:50,N,1,,,,,1,5.28,17,0.5,0.5,4.81,5.76,,0.3,28.87,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000070,2,2017-12-15 00:05:56,2017-12-15 00:22:42,N,1,,,,,2,2.29,11.5,0.5,0.5,2.56,0,,0.3,15.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000071,2,2017-12-15 00:27:09,2017-12-15 00:31:47,N,1,,,,,2,0.77,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000072,2,2017-12-15 00:42:03,2017-12-15 01:00:47,N,1,,,,,2,3.28,14.5,0.5,0.5,3.95,0,,0.3,19.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000073,1,2017-12-15 00:45:21,2017-12-15 00:50:56,N,1,,,,,1,1.50,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000074,2,2017-12-15 00:08:09,2017-12-15 00:21:11,N,1,,,,,5,1.97,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000075,2,2017-12-15 00:21:48,2017-12-15 00:57:13,N,1,,,,,5,5.40,24,0.5,0.5,0,0,,0.3,25.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000076,1,2017-12-15 00:09:11,2017-12-15 00:35:18,N,1,,,,,4,5.20,21.5,0.5,0.5,0,0,,0.3,22.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000077,1,2017-12-15 00:42:08,2017-12-15 00:42:09,N,1,,,,,1,5.30,2.5,0.5,0.5,0,0,,0.3,3.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000078,1,2017-12-15 00:45:28,2017-12-15 01:05:27,N,1,,,,,1,6.90,22.5,0.5,0.5,3,5.76,,0.3,32.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000079,2,2017-12-15 00:03:48,2017-12-15 00:38:38,N,1,,,,,6,7.86,29,0.5,0.5,9.09,0,,0.3,39.39,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000080,1,2017-12-15 00:25:04,2017-12-15 00:28:18,N,1,,,,,2,0.60,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000081,1,2017-12-15 00:34:23,2017-12-15 
00:59:40,N,1,,,,,1,3.60,17.5,0.5,0.5,3,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000082,2,2017-12-15 00:53:37,2017-12-15 01:11:06,N,1,,,,,1,2.77,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000083,2,2017-12-15 00:09:00,2017-12-15 00:42:14,N,1,,,,,1,4.94,24.5,0.5,0.5,5.16,0,,0.3,30.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000084,1,2017-12-15 00:00:49,2017-12-15 00:07:51,N,1,,,,,1,0.50,6,0.5,0.5,1,0,,0.3,8.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000085,1,2017-12-15 00:22:47,2017-12-15 00:30:21,N,1,,,,,4,0.60,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000086,1,2017-12-15 00:32:04,2017-12-15 00:39:38,N,1,,,,,1,0.90,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000087,1,2017-12-15 00:50:23,2017-12-15 01:05:29,N,1,,,,,1,3.00,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000088,2,2017-12-15 00:08:01,2017-12-15 00:25:35,N,1,,,,,1,2.80,14,0.5,0.5,0,0,,0.3,15.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000089,2,2017-12-15 00:27:27,2017-12-15 00:55:33,N,1,,,,,1,4.05,19.5,0.5,0.5,5.2,0,,0.3,26,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000090,2,2017-12-15 00:11:22,2017-12-15 00:37:10,N,1,,,,,2,5.04,19.5,0.5,0.5,0,0,,0.3,20.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000091,2,2017-12-15 01:00:50,2017-12-15 01:29:18,N,1,,,,,2,7.11,24.5,0.5,0.5,0,0,,0.3,25.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000092,1,2017-12-15 00:26:01,2017-12-15 00:48:49,N,1,,,,,1,14.60,40,0.5,0.5,8.25,0,,0.3,49.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000093,1,2017-12-15 00:10:53,2017-12-15 00:48:57,N,1,,,,,1,14.30,43.5,0.5,0.5,8.95,0,,0.3,53.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000094,2,2017-12-15 00:31:37,2017-12-15 
00:37:18,N,1,,,,,1,0.48,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000095,2,2017-12-15 00:41:42,2017-12-15 00:51:54,N,1,,,,,1,1.66,9,0.5,0.5,1,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000096,2,2017-12-15 00:53:10,2017-12-15 01:12:40,N,1,,,,,1,3.08,15.5,0.5,0.5,1,0,,0.3,17.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000097,2,2017-12-15 00:17:32,2017-12-15 00:30:33,N,1,,,,,5,1.14,9,0.5,0.5,1,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000098,2,2017-12-15 00:57:51,2017-12-15 01:11:23,N,1,,,,,5,2.01,10,0.5,0.5,2.26,0,,0.3,13.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000099,1,2017-12-15 00:43:52,2017-12-15 00:52:43,N,1,,,,,4,1.60,8,0.5,0.5,0,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000100,1,2017-12-15 00:02:26,2017-12-15 00:13:59,N,1,,,,,1,1.10,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000101,1,2017-12-15 00:14:19,2017-12-15 00:56:54,N,5,,,,,1,18.50,100,0,0,15,0,,0.3,115.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000102,1,2017-12-15 00:40:45,2017-12-15 00:54:30,N,1,,,,,1,2.10,12,0.5,0.5,2.2,0,,0.3,15.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000103,2,2017-12-15 00:37:36,2017-12-15 00:53:05,N,1,,,,,1,3.60,14,0.5,0.5,2,0,,0.3,17.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000104,2,2017-12-15 00:53:59,2017-12-15 00:55:30,N,1,,,,,1,0.22,3,0.5,0.5,0,0,,0.3,4.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000105,1,2017-12-15 00:36:23,2017-12-15 00:51:06,N,1,,,,,1,2.30,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000106,2,2017-12-15 00:19:27,2017-12-15 00:31:53,N,1,,,,,6,1.49,9.5,0.5,0.5,2.16,0,,0.3,12.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000107,2,2017-12-15 00:39:17,2017-12-15 
00:44:01,N,1,,,,,6,0.79,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000108,1,2017-12-15 00:09:38,2017-12-15 00:47:23,N,1,,,,,2,17.90,51,0.5,0.5,1,17.28,,0.3,70.58,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000109,1,2017-12-15 00:49:21,2017-12-15 01:11:02,N,1,,,,,1,14.00,38,0.5,0.5,1,0,,0.3,40.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000110,2,2017-12-15 00:32:24,2017-12-15 00:38:56,N,1,,,,,1,0.80,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000111,2,2017-12-15 00:40:45,2017-12-15 00:55:44,N,1,,,,,1,1.54,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000112,1,2017-12-15 00:34:39,2017-12-15 00:41:03,N,1,,,,,1,1.00,6.5,0.5,0.5,1.95,0,,0.3,9.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000113,1,2017-12-15 00:43:04,2017-12-15 00:58:19,N,1,,,,,1,2.80,12.5,0.5,0.5,2.75,0,,0.3,16.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000114,2,2017-12-15 00:32:50,2017-12-15 00:45:30,N,1,,,,,1,1.56,9.5,0.5,0.5,2.16,0,,0.3,12.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000115,2,2017-12-15 00:50:04,2017-12-15 00:59:07,N,1,,,,,1,1.70,8.5,0.5,0.5,2.45,0,,0.3,12.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000116,1,2017-12-15 00:31:07,2017-12-15 00:35:35,N,1,,,,,1,0.60,5,0.5,0.5,1,0,,0.3,7.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000117,1,2017-12-15 00:44:08,2017-12-15 01:12:08,N,1,,,,,1,7.30,26.5,0.5,0.5,0,0,,0.3,27.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000118,2,2017-12-15 00:40:24,2017-12-15 00:45:14,N,1,,,,,1,1.47,6.5,0.5,0.5,0,0,,0.3,7.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000119,2,2017-12-15 00:46:19,2017-12-15 01:02:19,N,1,,,,,2,1.87,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000120,2,2017-12-15 00:02:07,2017-12-15 
00:04:51,N,1,,,,,1,0.36,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000121,2,2017-12-15 00:22:58,2017-12-15 00:34:06,N,1,,,,,2,0.73,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000122,2,2017-12-15 00:41:31,2017-12-15 00:48:10,N,1,,,,,1,0.82,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000123,2,2017-12-15 00:52:19,2017-12-15 01:01:01,N,1,,,,,1,1.54,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000124,1,2017-12-15 00:31:59,2017-12-15 00:44:42,N,1,,,,,0,2.20,10.5,0.5,0.5,2,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000125,2,2017-12-15 00:38:43,2017-12-15 00:44:54,N,1,,,,,1,1.44,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000126,2,2017-12-15 00:51:10,2017-12-15 01:05:47,N,1,,,,,5,2.75,12,0.5,0.5,0,0,,0.3,13.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000127,2,2017-12-15 00:09:57,2017-12-15 00:25:53,N,1,,,,,1,2.61,12.5,0.5,0.5,2.76,0,,0.3,16.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000128,2,2017-12-15 00:27:34,2017-12-15 00:44:22,N,1,,,,,1,7.11,22.5,0.5,0.5,3.8,0,,0.3,27.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000129,2,2017-12-15 00:29:13,2017-12-15 00:53:56,N,1,,,,,2,9.73,30,0.5,0.5,0,0,,0.3,31.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000130,2,2017-12-15 00:28:32,2017-12-15 00:33:57,N,1,,,,,6,0.59,5.5,0.5,0.5,1.36,0,,0.3,8.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000131,2,2017-12-15 00:37:51,2017-12-15 01:55:38,N,1,,,,,6,15.34,58,0.5,0.5,0,0,,0.3,59.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000132,2,2017-12-15 00:04:24,2017-12-15 00:13:56,N,1,,,,,3,1.46,8,0.5,0.5,2,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000133,2,2017-12-15 00:38:48,2017-12-15 
01:01:48,N,1,,,,,2,2.72,15.5,0.5,0.5,4.2,0,,0.3,21,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000134,1,2017-12-15 00:33:15,2017-12-15 00:47:55,N,1,,,,,1,9.80,27,0.5,0.5,8,0,,0.3,36.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000135,2,2017-12-15 00:41:39,2017-12-15 01:14:47,N,1,,,,,1,8.90,30.5,0.5,0.5,0,0,,0.3,31.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000136,1,2017-12-15 00:50:41,2017-12-15 00:59:29,N,1,,,,,1,1.30,8,0.5,0.5,1.5,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000137,2,2017-12-15 00:49:15,2017-12-15 00:58:07,N,1,,,,,4,1.87,9,0.5,0.5,2.06,0,,0.3,12.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000138,2,2017-12-14 23:51:20,2017-12-15 00:50:09,N,1,,,,,1,10.08,43.5,0.5,0.5,8.96,0,,0.3,53.76,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000139,2,2017-12-15 00:05:31,2017-12-15 00:19:55,N,1,,,,,5,2.97,12,0.5,0.5,2.66,0,,0.3,15.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000140,2,2017-12-15 00:29:06,2017-12-15 01:09:26,N,1,,,,,3,13.26,41.5,0.5,0.5,0,0,,0.3,42.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000141,2,2017-12-15 00:26:34,2017-12-15 00:51:30,N,1,,,,,1,5.36,21,0.5,0.5,2,0,,0.3,24.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000142,2,2017-12-15 00:04:47,2017-12-15 00:23:09,N,1,,,,,1,2.68,13.5,0.5,0.5,4.44,0,,0.3,19.24,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000143,2,2017-12-15 00:31:16,2017-12-15 00:44:45,N,1,,,,,1,2.40,11,0.5,0.5,1,0,,0.3,13.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000144,2,2017-12-15 00:49:37,2017-12-15 01:21:04,N,1,,,,,1,6.83,25.5,0.5,0.5,5.36,0,,0.3,32.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000145,2,2017-12-15 00:10:01,2017-12-15 00:16:49,N,1,,,,,1,0.91,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000146,2,2017-12-15 00:19:03,2017-12-15 
00:30:11,N,1,,,,,1,1.81,9.5,0.5,0.5,2,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000147,2,2017-12-15 00:31:07,2017-12-15 00:34:53,N,1,,,,,1,0.92,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000148,2,2017-12-15 00:43:48,2017-12-15 01:04:53,N,1,,,,,1,4.63,18,0.5,0.5,3.86,0,,0.3,23.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000149,1,2017-12-15 00:22:11,2017-12-15 00:32:11,N,1,,,,,1,2.20,10,0.5,0.5,2.25,0,,0.3,13.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000150,1,2017-12-15 00:48:02,2017-12-15 01:01:45,N,1,,,,,1,1.30,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000151,2,2017-12-15 00:05:43,2017-12-15 00:12:57,N,1,,,,,1,2.05,9,0.5,0.5,2.06,0,,0.3,14.31,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000152,2,2017-12-15 00:32:48,2017-12-15 00:55:01,N,1,,,,,1,9.23,28,0.5,0.5,6,5.76,,0.3,41.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000153,2,2017-12-15 01:00:10,2017-12-15 01:16:04,N,1,,,,,1,4.37,15.5,0.5,0.5,3.36,0,,0.3,20.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000154,2,2017-12-15 00:08:13,2017-12-15 00:28:37,N,1,,,,,1,3.22,15,0.5,0.5,0,0,,0.3,16.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000155,2,2017-12-15 00:31:00,2017-12-15 00:41:23,N,1,,,,,1,1.60,8.5,0.5,0.5,0,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000156,2,2017-12-15 00:42:50,2017-12-15 00:53:34,N,1,,,,,1,1.37,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000157,2,2017-12-15 00:55:45,2017-12-15 00:59:31,N,1,,,,,1,1.08,5.5,0.5,0.5,1.7,0,,0.3,8.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000158,2,2017-12-15 00:09:20,2017-12-15 00:23:24,N,1,,,,,1,2.62,11.5,0.5,0.5,3.2,0,,0.3,16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000159,2,2017-12-15 00:24:46,2017-12-15 
00:36:33,N,1,,,,,1,1.35,9.5,0.5,0.5,0,0,,0.3,10.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000160,2,2017-12-15 00:40:16,2017-12-15 00:57:48,N,1,,,,,1,3.03,13.5,0.5,0.5,1,0,,0.3,15.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000161,1,2017-12-15 00:00:35,2017-12-15 00:12:21,N,1,,,,,1,2.00,9.5,0.5,0.5,3.2,0,,0.3,14,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000162,1,2017-12-15 00:13:48,2017-12-15 00:20:03,N,1,,,,,1,1.00,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000163,1,2017-12-15 00:20:47,2017-12-15 00:31:56,N,1,,,,,1,1.70,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000164,1,2017-12-15 00:33:29,2017-12-15 01:02:24,N,1,,,,,1,9.00,29,0.5,0.5,7.55,0,,0.3,37.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000165,1,2017-12-15 00:09:42,2017-12-15 00:18:12,N,1,,,,,2,2.40,9,0.5,0.5,0,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000166,1,2017-12-15 00:57:43,2017-12-15 01:16:05,N,1,,,,,1,3.30,13.5,0.5,0.5,2.95,0,,0.3,17.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000167,2,2017-12-15 00:09:55,2017-12-15 00:19:30,N,1,,,,,1,1.45,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000168,2,2017-12-15 00:20:11,2017-12-15 00:33:01,N,1,,,,,1,2.18,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000169,2,2017-12-15 00:35:09,2017-12-15 01:09:59,N,1,,,,,1,17.48,51,0.5,0.5,0,5.76,,0.3,58.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000170,2,2017-12-15 00:26:34,2017-12-15 00:41:45,N,1,,,,,1,1.81,11,0.5,0.5,3.69,0,,0.3,15.99,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000171,2,2017-12-15 00:46:38,2017-12-15 01:08:50,N,1,,,,,1,3.31,16,0.5,0.5,3.46,0,,0.3,20.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000172,2,2017-12-15 00:19:37,2017-12-15 
00:35:33,N,1,,,,,2,1.61,10.5,0.5,0.5,5,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000173,2,2017-12-15 00:38:17,2017-12-15 00:45:48,N,1,,,,,2,1.01,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000174,2,2017-12-15 00:58:49,2017-12-15 01:35:08,N,1,,,,,2,5.57,26.5,0.5,0.5,5.56,0,,0.3,33.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000175,1,2017-12-15 00:17:10,2017-12-15 00:52:01,N,1,,,,,1,4.00,23,0.5,0.5,6,5.76,,0.3,36.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000176,2,2017-12-15 00:51:45,2017-12-15 01:16:36,N,2,,,,,2,16.55,52,0,0.5,10.56,0,,0.3,63.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000177,2,2017-12-15 00:53:53,2017-12-15 00:59:08,N,1,,,,,3,1.09,5.5,0.5,0.5,1.36,0,,0.3,8.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000178,2,2017-12-15 00:17:56,2017-12-15 00:41:33,N,1,,,,,1,5.12,19.5,0.5,0.5,4.16,0,,0.3,24.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000179,2,2017-12-15 00:45:41,2017-12-15 00:58:18,N,1,,,,,1,2.83,11.5,0.5,0.5,3.2,0,,0.3,16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000180,2,2017-12-15 00:01:12,2017-12-15 00:11:38,N,1,,,,,1,1.65,8.5,0.5,0.5,2.45,0,,0.3,12.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000181,2,2017-12-15 00:12:58,2017-12-15 00:19:54,N,1,,,,,1,0.89,6.5,0.5,0.5,1.95,0,,0.3,9.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000182,2,2017-12-15 00:24:50,2017-12-15 00:26:01,N,5,,,,,1,0.00,100,0,0.5,8.2,0,,0.3,109,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000183,1,2017-12-15 00:32:04,2017-12-15 01:01:36,N,2,,,,,3,20.80,52,0,0.5,0,5.76,,0.3,58.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000184,2,2017-12-15 00:18:06,2017-12-15 01:07:36,N,1,,,,,1,25.24,71,0.5,0.5,15.61,5.76,,0.3,93.67,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000185,2,2017-12-15 00:10:06,2017-12-15 
00:20:55,N,1,,,,,1,1.84,9.5,0.5,0.5,2.5,0,,0.3,13.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000186,2,2017-12-15 00:31:27,2017-12-15 00:45:44,N,1,,,,,1,1.66,10.5,0.5,0.5,1.2,0,,0.3,13,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000187,2,2017-12-15 00:53:13,2017-12-15 01:13:27,N,1,,,,,1,2.40,14,0.5,0.5,0,0,,0.3,15.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000188,2,2017-12-15 00:03:43,2017-12-15 00:11:39,N,1,,,,,2,1.58,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000189,2,2017-12-15 00:18:02,2017-12-15 00:43:48,N,1,,,,,2,4.61,19,0.5,0.5,0,0,,0.3,20.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000190,2,2017-12-15 00:48:20,2017-12-15 01:09:19,N,1,,,,,2,2.67,15,0.5,0.5,1,0,,0.3,17.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000191,1,2017-12-15 00:05:45,2017-12-15 00:26:31,N,1,,,,,2,3.50,17.5,0.5,0.5,2,0,,0.3,20.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000192,1,2017-12-15 00:35:30,2017-12-15 00:47:19,N,1,,,,,1,2.50,10.5,0.5,0.5,2,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000193,1,2017-12-15 00:55:01,2017-12-15 01:06:13,N,1,,,,,1,0.90,8.5,0.5,0.5,2.45,0,,0.3,12.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000194,2,2017-12-15 00:23:31,2017-12-15 00:30:12,N,1,,,,,1,0.98,6.5,0.5,0.5,1,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000195,2,2017-12-15 00:33:17,2017-12-15 00:55:09,N,1,,,,,1,3.20,15.5,0.5,0.5,3.36,0,,0.3,20.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000196,2,2017-12-15 00:56:39,2017-12-15 01:14:39,N,1,,,,,1,4.65,18,0.5,0.5,3.86,0,,0.3,23.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000197,2,2017-12-15 00:12:57,2017-12-15 00:18:57,N,1,,,,,1,1.03,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000198,2,2017-12-15 00:20:41,2017-12-15 
00:46:44,N,1,,,,,1,5.33,21,0.5,0.5,1,0,,0.3,23.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000199,2,2017-12-14 23:56:15,2017-12-15 00:20:06,N,1,,,,,1,10.45,30.5,0.5,0.5,7.51,5.76,,0.3,45.07,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000200,2,2017-12-15 00:30:30,2017-12-15 01:22:30,N,1,,,,,1,13.51,48,0.5,0.5,0,0,,0.3,49.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000201,2,2017-12-15 00:03:15,2017-12-15 00:21:04,N,1,,,,,1,2.38,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000202,2,2017-12-15 00:23:15,2017-12-15 00:27:40,N,1,,,,,1,0.47,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000203,2,2017-12-15 00:31:32,2017-12-15 00:49:33,N,1,,,,,1,2.91,14,0.5,0.5,1,0,,0.3,16.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000204,2,2017-12-15 00:08:42,2017-12-15 00:12:21,N,1,,,,,1,0.39,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000205,2,2017-12-15 00:13:31,2017-12-15 00:24:20,N,1,,,,,1,2.49,10.5,0.5,0.5,1.2,0,,0.3,13,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000206,2,2017-12-15 00:29:19,2017-12-15 00:58:28,N,1,,,,,1,3.72,20,0.5,0.5,4.26,0,,0.3,25.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000207,2,2017-12-15 00:28:14,2017-12-15 00:34:25,N,1,,,,,1,1.55,7,0.5,0.5,1,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000208,2,2017-12-15 00:36:22,2017-12-15 00:37:55,N,1,,,,,1,0.45,3.5,0.5,0.5,0.96,0,,0.3,5.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000209,2,2017-12-15 00:39:37,2017-12-15 01:35:03,N,1,,,,,1,13.96,47,0.5,0.5,5.08,5.76,,0.3,59.14,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000210,2,2017-12-15 00:25:04,2017-12-15 00:33:25,N,1,,,,,1,1.64,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000211,2,2017-12-15 00:34:48,2017-12-15 
00:43:08,N,1,,,,,1,1.60,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000212,2,2017-12-15 01:01:52,2017-12-15 01:36:51,N,1,,,,,1,9.76,35,0.5,0.5,0,0,,0.3,36.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000213,1,2017-12-15 00:46:44,2017-12-15 00:49:28,N,1,,,,,1,0.50,4,0.5,0.5,0,0,,0.3,5.3,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000214,2,2017-12-15 00:06:12,2017-12-15 00:31:47,N,1,,,,,1,4.02,18,0.5,0.5,1.5,0,,0.3,20.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000215,2,2017-12-15 00:44:58,2017-12-15 01:05:54,N,1,,,,,1,4.67,18,0.5,0.5,0,0,,0.3,19.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000216,2,2017-12-15 00:17:30,2017-12-15 00:24:38,N,1,,,,,1,1.71,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000217,2,2017-12-15 00:34:51,2017-12-15 00:47:29,N,1,,,,,1,1.75,10,0.5,0.5,2.26,0,,0.3,13.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000218,2,2017-12-15 00:50:18,2017-12-15 01:10:23,N,1,,,,,1,3.51,16,0.5,0.5,2.5,0,,0.3,19.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000219,1,2017-12-15 00:21:43,2017-12-15 00:30:20,N,1,,,,,1,1.20,7.5,0.5,0.5,1.75,0,,0.3,10.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000220,1,2017-12-15 00:45:42,2017-12-15 01:20:41,N,1,,,,,1,6.70,27,0.5,0.5,6.8,5.76,,0.3,40.86,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000221,2,2017-12-15 00:16:52,2017-12-15 00:27:51,N,1,,,,,2,1.33,8.5,0.5,0.5,1,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000222,2,2017-12-15 00:31:34,2017-12-15 00:45:50,N,1,,,,,2,6.73,20.5,0.5,0.5,4.36,0,,0.3,26.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000223,2,2017-12-15 00:02:41,2017-12-15 00:30:26,N,1,,,,,1,6.01,23,0.5,0.5,0,0,,0.3,24.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000224,2,2017-12-15 00:47:01,2017-12-15 
00:55:49,N,1,,,,,1,1.98,8.5,0.5,0.5,2,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000225,2,2017-12-15 00:58:15,2017-12-15 01:14:41,N,1,,,,,1,3.40,13.5,0.5,0.5,4.44,0,,0.3,19.24,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000226,2,2017-12-15 00:13:35,2017-12-15 00:27:29,N,1,,,,,1,2.59,11.5,0.5,0.5,3.84,0,,0.3,16.64,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000227,2,2017-12-15 00:38:30,2017-12-15 00:43:51,N,1,,,,,1,1.15,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000228,2,2017-12-15 00:45:24,2017-12-15 00:51:18,N,1,,,,,1,1.00,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000229,2,2017-12-15 00:55:04,2017-12-15 01:08:57,N,1,,,,,1,3.47,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000230,2,2017-12-15 00:04:17,2017-12-15 00:10:33,N,1,,,,,1,1.23,6.5,0.5,0.5,1.95,0,,0.3,9.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000231,2,2017-12-15 00:13:03,2017-12-15 00:26:55,N,1,,,,,1,2.78,12,0.5,0.5,3.32,0,,0.3,16.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000232,2,2017-12-15 00:30:01,2017-12-15 00:37:15,N,1,,,,,1,0.89,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000233,2,2017-12-15 00:38:43,2017-12-15 00:47:26,N,1,,,,,1,1.40,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000234,2,2017-12-15 00:49:45,2017-12-15 00:53:07,N,1,,,,,1,0.59,4.5,0.5,0.5,1.16,0,,0.3,6.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000235,2,2017-12-15 00:56:01,2017-12-15 01:07:10,N,1,,,,,1,2.46,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000236,2,2017-12-15 00:33:40,2017-12-15 00:39:42,N,1,,,,,2,1.31,6.5,0.5,0.5,1,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000237,2,2017-12-15 00:55:08,2017-12-15 
01:29:46,N,1,,,,,2,11.64,37,0.5,0.5,8.81,5.76,,0.3,52.87,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000238,2,2017-12-15 00:38:07,2017-12-15 00:42:29,N,1,,,,,6,0.95,5.5,0.5,0.5,1.7,0,,0.3,8.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000239,1,2017-12-15 00:02:32,2017-12-15 00:14:54,N,1,,,,,2,2.40,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000240,1,2017-12-15 00:47:13,2017-12-15 01:23:55,N,1,,,,,2,9.40,32.5,0.5,0.5,0,0,,0.3,33.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000241,2,2017-12-15 00:07:00,2017-12-15 00:22:39,N,1,,,,,2,2.66,12,0.5,0.5,3.99,0,,0.3,17.29,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000242,2,2017-12-15 00:24:20,2017-12-15 00:39:27,N,1,,,,,2,3.08,12.5,0.5,0.5,1,0,,0.3,14.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000243,2,2017-12-15 00:42:39,2017-12-15 00:57:57,N,1,,,,,2,2.33,11.5,0.5,0.5,2.56,0,,0.3,15.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000244,1,2017-12-15 00:00:39,2017-12-15 00:03:05,N,1,,,,,1,0.40,4,0.5,0.5,0,0,,0.3,5.3,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000245,2,2017-12-15 00:10:32,2017-12-15 00:43:16,N,1,,,,,5,8.04,28,0.5,0.5,3.7,0,,0.3,33,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000246,2,2017-12-15 00:35:42,2017-12-15 00:51:25,N,1,,,,,1,8.76,25,0.5,0.5,5,5.76,,0.3,37.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000247,2,2017-12-15 00:52:46,2017-12-15 01:04:05,N,1,,,,,3,1.78,9.5,0.5,0.5,1,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000248,2,2017-12-15 00:05:05,2017-12-15 00:11:34,N,1,,,,,1,1.08,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000249,2,2017-12-15 00:12:28,2017-12-15 00:37:12,N,1,,,,,1,3.21,16.5,0.5,0.5,3.56,0,,0.3,21.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000250,2,2017-12-15 00:44:06,2017-12-15 
01:04:05,N,1,,,,,1,5.06,17,0.5,0.5,0,0,,0.3,18.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000251,1,2017-12-15 00:05:32,2017-12-15 00:09:10,N,1,,,,,1,0.60,4.5,0.5,0.5,1.15,0,,0.3,6.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000252,1,2017-12-15 00:27:29,2017-12-15 00:39:05,N,1,,,,,1,2.20,10,0.5,0.5,2.25,0,,0.3,13.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000253,1,2017-12-15 00:07:54,2017-12-15 00:10:50,N,1,,,,,2,0.60,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000254,1,2017-12-15 00:12:02,2017-12-15 00:17:01,N,1,,,,,2,1.00,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000255,1,2017-12-15 00:24:58,2017-12-15 00:37:38,N,1,,,,,1,1.60,9.5,0.5,0.5,1.7,0,,0.3,12.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000256,1,2017-12-15 00:41:49,2017-12-15 00:53:48,N,1,,,,,1,3.90,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000257,2,2017-12-14 23:58:09,2017-12-15 00:02:07,N,1,,,,,1,0.49,4.5,0.5,0.5,1.16,0,,0.3,6.96,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000258,2,2017-12-15 00:05:04,2017-12-15 00:18:50,N,1,,,,,1,2.02,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000259,2,2017-12-15 00:21:45,2017-12-15 00:55:32,N,1,,,,,1,5.18,23.5,0.5,0.5,4.96,0,,0.3,29.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000260,2,2017-12-15 00:04:25,2017-12-15 00:11:39,N,1,,,,,1,1.13,7,0.5,0.5,2.08,0,,0.3,10.38,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000261,2,2017-12-15 00:20:57,2017-12-15 00:31:42,N,1,,,,,1,1.08,8,0.5,0.5,1,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000262,2,2017-12-15 00:39:26,2017-12-15 00:50:31,N,1,,,,,1,2.09,9.5,0.5,0.5,0,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000263,2,2017-12-15 00:53:53,2017-12-15 
01:04:58,N,1,,,,,1,1.76,9.5,0.5,0.5,1,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000264,2,2017-12-15 00:08:30,2017-12-15 00:20:52,N,1,,,,,1,1.75,9.5,0.5,0.5,0,0,,0.3,10.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000265,2,2017-12-15 00:23:40,2017-12-15 00:41:49,N,1,,,,,1,2.18,12.5,0.5,0.5,2.76,0,,0.3,16.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000266,2,2017-12-15 00:44:35,2017-12-15 00:49:32,N,1,,,,,1,0.79,5.5,0.5,0.5,2.04,0,,0.3,8.84,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000267,2,2017-12-15 00:52:36,2017-12-15 01:02:00,N,1,,,,,1,0.73,7.5,0.5,0.5,1.76,0,,0.3,10.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000268,2,2017-12-15 00:31:52,2017-12-15 00:47:40,N,1,,,,,1,1.80,11.5,0.5,0.5,1.08,0,,0.3,13.88,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000269,2,2017-12-15 00:53:43,2017-12-15 01:05:05,N,1,,,,,1,2.54,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000270,2,2017-12-15 00:06:34,2017-12-15 00:13:18,N,1,,,,,2,1.21,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000271,2,2017-12-15 00:18:09,2017-12-15 00:25:11,N,1,,,,,2,1.29,7,0.5,0.5,2.08,0,,0.3,10.38,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000272,2,2017-12-15 00:30:15,2017-12-15 00:38:42,N,1,,,,,2,1.38,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000273,2,2017-12-15 00:40:42,2017-12-15 00:44:57,N,1,,,,,2,1.32,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000274,1,2017-12-15 00:47:31,2017-12-15 00:49:29,N,1,,,,,3,0.20,3.5,0.5,0.5,0,0,,0.3,4.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000275,1,2017-12-15 00:52:16,2017-12-15 01:28:36,N,1,,,,,1,7.30,29.5,0.5,0.5,0,0,,0.3,30.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000276,1,2017-12-15 00:01:57,2017-12-15 
00:18:51,N,1,,,,,1,11.40,31.5,0.5,0.5,8.2,0,,0.3,41,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000277,1,2017-12-15 00:39:16,2017-12-15 01:04:21,N,1,,,,,1,10.10,30.5,0.5,0.5,7.5,5.76,,0.3,45.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000278,1,2017-12-15 00:18:06,2017-12-15 00:26:33,N,1,,,,,1,1.70,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000279,1,2017-12-15 00:28:42,2017-12-15 01:10:48,N,1,,,,,1,8.70,34.5,0.5,0.5,7.15,0,,0.3,42.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000280,2,2017-12-15 00:20:44,2017-12-15 00:47:16,N,1,,,,,1,3.10,17.5,0.5,0.5,3.76,0,,0.3,22.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000281,2,2017-12-15 00:48:38,2017-12-15 00:58:46,N,1,,,,,1,1.87,9,0.5,0.5,2.06,0,,0.3,12.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000282,2,2017-12-15 00:04:31,2017-12-15 00:25:09,N,1,,,,,1,2.28,14.5,0.5,0.5,3.16,0,,0.3,18.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000283,2,2017-12-15 00:26:46,2017-12-15 00:33:08,N,1,,,,,1,0.56,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000284,2,2017-12-15 00:37:12,2017-12-15 01:04:43,N,1,,,,,1,6.54,24,0.5,0.5,0,0,,0.3,25.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000285,2,2017-12-15 00:32:16,2017-12-15 01:27:04,N,1,,,,,1,14.64,50,0.5,0.5,10.26,0,,0.3,61.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000286,2,2017-12-15 00:59:27,2017-12-15 01:20:51,N,1,,,,,1,5.95,20,0.5,0.5,0,0,,0.3,21.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000287,2,2017-12-15 00:48:18,2017-12-15 01:14:34,N,1,,,,,1,10.73,32.5,0.5,0.5,0,5.76,,0.3,39.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000288,1,2017-12-15 00:07:31,2017-12-15 00:19:09,N,1,,,,,2,2.20,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000289,1,2017-12-15 00:24:03,2017-12-15 
00:42:25,N,1,,,,,1,2.30,13,0.5,0.5,3.55,0,,0.3,17.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000290,1,2017-12-15 00:44:12,2017-12-15 00:55:18,N,1,,,,,1,1.80,9.5,0.5,0.5,2.15,0,,0.3,12.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000291,1,2017-12-15 00:56:48,2017-12-15 01:13:09,N,1,,,,,2,2.60,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000292,1,2017-12-15 00:19:32,2017-12-15 00:25:08,N,1,,,,,1,1.20,6,0.5,0.5,1,0,,0.3,8.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000293,1,2017-12-15 00:27:00,2017-12-15 00:51:04,N,1,,,,,1,4.60,19,0.5,0.5,4.05,0,,0.3,24.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000294,1,2017-12-15 00:20:38,2017-12-15 00:25:15,N,1,,,,,1,1.20,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000295,1,2017-12-15 00:41:06,2017-12-15 00:52:18,N,1,,,,,1,1.80,9.5,0.5,0.5,2.7,0,,0.3,13.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000296,1,2017-12-15 00:55:55,2017-12-15 01:12:31,N,1,,,,,1,3.30,14,0.5,0.5,1,0,,0.3,16.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000297,2,2017-12-15 00:05:55,2017-12-15 00:14:30,N,1,,,,,1,1.63,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000298,2,2017-12-15 00:27:26,2017-12-15 01:03:16,N,1,,,,,1,6.38,26,0.5,0.5,5.46,0,,0.3,32.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000299,1,2017-12-15 00:08:21,2017-12-15 00:28:24,N,1,,,,,1,3.50,15,0.5,0.5,0,0,,0.3,16.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000300,1,2017-12-15 00:32:08,2017-12-15 00:43:23,N,1,,,,,2,2.80,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000301,1,2017-12-15 00:49:47,2017-12-15 00:53:20,N,1,,,,,2,0.60,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000302,1,2017-12-15 00:30:10,2017-12-15 
01:07:08,N,1,,,,,1,9.40,33.5,0.5,0.5,5,5.76,,0.3,45.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000303,1,2017-12-15 00:04:37,2017-12-15 00:11:07,N,1,,,,,1,1.00,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000304,1,2017-12-15 00:11:47,2017-12-15 00:44:18,N,1,,,,,1,5.70,24.5,0.5,0.5,0,0,,0.3,25.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000305,1,2017-12-15 00:58:49,2017-12-15 01:17:57,N,1,,,,,1,5.20,18.5,0.5,0.5,3.95,0,,0.3,23.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000306,1,2017-12-15 00:02:04,2017-12-15 00:06:33,N,1,,,,,1,0.80,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000307,1,2017-12-15 00:19:59,2017-12-15 00:38:04,Y,1,,,,,1,2.80,13,0.5,0.5,3,0,,0.3,17.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000308,2,2017-12-15 00:04:13,2017-12-15 00:44:18,N,1,,,,,6,7.07,30,0.5,0.5,6.26,0,,0.3,37.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000309,2,2017-12-15 00:54:44,2017-12-15 01:03:50,N,1,,,,,5,1.55,8,0.5,0.5,2.32,0,,0.3,11.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000310,1,2017-12-15 00:43:05,2017-12-15 00:52:01,N,1,,,,,1,2.10,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000311,1,2017-12-15 00:05:03,2017-12-15 00:18:37,N,1,,,,,1,2.60,11,0.5,0.5,2.45,0,,0.3,14.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000312,1,2017-12-15 00:20:35,2017-12-15 00:38:49,N,1,,,,,1,2.50,13.5,0.5,0.5,4.4,0,,0.3,19.2,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000313,1,2017-12-15 00:39:52,2017-12-15 00:44:38,N,1,,,,,1,1.10,5.5,0.5,0.5,1.35,0,,0.3,8.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000314,1,2017-12-15 00:49:40,2017-12-15 01:00:41,N,1,,,,,1,1.70,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000315,1,2017-12-15 00:47:28,2017-12-15 
00:52:07,N,1,,,,,1,1.20,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000316,2,2017-12-15 00:06:56,2017-12-15 00:44:48,N,1,,,,,5,3.86,23.5,0.5,0.5,4.96,0,,0.3,29.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000317,2,2017-12-15 00:55:16,2017-12-15 01:06:36,N,1,,,,,5,1.94,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000318,1,2017-12-15 00:30:37,2017-12-15 00:30:47,N,2,,,,,1,0.00,52,0,0.5,8,5.76,,0.3,66.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000319,1,2017-12-15 00:34:47,2017-12-15 01:55:20,N,1,,,,,1,20.40,75.5,0.5,0.5,5,5.76,,0.3,87.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000320,2,2017-12-14 23:51:49,2017-12-15 00:09:43,N,1,,,,,1,1.19,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000321,2,2017-12-15 00:19:42,2017-12-15 00:27:12,N,1,,,,,1,0.78,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000322,2,2017-12-15 00:28:34,2017-12-15 00:40:23,N,1,,,,,1,2.75,11.5,0.5,0.5,1.5,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000323,2,2017-12-15 00:35:15,2017-12-15 00:55:59,N,1,,,,,1,7.24,23.5,0.5,0.5,0,5.76,,0.3,30.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000324,2,2017-12-15 00:26:02,2017-12-15 00:29:23,N,1,,,,,1,0.35,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000325,2,2017-12-15 00:31:42,2017-12-15 00:36:19,N,1,,,,,1,1.07,5.5,0.5,0.5,1.36,0,,0.3,8.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000326,2,2017-12-15 00:36:55,2017-12-15 00:43:24,N,1,,,,,1,1.38,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000327,2,2017-12-15 00:45:40,2017-12-15 01:24:42,N,1,,,,,1,6.01,27,0.5,0.5,4.24,0,,0.3,32.54,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000328,2,2017-12-15 00:11:05,2017-12-15 
00:19:49,N,1,,,,,2,1.16,7.5,0.5,0.5,1,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000329,2,2017-12-15 00:20:45,2017-12-15 00:54:08,N,1,,,,,2,5.22,23.5,0.5,0.5,2.4,0,,0.3,27.2,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000330,2,2017-12-14 23:59:30,2017-12-15 00:27:27,N,1,,,,,6,4.86,21,0.5,0.5,5.58,0,,0.3,27.88,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000331,2,2017-12-15 00:32:34,2017-12-15 00:43:36,N,1,,,,,6,2.73,10.5,0.5,0.5,0.59,0,,0.3,12.39,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000332,2,2017-12-15 00:55:30,2017-12-15 01:14:02,N,1,,,,,6,3.22,14.5,0.5,0.5,1.5,0,,0.3,17.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000333,2,2017-12-15 00:48:56,2017-12-15 00:57:15,N,1,,,,,6,1.23,7.5,0.5,0.5,1.76,0,,0.3,10.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000334,1,2017-12-15 00:17:24,2017-12-15 00:28:44,N,1,,,,,1,2.10,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000335,1,2017-12-15 00:29:20,2017-12-15 00:40:38,N,1,,,,,1,1.70,9,0.5,0.5,2.5,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000336,1,2017-12-15 00:43:13,2017-12-15 01:04:32,N,1,,,,,1,1.90,14.5,0.5,0.5,3.15,0,,0.3,18.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000337,2,2017-12-15 00:18:55,2017-12-15 00:35:52,N,1,,,,,5,2.25,12.5,0.5,0.5,2,0,,0.3,15.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000338,2,2017-12-15 00:37:39,2017-12-15 00:52:21,N,1,,,,,5,1.76,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000339,2,2017-12-15 00:02:22,2017-12-15 00:18:02,N,1,,,,,2,1.79,11.5,0.5,0.5,1.5,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000340,2,2017-12-15 00:19:03,2017-12-15 00:55:49,N,1,,,,,2,8.96,33.5,0.5,0.5,6.96,0,,0.3,41.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000341,2,2017-12-15 00:38:47,2017-12-15 
01:02:25,N,1,,,,,1,5.67,19.5,0.5,0.5,4.16,0,,0.3,24.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000342,1,2017-12-15 00:54:07,2017-12-15 00:56:25,N,1,,,,,1,0.30,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000343,1,2017-12-15 00:59:33,2017-12-15 01:15:36,N,1,,,,,1,4.10,16,0.5,0.5,3.45,0,,0.3,20.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000344,1,2017-12-15 00:16:35,2017-12-15 00:38:22,N,1,,,,,1,3.70,16.5,0.5,0.5,3.55,0,,0.3,21.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000345,1,2017-12-15 00:41:35,2017-12-15 00:46:07,N,1,,,,,1,1.10,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000346,1,2017-12-15 00:28:16,2017-12-15 00:49:43,N,1,,,,,1,2.10,14.5,0.5,0.5,1,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000347,1,2017-12-15 00:50:32,2017-12-15 01:02:25,N,1,,,,,1,2.00,9.5,0.5,0.5,2.15,0,,0.3,12.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000348,2,2017-12-15 00:06:30,2017-12-15 00:32:46,N,2,,,,,2,18.00,52,0,0.5,8,0,,0.3,60.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000349,2,2017-12-15 00:35:57,2017-12-15 00:49:57,N,1,,,,,2,3.00,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000350,2,2017-12-15 00:51:07,2017-12-15 01:03:37,N,1,,,,,2,1.78,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000351,2,2017-12-14 23:58:18,2017-12-15 00:17:19,N,1,,,,,1,2.89,13.5,0.5,0.5,0,0,,0.3,14.8,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000352,2,2017-12-15 00:25:33,2017-12-15 00:37:17,N,1,,,,,1,1.84,9.5,0.5,0.5,2.7,0,,0.3,13.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000353,2,2017-12-15 00:38:50,2017-12-15 01:12:55,N,5,,,,,1,20.52,95,0,0.5,21.26,10.5,,0.3,127.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000354,1,2017-12-15 00:10:39,2017-12-15 
00:23:43,N,1,,,,,0,1.80,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000355,1,2017-12-15 00:26:21,2017-12-15 00:46:41,N,1,,,,,0,3.90,15.5,0.5,0.5,0,0,,0.3,16.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000356,1,2017-12-15 00:51:23,2017-12-15 01:23:51,N,1,,,,,0,4.60,21,0.5,0.5,3.2,0,,0.3,25.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000357,1,2017-12-15 00:01:01,2017-12-15 00:09:09,N,1,,,,,1,1.50,7.5,0.5,0.5,2.2,0,,0.3,11,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000358,1,2017-12-15 00:11:16,2017-12-15 00:28:12,N,1,,,,,1,4.20,16.5,0.5,0.5,3.55,0,,0.3,21.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000359,1,2017-12-15 00:41:38,2017-12-15 00:49:41,N,1,,,,,1,1.00,7,0.5,0.5,1,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000360,1,2017-12-15 00:53:12,2017-12-15 01:03:55,N,1,,,,,1,2.30,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000361,2,2017-12-15 00:17:03,2017-12-15 00:23:50,N,1,,,,,6,0.78,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000362,2,2017-12-15 00:26:35,2017-12-15 00:30:26,N,1,,,,,6,0.70,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000363,2,2017-12-15 00:33:08,2017-12-15 00:49:10,N,1,,,,,6,3.51,14.5,0.5,0.5,0,0,,0.3,15.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000364,2,2017-12-15 00:54:31,2017-12-15 01:18:59,N,1,,,,,6,6.01,21.5,0.5,0.5,0,0,,0.3,22.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000365,1,2017-12-15 00:06:33,2017-12-15 00:44:01,N,1,,,,,1,9.40,34,0.5,0.5,7.05,0,,0.3,42.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000366,1,2017-12-15 00:24:39,2017-12-15 00:49:07,N,1,,,,,1,2.60,16.5,0.5,0.5,0,0,,0.3,17.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000367,1,2017-12-15 00:53:08,2017-12-15 
01:07:56,N,1,,,,,1,3.30,13,0.5,0.5,3.55,0,,0.3,17.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000368,1,2017-12-15 00:02:28,2017-12-15 00:13:55,N,1,,,,,1,1.80,9.5,0.5,0.5,2,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000369,1,2017-12-15 00:33:04,2017-12-15 00:40:02,N,1,,,,,1,2.10,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000370,1,2017-12-15 00:03:29,2017-12-15 00:08:56,N,1,,,,,1,1.00,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000371,1,2017-12-15 00:10:46,2017-12-15 00:20:15,N,1,,,,,1,1.20,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000372,1,2017-12-15 00:22:56,2017-12-15 00:37:19,N,1,,,,,1,4.00,15,0.5,0.5,0,0,,0.3,16.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000373,1,2017-12-15 00:42:52,2017-12-15 00:51:19,N,1,,,,,1,5.00,15.5,0.5,0.5,3.35,0,,0.3,20.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000374,1,2017-12-15 00:53:47,2017-12-15 01:23:10,N,1,,,,,1,4.70,21,0.5,0.5,0,0,,0.3,22.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000375,2,2017-12-15 00:36:27,2017-12-15 01:12:34,N,1,,,,,1,10.62,35.5,0.5,0.5,4.2,0,,0.3,41,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000376,1,2017-12-15 00:28:33,2017-12-15 00:45:42,N,1,,,,,1,3.10,14,0.5,0.5,3.8,0,,0.3,19.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000377,2,2017-12-15 00:15:28,2017-12-15 00:50:41,N,1,,,,,2,6.56,28,0.5,0.5,0,0,,0.3,29.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000378,2,2017-12-15 00:56:38,2017-12-15 01:09:17,N,1,,,,,2,6.86,20.5,0.5,0.5,1.1,0,,0.3,22.9,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000379,1,2017-12-15 00:13:36,2017-12-15 01:14:17,N,1,,,,,1,13.00,47.5,0.5,0.5,9.75,0,,0.3,58.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000380,1,2017-12-15 00:03:32,2017-12-15 
00:28:38,N,1,,,,,1,4.60,19.5,0.5,0.5,1,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000381,1,2017-12-15 00:46:55,2017-12-15 01:10:49,N,1,,,,,1,5.00,20.5,0.5,0.5,0,5.76,,0.3,27.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000382,1,2017-12-15 00:09:42,2017-12-15 00:20:45,N,1,,,,,1,2.50,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000383,1,2017-12-15 00:23:14,2017-12-15 00:27:02,N,1,,,,,1,0.60,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000384,1,2017-12-15 00:31:40,2017-12-15 00:36:49,N,1,,,,,1,1.00,6,0.5,0.5,2,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000385,1,2017-12-15 00:52:15,2017-12-15 01:13:40,N,1,,,,,1,4.60,17.5,0.5,0.5,3.75,0,,0.3,22.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000386,2,2017-12-15 00:13:40,2017-12-15 00:33:20,N,1,,,,,1,1.93,13.5,0.5,0.5,1.48,0,,0.3,16.28,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000387,2,2017-12-15 00:52:47,2017-12-15 01:19:23,N,1,,,,,1,5.91,23.5,0.5,0.5,7.44,0,,0.3,32.24,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000388,1,2017-12-15 00:38:36,2017-12-15 00:56:37,N,1,,,,,1,5.90,20,0.5,0.5,4.25,0,,0.3,25.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000389,2,2017-12-15 00:15:53,2017-12-15 00:48:54,N,1,,,,,1,3.61,22,0.5,0.5,1,0,,0.3,24.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000390,2,2017-12-15 00:20:20,2017-12-15 00:40:11,N,1,,,,,1,2.02,13.5,0.5,0.5,2.96,0,,0.3,17.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000391,2,2017-12-15 00:41:22,2017-12-15 00:57:21,N,1,,,,,1,3.29,14,0.5,0.5,3.06,0,,0.3,18.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000392,2,2017-12-15 00:59:16,2017-12-15 01:09:53,N,1,,,,,1,1.08,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000393,1,2017-12-15 00:44:17,2017-12-15 
01:13:28,N,2,,,,,1,17.20,52,0,0.5,10.56,0,,0.3,63.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000394,2,2017-12-15 00:06:55,2017-12-15 00:17:28,N,1,,,,,2,1.58,9,0.5,0.5,2.06,0,,0.3,12.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000395,2,2017-12-15 00:22:35,2017-12-15 00:34:52,N,1,,,,,2,1.84,10,0.5,0.5,2.26,0,,0.3,13.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000396,2,2017-12-15 00:44:25,2017-12-15 00:54:41,N,1,,,,,2,0.92,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000397,2,2017-12-15 00:10:17,2017-12-15 00:44:44,N,1,,,,,2,4.78,23,0.5,0.5,4.86,0,,0.3,29.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000398,1,2017-12-15 00:07:26,2017-12-15 00:11:49,N,1,,,,,1,0.80,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000399,1,2017-12-15 00:16:45,2017-12-15 00:46:00,N,1,,,,,1,5.10,21.5,0.5,0.5,4.55,0,,0.3,27.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000400,1,2017-12-15 00:56:23,2017-12-15 00:59:44,N,1,,,,,1,0.80,4.5,0.5,0.5,1.45,0,,0.3,7.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000401,1,2017-12-15 00:02:45,2017-12-15 00:17:46,N,1,,,,,1,2.60,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000402,1,2017-12-15 00:33:58,2017-12-15 01:10:08,N,1,,,,,1,13.80,42.5,0.5,0.5,12,0,,0.3,55.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000403,1,2017-12-15 00:08:20,2017-12-15 00:19:54,N,1,,,,,1,2.20,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000404,1,2017-12-15 00:22:02,2017-12-15 00:24:32,N,1,,,,,1,0.40,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000405,1,2017-12-15 00:49:45,2017-12-15 01:07:55,N,1,,,,,1,4.30,16.5,0.5,0.5,3.55,0,,0.3,21.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000406,1,2017-12-15 00:19:42,2017-12-15 
00:35:10,N,1,,,,,1,7.20,21.5,0.5,0.5,4,0,,0.3,26.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000407,1,2017-12-15 00:36:17,2017-12-15 00:39:28,N,1,,,,,1,0.60,4.5,0.5,0.5,1.2,0,,0.3,7,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000408,1,2017-12-15 00:44:47,2017-12-15 00:47:53,N,1,,,,,1,0.50,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000409,1,2017-12-15 00:06:26,2017-12-15 01:03:38,N,1,,,,,1,12.30,48,0.5,0.5,13.75,5.76,,0.3,68.81,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000410,1,2017-12-15 00:03:14,2017-12-15 00:10:39,N,1,,,,,1,1.30,7,0.5,0.5,1.5,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000411,1,2017-12-15 00:12:39,2017-12-15 00:37:19,N,1,,,,,1,4.20,18,0.5,0.5,2.5,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000412,1,2017-12-15 00:42:49,2017-12-15 00:46:16,N,1,,,,,1,0.80,5,0.5,0.5,1.55,0,,0.3,7.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000413,2,2017-12-14 19:10:23,2017-12-14 19:18:28,N,1,,,,,1,1.02,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000414,2,2017-12-14 19:23:38,2017-12-15 19:06:32,N,5,,,,,3,4.24,60,0,0.5,0,10.5,,0.3,71.3,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000415,2,2017-12-15 00:55:49,2017-12-15 01:11:02,N,1,,,,,2,7.83,23.5,0.5,0.5,0,0,,0.3,24.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000416,2,2017-12-15 00:05:09,2017-12-15 00:20:48,N,1,,,,,2,1.36,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000417,2,2017-12-15 00:29:27,2017-12-15 00:52:13,N,1,,,,,2,3.98,17.5,0.5,0.5,3.76,0,,0.3,22.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000418,2,2017-12-15 00:02:35,2017-12-15 00:21:33,N,1,,,,,1,2.34,13.5,0.5,0.5,2.96,0,,0.3,17.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000419,2,2017-12-15 00:26:40,2017-12-15 
01:10:02,N,1,,,,,1,9.00,34.5,0.5,0.5,2.2,0,,0.3,38,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000420,2,2017-12-15 00:41:13,2017-12-15 00:47:16,N,1,,,,,5,0.95,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000421,2,2017-12-15 00:48:07,2017-12-15 00:51:55,N,1,,,,,5,0.82,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000422,2,2017-12-15 00:54:41,2017-12-15 01:36:33,N,1,,,,,5,8.13,31.5,0.5,0.5,6.56,0,,0.3,39.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000423,1,2017-12-15 00:10:16,2017-12-15 00:30:12,N,1,,,,,3,4.20,17,0.5,0.5,0,0,,0.3,18.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000424,1,2017-12-15 00:09:09,2017-12-15 00:36:56,N,1,,,,,1,4.20,20,0.5,0.5,0.01,0,,0.3,21.31,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000425,1,2017-12-15 00:54:57,2017-12-15 01:17:59,N,1,,,,,1,5.40,19.5,0.5,0.5,4.15,0,,0.3,24.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000426,2,2017-12-15 00:13:43,2017-12-15 00:42:45,N,1,,,,,1,6.63,25.5,0.5,0.5,9.77,5.76,,0.3,42.33,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000427,2,2017-12-15 00:09:34,2017-12-15 00:21:47,N,1,,,,,1,1.91,10,0.5,0.5,0,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000428,2,2017-12-15 00:26:37,2017-12-15 00:54:33,N,1,,,,,1,6.54,25,0.5,0.5,0,5.76,,0.3,32.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000429,1,2017-12-15 00:09:48,2017-12-15 00:19:19,N,1,,,,,1,2.60,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000430,1,2017-12-15 00:32:26,2017-12-15 00:36:48,N,1,,,,,1,0.50,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000431,1,2017-12-15 00:42:32,2017-12-15 01:03:48,N,1,,,,,1,3.90,16.5,0.5,0.5,0,0,,0.3,17.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000432,2,2017-12-14 23:59:37,2017-12-15 
00:04:06,N,1,,,,,1,0.61,5,0.5,0.5,1,0,,0.3,7.3,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000433,2,2017-12-15 00:06:30,2017-12-15 00:32:11,N,1,,,,,1,4.48,18.5,0.5,0.5,2,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000434,2,2017-12-15 00:40:44,2017-12-15 01:00:28,N,1,,,,,1,4.72,16.5,0.5,0.5,4.71,5.76,,0.3,28.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000435,2,2017-12-15 00:10:53,2017-12-15 00:17:52,N,1,,,,,1,1.12,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000436,2,2017-12-15 00:35:36,2017-12-15 00:59:12,N,1,,,,,1,10.99,32.5,0.5,0.5,6.76,0,,0.3,40.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000437,2,2017-12-15 00:43:04,2017-12-15 01:16:19,N,2,,,,,1,17.90,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000438,1,2017-12-15 00:02:42,2017-12-15 00:10:12,N,1,,,,,1,1.30,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000439,1,2017-12-15 00:11:22,2017-12-15 00:40:14,N,1,,,,,1,7.30,25.5,0.5,0.5,6.7,0,,0.3,33.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000440,1,2017-12-15 00:20:08,2017-12-15 00:40:28,N,1,,,,,1,2.40,14,0.5,0.5,3.05,0,,0.3,18.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000441,1,2017-12-15 00:43:20,2017-12-15 01:08:32,N,1,,,,,2,1.30,15.5,0.5,0.5,5,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000442,1,2017-12-15 00:11:32,2017-12-15 00:47:05,N,1,,,,,1,5.10,26.5,0.5,0.5,8.3,0,,0.3,36.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000443,2,2017-12-15 00:09:29,2017-12-15 00:29:03,N,1,,,,,1,5.70,20.5,0.5,0.5,4.36,0,,0.3,26.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000444,2,2017-12-15 00:30:13,2017-12-15 00:37:34,N,1,,,,,1,1.67,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000445,2,2017-12-14 23:58:56,2017-12-15 
00:30:50,N,2,,,,,1,20.04,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000446,2,2017-12-15 00:36:57,2017-12-15 00:45:41,N,1,,,,,1,1.46,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000447,1,2017-12-15 00:04:54,2017-12-15 00:37:27,N,1,,,,,1,4.60,23,0.5,0.5,0,0,,0.3,24.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000448,2,2017-12-15 00:48:43,2017-12-15 00:53:39,N,1,,,,,1,1.30,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000449,2,2017-12-15 00:57:01,2017-12-15 01:01:12,N,1,,,,,1,0.68,5,0.5,0.5,0.63,0,,0.3,6.93,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000450,1,2017-12-15 00:05:18,2017-12-15 00:49:34,N,1,,,,,1,15.40,51,0.5,0.5,5,0,,0.3,57.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000451,2,2017-12-15 00:45:45,2017-12-15 01:07:45,N,1,,,,,1,9.52,28,0.5,0.5,0,5.76,,0.3,35.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000452,2,2017-12-15 00:08:35,2017-12-15 00:56:01,N,1,,,,,5,7.01,31,0.5,0.5,9.69,0,,0.3,41.99,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000453,1,2017-12-15 00:10:24,2017-12-15 00:31:26,N,1,,,,,1,3.20,16,0.5,0.5,0,0,,0.3,17.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000454,2,2017-12-15 00:04:37,2017-12-15 00:53:48,N,1,,,,,1,11.07,40.5,0.5,0.5,5,5.76,,0.3,52.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000455,1,2017-12-15 00:05:10,2017-12-15 00:20:23,N,1,,,,,1,3.10,13,0.5,0.5,1.43,0,,0.3,15.73,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000456,1,2017-12-15 00:21:02,2017-12-15 00:36:30,N,1,,,,,2,2.30,11.5,0.5,0.5,1.92,0,,0.3,14.72,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000457,1,2017-12-15 00:38:38,2017-12-15 01:10:20,N,1,,,,,1,4.90,22.5,0.5,0.5,5.95,0,,0.3,29.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000458,1,2017-12-15 00:03:57,2017-12-15 
00:07:19,N,1,,,,,2,1.10,5.5,0.5,0.5,1,0,,0.3,7.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000459,1,2017-12-15 00:26:00,2017-12-15 00:34:10,N,1,,,,,1,1.30,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000460,1,2017-12-15 00:36:16,2017-12-15 00:49:08,N,1,,,,,1,2.60,11,0.5,0.5,2.45,0,,0.3,14.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000461,1,2017-12-15 00:53:25,2017-12-15 01:00:12,N,1,,,,,1,1.90,8,0.5,0.5,2.3,0,,0.3,11.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000462,2,2017-12-15 00:15:12,2017-12-15 00:26:54,N,1,,,,,1,1.17,8.5,0.5,0.5,2.94,0,,0.3,12.74,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000463,2,2017-12-15 00:47:47,2017-12-15 00:59:33,N,1,,,,,1,1.40,9,0.5,0.5,2.06,0,,0.3,12.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000464,2,2017-12-15 00:08:58,2017-12-15 00:43:22,N,1,,,,,5,6.10,25.5,0.5,0.5,8.04,0,,0.3,34.84,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000465,2,2017-12-15 00:48:26,2017-12-15 01:01:21,N,1,,,,,5,2.41,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000466,1,2017-12-15 00:42:54,2017-12-15 00:46:52,N,1,,,,,1,0.70,4.5,0.5,0.5,1.15,0,,0.3,6.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000467,1,2017-12-15 00:01:07,2017-12-15 00:05:50,N,1,,,,,1,0.90,5,0.5,0.5,0,0,,0.3,6.3,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000468,1,2017-12-15 00:09:05,2017-12-15 00:26:40,N,1,,,,,1,3.60,16,0.5,0.5,1,0,,0.3,18.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000469,1,2017-12-15 00:59:32,2017-12-15 01:16:09,N,1,,,,,1,7.70,23,0.5,0.5,7.5,5.76,,0.3,37.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000470,1,2017-12-15 00:15:12,2017-12-15 00:36:25,N,1,,,,,1,1.40,13.5,0.5,0.5,2.95,0,,0.3,17.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000471,1,2017-12-15 00:39:37,2017-12-15 
01:19:53,N,1,,,,,1,8.00,31,0.5,0.5,8.05,0,,0.3,40.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000472,1,2017-12-15 00:04:23,2017-12-15 00:37:18,N,1,,,,,1,6.20,25,0.5,0.5,5.25,0,,0.3,31.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000473,1,2017-12-15 00:53:13,2017-12-15 00:59:08,N,1,,,,,1,1.20,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000474,1,2017-12-15 00:14:58,2017-12-15 00:43:09,N,1,,,,,1,4.20,19.5,0.5,0.5,4.16,0,,0.3,24.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000475,1,2017-12-15 00:54:26,2017-12-15 01:01:22,N,1,,,,,1,1.10,6.5,0.5,0.5,1,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000476,2,2017-12-15 00:03:41,2017-12-15 00:33:29,N,1,,,,,1,6.35,25,0.5,0.5,5.26,0,,0.3,31.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000477,2,2017-12-15 00:55:24,2017-12-15 01:29:13,N,1,,,,,1,8.48,30,0.5,0.5,5,0,,0.3,36.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000478,2,2017-12-15 00:13:25,2017-12-15 00:43:14,N,2,,,,,2,17.12,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000479,1,2017-12-15 00:10:35,2017-12-15 00:21:11,N,1,,,,,2,1.10,8.5,0.5,0.5,1.95,0,,0.3,11.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000480,1,2017-12-15 00:22:05,2017-12-15 00:25:54,N,1,,,,,1,0.60,4.5,0.5,0.5,1.15,0,,0.3,6.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000481,1,2017-12-15 00:27:36,2017-12-15 00:40:36,N,1,,,,,1,1.00,9.5,0.5,0.5,2.15,0,,0.3,12.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000482,1,2017-12-15 00:41:44,2017-12-15 00:47:23,N,1,,,,,1,1.20,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000483,1,2017-12-15 00:49:04,2017-12-15 01:03:47,N,1,,,,,1,2.10,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000484,1,2017-12-15 00:05:27,2017-12-15 
00:29:12,N,1,,,,,0,10.50,31.5,0.5,0.5,9,5.76,,0.3,47.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000485,1,2017-12-15 00:35:54,2017-12-15 00:40:31,N,1,,,,,2,0.70,5.5,0.5,0.5,1.35,0,,0.3,8.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000486,1,2017-12-15 00:43:11,2017-12-15 00:53:22,N,1,,,,,2,1.00,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000487,2,2017-12-15 00:31:19,2017-12-15 00:49:31,N,1,,,,,1,6.31,20.5,0.5,0.5,4.36,0,,0.3,26.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000488,2,2017-12-15 00:19:14,2017-12-15 00:29:43,N,1,,,,,1,1.05,8,0.5,0.5,1.5,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000489,2,2017-12-15 00:32:35,2017-12-15 00:44:57,N,1,,,,,1,2.42,10.5,0.5,0.5,1,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000490,2,2017-12-15 00:48:40,2017-12-15 00:57:57,N,1,,,,,1,1.22,8,0.5,0.5,2.32,0,,0.3,11.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000491,2,2017-12-15 00:38:41,2017-12-15 01:21:39,N,1,,,,,2,17.74,52,0.5,0.5,11.81,5.76,,0.3,70.87,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000492,2,2017-12-15 00:08:26,2017-12-15 00:19:44,N,1,,,,,1,1.66,8.5,0.5,0.5,2.5,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000493,2,2017-12-15 00:22:53,2017-12-15 00:36:57,N,1,,,,,1,1.74,10,0.5,0.5,1,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000494,2,2017-12-15 01:00:01,2017-12-15 01:05:22,N,1,,,,,1,0.96,5.5,0.5,0.5,1.7,0,,0.3,8.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000495,2,2017-12-15 00:02:53,2017-12-15 00:09:38,N,1,,,,,1,0.65,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000496,2,2017-12-15 00:11:31,2017-12-15 00:12:25,N,5,,,,,1,0.00,15,0,0.5,40,0,,0.3,55.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000497,2,2017-12-15 00:43:55,2017-12-15 
00:56:49,N,1,,,,,1,1.94,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000498,1,2017-12-15 00:25:49,2017-12-15 00:35:05,N,1,,,,,1,1.10,7.5,0.5,0.5,1.75,0,,0.3,10.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000499,1,2017-12-15 00:52:06,2017-12-15 00:57:28,N,1,,,,,2,0.60,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000500,1,2017-12-15 00:34:18,2017-12-15 01:21:09,N,1,,,,,1,6.60,30,0.5,0.5,9.35,0,,0.3,40.65,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000501,2,2017-12-15 00:07:37,2017-12-15 00:59:00,N,1,,,,,1,10.17,42,0.5,0.5,0,0,,0.3,43.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000502,1,2017-12-15 00:07:42,2017-12-15 00:30:07,N,1,,,,,2,2.90,16,0.5,0.5,0,0,,0.3,17.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000503,1,2017-12-15 00:30:45,2017-12-15 00:38:40,N,1,,,,,1,1.30,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000504,1,2017-12-15 00:42:23,2017-12-15 00:55:29,N,1,,,,,2,1.80,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000505,1,2017-12-15 00:57:04,2017-12-15 00:57:04,N,1,,,,,1,0.00,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000506,1,2017-12-15 00:59:48,2017-12-15 01:21:26,N,1,,,,,1,4.50,17.5,0.5,0.5,3.75,0,,0.3,22.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000507,2,2017-12-15 00:20:44,2017-12-15 00:30:11,N,1,,,,,2,0.78,7.5,0.5,0.5,0,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000508,2,2017-12-15 00:37:52,2017-12-15 00:52:16,N,1,,,,,2,1.16,10.5,0.5,0.5,1.5,0,,0.3,13.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000509,2,2017-12-15 00:58:26,2017-12-15 01:04:02,N,1,,,,,2,1.00,5.5,0.5,0.5,1.36,0,,0.3,8.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000510,1,2017-12-15 00:05:09,2017-12-15 
00:25:20,N,1,,,,,1,2.60,14,0.5,0.5,0,0,,0.3,15.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000511,1,2017-12-15 00:29:09,2017-12-15 00:36:48,N,1,,,,,1,1.40,7.5,0.5,0.5,2.2,0,,0.3,11,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000512,1,2017-12-15 00:38:25,2017-12-15 00:51:42,N,1,,,,,1,1.50,10,0.5,0.5,3.35,0,,0.3,14.65,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000513,1,2017-12-15 00:23:44,2017-12-15 00:49:22,N,1,,,,,1,4.40,20.5,0.5,0.5,4.35,0,,0.3,26.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000514,1,2017-12-15 00:09:29,2017-12-15 00:14:30,N,1,,,,,1,0.80,5.5,0.5,0.5,1.7,0,,0.3,8.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000515,1,2017-12-15 00:16:07,2017-12-15 00:35:34,N,1,,,,,1,4.60,16.5,0.5,0.5,0,0,,0.3,17.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000516,1,2017-12-15 00:58:07,2017-12-15 01:05:15,N,1,,,,,2,1.40,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000517,2,2017-12-15 00:25:15,2017-12-15 00:34:38,N,1,,,,,1,1.21,8,0.5,0.5,2.79,0,,0.3,12.09,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000518,2,2017-12-15 00:40:54,2017-12-15 00:52:56,N,1,,,,,1,0.86,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000519,2,2017-12-15 00:54:22,2017-12-15 01:00:26,N,1,,,,,1,0.89,5.5,0.5,0.5,1,0,,0.3,7.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000520,1,2017-12-15 00:07:03,2017-12-15 00:17:51,N,1,,,,,1,1.00,8,0.5,0.5,1,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000521,1,2017-12-15 00:27:43,2017-12-15 01:02:24,N,1,,,,,1,6.80,27,0.5,0.5,0,0,,0.3,28.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000522,2,2017-12-15 00:14:12,2017-12-15 00:57:30,N,4,,,,,1,33.89,158,0.5,0.5,10,0,,0.3,169.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000523,2,2017-12-15 00:01:06,2017-12-15 
00:16:46,N,1,,,,,1,3.00,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000524,2,2017-12-15 00:30:25,2017-12-15 00:41:04,N,1,,,,,1,3.33,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000525,2,2017-12-15 00:42:14,2017-12-15 00:52:16,N,1,,,,,1,4.32,14,0.5,0.5,3.06,0,,0.3,18.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000526,1,2017-12-15 00:44:17,2017-12-15 00:53:15,N,1,,,,,1,2.10,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000527,1,2017-12-15 00:58:36,2017-12-15 01:17:56,N,1,,,,,2,5.00,17.5,0.5,0.5,3.75,0,,0.3,22.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000528,2,2017-12-15 00:14:35,2017-12-15 00:18:59,N,1,,,,,1,0.79,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000529,2,2017-12-15 00:26:09,2017-12-15 01:04:29,N,1,,,,,1,11.54,37.5,0.5,0.5,7.76,0,,0.3,48.51,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000530,2,2017-12-15 00:14:54,2017-12-15 00:31:45,N,1,,,,,1,2.37,12.5,0.5,0.5,2.07,0,,0.3,15.87,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000531,2,2017-12-15 00:33:52,2017-12-15 00:40:05,N,1,,,,,2,1.43,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000532,2,2017-12-15 00:49:02,2017-12-15 00:52:27,N,1,,,,,2,1.01,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000533,2,2017-12-15 00:21:05,2017-12-15 00:32:58,N,1,,,,,2,1.94,10,0.5,0.5,2.82,0,,0.3,14.12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000534,2,2017-12-15 00:36:08,2017-12-15 00:44:27,N,1,,,,,2,1.33,7.5,0.5,0.5,1.2,0,,0.3,10,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000535,2,2017-12-15 00:49:05,2017-12-15 01:22:21,N,1,,,,,2,6.61,25.5,0.5,0.5,5.36,0,,0.3,32.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000536,2,2017-12-15 00:43:23,2017-12-15 
01:16:38,N,1,,,,,1,7.28,27,0.5,0.5,5,0,,0.3,33.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000537,1,2017-12-15 00:08:26,2017-12-15 00:11:22,N,1,,,,,2,0.40,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000538,1,2017-12-15 00:13:29,2017-12-15 00:46:41,N,1,,,,,1,6.30,25,0.5,0.5,5.25,0,,0.3,31.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000539,2,2017-12-15 00:11:39,2017-12-15 00:11:42,N,5,,,,,1,0.00,125,0,0,40,0,,0.3,165.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000540,2,2017-12-15 00:56:03,2017-12-15 01:00:44,N,1,,,,,1,0.97,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000541,1,2017-12-15 00:42:52,2017-12-15 01:02:34,N,1,,,,,1,13.00,36,0.5,0.5,6,0,,0.3,43.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000542,1,2017-12-15 00:03:04,2017-12-15 00:52:32,N,1,,,,,1,6.60,32,0.5,0.5,6.65,0,,0.3,39.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000543,1,2017-12-15 00:13:41,2017-12-15 00:28:38,N,1,,,,,1,4.20,14.5,0.5,0.5,3.5,0,,0.3,19.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000544,1,2017-12-15 00:33:27,2017-12-15 00:40:04,N,1,,,,,1,1.70,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000545,1,2017-12-15 00:47:29,2017-12-15 00:58:09,N,1,,,,,1,2.20,10,0.5,0.5,2.8,0,,0.3,14.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000546,1,2017-12-15 00:05:19,2017-12-15 00:38:17,N,1,,,,,1,8.20,30,0.5,0.5,6.25,0,,0.3,37.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000547,2,2017-12-15 00:53:06,2017-12-15 01:14:50,N,1,,,,,4,12.29,35,0.5,0.5,0,0,,0.3,36.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000548,1,2017-12-15 00:02:50,2017-12-15 00:07:18,N,1,,,,,1,0.60,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000549,1,2017-12-15 00:12:02,2017-12-15 
00:14:27,N,1,,,,,1,0.40,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000550,1,2017-12-15 00:15:16,2017-12-15 00:21:59,N,1,,,,,1,0.90,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000551,1,2017-12-15 00:24:38,2017-12-15 00:46:00,N,1,,,,,1,4.00,17,0.5,0.5,1,0,,0.3,19.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000552,1,2017-12-15 00:53:46,2017-12-15 00:59:26,N,1,,,,,1,1.20,6.5,0.5,0.5,2,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000553,1,2017-12-15 00:00:57,2017-12-15 00:15:13,N,1,,,,,2,2.60,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000554,1,2017-12-15 00:19:11,2017-12-15 01:02:12,N,1,,,,,2,6.70,30,0.5,0.5,7.8,0,,0.3,39.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000555,2,2017-12-15 00:35:27,2017-12-15 00:54:46,N,1,,,,,1,2.59,13.5,0.5,0.5,2.96,0,,0.3,17.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000556,2,2017-12-15 00:06:54,2017-12-15 00:37:31,N,1,,,,,1,9.21,31,0.5,0.5,8.08,0,,0.3,40.38,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000557,2,2017-12-15 00:38:32,2017-12-15 00:53:06,N,1,,,,,1,4.34,14.5,0.5,0.5,3.95,0,,0.3,19.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000558,2,2017-12-15 00:56:49,2017-12-15 01:03:38,N,1,,,,,1,1.81,7.5,0.5,0.5,0,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000559,2,2017-12-15 00:03:50,2017-12-15 00:13:29,N,1,,,,,1,2.34,9.5,0.5,0.5,2.16,0,,0.3,12.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000560,2,2017-12-15 00:28:29,2017-12-15 00:35:00,N,1,,,,,1,1.32,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000561,2,2017-12-15 00:50:11,2017-12-15 01:05:29,N,1,,,,,1,3.22,13.5,0.5,0.5,4.44,0,,0.3,19.24,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000562,1,2017-12-15 00:02:47,2017-12-15 
00:41:54,N,1,,,,,1,7.80,31.5,0.5,0.5,6.55,0,,0.3,39.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000563,2,2017-12-15 00:05:55,2017-12-15 00:55:30,N,1,,,,,1,14.85,47,0.5,0.5,0,0,,0.3,48.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000564,2,2017-12-15 00:57:55,2017-12-15 00:58:01,N,5,,,,,1,0.00,10,0,0.5,0,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000565,2,2017-12-15 00:13:16,2017-12-15 00:22:13,N,1,,,,,2,1.57,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000566,2,2017-12-15 00:22:47,2017-12-15 00:24:31,N,1,,,,,1,0.43,3.5,0.5,0.5,0.72,0,,0.3,5.52,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000567,2,2017-12-15 00:32:29,2017-12-15 00:35:46,N,1,,,,,2,0.72,4.5,0.5,0.5,1.45,0,,0.3,7.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000568,2,2017-12-15 00:36:36,2017-12-15 00:52:02,N,1,,,,,2,2.10,11.5,0.5,0.5,2.56,0,,0.3,15.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000569,2,2017-12-15 00:54:25,2017-12-15 00:58:58,N,1,,,,,2,0.00,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000570,1,2017-12-15 00:10:19,2017-12-15 00:42:43,N,1,,,,,1,5.40,22.5,0.5,0.5,0,0,,0.3,23.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000571,2,2017-12-15 00:22:17,2017-12-15 00:54:37,N,1,,,,,1,5.00,24,0.5,0.5,4,0,,0.3,29.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000572,2,2017-12-14 23:58:52,2017-12-15 00:49:35,N,1,,,,,1,11.20,41,0.5,0.5,7.21,5.76,,0.3,55.27,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000573,1,2017-12-15 00:36:07,2017-12-15 01:03:18,Y,1,,,,,1,5.00,20,0.5,0.5,4.25,0,,0.3,25.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000574,1,2017-12-15 00:15:45,2017-12-15 00:50:45,N,1,,,,,1,9.20,32,0.5,0.5,0,5.76,,0.3,39.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000575,1,2017-12-15 00:22:12,2017-12-15 
00:38:30,N,1,,,,,1,2.20,12.5,0.5,0.5,0,0,,0.3,13.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000576,1,2017-12-15 00:41:46,2017-12-15 01:01:28,Y,1,,,,,1,3.70,15.5,0.5,0.5,3.35,0,,0.3,20.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000577,1,2017-12-15 00:25:24,2017-12-15 01:02:48,N,1,,,,,1,6.00,27.5,0.5,0.5,5.75,0,,0.3,34.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000578,2,2017-12-15 00:07:55,2017-12-15 00:12:34,N,1,,,,,1,0.96,5.5,0.5,0.5,0,0,,0.3,6.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000579,2,2017-12-15 00:13:36,2017-12-15 00:45:06,N,1,,,,,1,4.72,22,0.5,0.5,0,0,,0.3,23.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000580,2,2017-12-15 00:56:28,2017-12-15 01:00:43,N,1,,,,,1,0.79,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000581,2,2017-12-15 00:08:58,2017-12-15 00:16:04,N,1,,,,,1,0.73,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000582,2,2017-12-15 00:17:18,2017-12-15 00:50:54,N,1,,,,,1,3.66,22.5,0.5,0.5,4.76,0,,0.3,28.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000583,2,2017-12-15 00:49:08,2017-12-15 01:09:20,N,1,,,,,1,4.16,17,0.5,0.5,3.66,0,,0.3,21.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000584,2,2017-12-15 00:50:52,2017-12-15 01:03:54,N,1,,,,,1,2.09,10,0.5,0.5,2.82,0,,0.3,14.12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000585,2,2017-12-15 00:31:15,2017-12-15 00:35:58,N,1,,,,,1,1.11,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000586,2,2017-12-15 00:51:58,2017-12-15 01:15:57,N,1,,,,,1,2.79,16,0.5,0.5,0,5.76,,0.3,23.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000587,2,2017-12-15 00:03:58,2017-12-15 00:14:13,N,1,,,,,1,1.68,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000588,2,2017-12-15 00:16:35,2017-12-15 
01:25:44,N,1,,,,,1,15.30,59,0.5,0.5,12.06,0,,0.3,72.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000589,2,2017-12-15 00:06:16,2017-12-15 00:20:06,N,1,,,,,1,2.16,11,0.5,0.5,0,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000590,2,2017-12-15 00:23:57,2017-12-15 00:34:07,N,1,,,,,1,1.73,8.5,0.5,0.5,2.45,0,,0.3,12.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000591,2,2017-12-15 00:35:00,2017-12-15 00:41:32,N,1,,,,,1,1.44,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000592,2,2017-12-15 00:42:57,2017-12-15 01:17:17,N,1,,,,,1,8.22,28.5,0.5,0.5,7.45,0,,0.3,37.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000593,2,2017-12-15 00:11:18,2017-12-15 00:41:56,N,1,,,,,1,4.03,20.5,0.5,0.5,3.5,0,,0.3,25.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000594,2,2017-12-15 00:50:15,2017-12-15 01:05:16,N,1,,,,,1,2.87,12.5,0.5,0.5,0,0,,0.3,13.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000595,2,2017-12-15 00:44:35,2017-12-15 01:36:39,N,1,,,,,1,13.58,47,0.5,0.5,9,0,,0.3,57.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000596,2,2017-12-15 00:10:26,2017-12-15 01:14:11,N,2,,,,,5,22.01,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000597,1,2017-12-15 00:20:22,2017-12-15 00:59:40,N,2,,,,,2,18.00,52,0,0.5,0,0,,0.3,52.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000598,1,2017-12-15 00:11:10,2017-12-15 00:22:38,N,1,,,,,2,2.00,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000599,1,2017-12-15 00:25:37,2017-12-15 00:41:15,N,1,,,,,3,2.60,12.5,0.5,0.5,0,0,,0.3,13.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000600,2,2017-12-15 00:09:00,2017-12-15 00:36:33,N,1,,,,,2,2.91,18.5,0.5,0.5,0,0,,0.3,19.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000601,2,2017-12-15 00:53:19,2017-12-15 
01:19:39,N,1,,,,,2,7.12,24,0.5,0.5,6.32,0,,0.3,31.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000602,2,2017-12-15 00:29:14,2017-12-15 00:38:05,N,1,,,,,1,1.50,8,0.5,0.5,2.32,0,,0.3,11.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000603,2,2017-12-15 00:39:24,2017-12-15 00:57:43,N,1,,,,,1,3.18,14,0.5,0.5,1,0,,0.3,16.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000604,2,2017-12-15 00:22:31,2017-12-15 00:29:31,N,1,,,,,3,1.44,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000605,2,2017-12-15 00:48:10,2017-12-15 01:20:50,N,1,,,,,3,6.85,25.5,0.5,0.5,0,0,,0.3,26.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000606,1,2017-12-15 00:17:11,2017-12-15 00:24:41,N,1,,,,,1,1.30,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000607,1,2017-12-15 00:26:31,2017-12-15 00:36:40,N,1,,,,,1,1.40,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000608,1,2017-12-15 00:43:07,2017-12-15 01:24:10,N,1,,,,,1,10.60,37.5,0.5,0.5,2,0,,0.3,40.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000609,2,2017-12-15 00:22:39,2017-12-15 00:43:55,N,2,,,,,1,8.03,52,0,0.5,2,5.76,,0.3,60.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000610,1,2017-12-15 00:02:34,2017-12-15 00:11:48,N,1,,,,,1,4.90,15.5,0.5,0.5,3,0,,0.3,19.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000611,1,2017-12-15 00:29:52,2017-12-15 00:32:26,N,1,,,,,1,0.30,3.5,0.5,0.5,1.4,0,,0.3,6.2,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000612,1,2017-12-15 00:37:00,2017-12-15 00:40:47,N,1,,,,,1,0.70,5,0.5,0.5,1,0,,0.3,7.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000613,1,2017-12-15 00:23:17,2017-12-15 00:56:40,N,1,,,,,1,3.90,21.5,0.5,0.5,4.55,0,,0.3,27.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000614,2,2017-12-15 00:45:58,2017-12-15 
01:16:25,N,1,,,,,1,15.59,44.5,0.5,0.5,0,5.76,,0.3,51.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000615,2,2017-12-15 00:10:34,2017-12-15 00:11:56,N,1,,,,,1,0.08,3,0.5,0.5,0,0,,0.3,4.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000616,2,2017-12-15 00:17:20,2017-12-15 00:36:29,N,1,,,,,1,2.99,14,0.5,0.5,0,0,,0.3,15.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000617,2,2017-12-15 00:10:12,2017-12-15 00:30:28,N,1,,,,,1,2.17,14,0.5,0.5,3.06,0,,0.3,18.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000618,2,2017-12-15 00:35:21,2017-12-15 00:55:02,N,1,,,,,1,6.77,22.5,0.5,0.5,4.76,0,,0.3,28.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000619,1,2017-12-15 00:46:25,2017-12-15 01:09:39,N,1,,,,,1,5.70,20.5,0.5,0.5,0,0,,0.3,21.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000620,1,2017-12-15 00:04:11,2017-12-15 00:18:22,N,1,,,,,1,3.10,12,0.5,0.5,2,0,,0.3,15.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000621,1,2017-12-15 00:31:24,2017-12-15 00:42:28,N,1,,,,,1,0.60,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000622,1,2017-12-15 00:44:08,2017-12-15 00:54:43,N,1,,,,,1,1.60,9,0.5,0.5,3.05,0,,0.3,13.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000623,2,2017-12-15 00:19:42,2017-12-15 00:47:45,N,1,,,,,1,13.76,39,0.5,0.5,0,5.76,,0.3,46.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000624,1,2017-12-15 00:52:27,2017-12-15 01:08:45,N,1,,,,,1,7.70,23,0.5,0.5,0,0,,0.3,24.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000625,1,2017-12-15 00:19:22,2017-12-15 00:34:50,N,1,,,,,1,3.10,14,0.5,0.5,2,0,,0.3,17.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000626,1,2017-12-15 00:44:21,2017-12-15 01:23:25,N,1,,,,,1,10.70,35,0.5,0.5,7,0,,0.3,43.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000627,1,2017-12-15 00:50:13,2017-12-15 
01:15:53,N,1,,,,,1,16.10,44,0.5,0.5,11.3,0,,0.3,56.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000628,2,2017-12-14 23:58:56,2017-12-15 00:03:51,N,1,,,,,3,1.18,6,0.5,0.5,1.02,0,,0.3,8.32,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000629,2,2017-12-15 00:05:10,2017-12-15 00:41:52,N,1,,,,,4,9.78,33.5,0.5,0.5,2,5.76,,0.3,42.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000630,1,2017-12-15 00:02:37,2017-12-15 00:09:45,N,1,,,,,1,1.30,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000631,1,2017-12-15 00:16:03,2017-12-15 00:22:28,N,1,,,,,1,0.50,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000632,1,2017-12-15 00:24:13,2017-12-15 00:37:26,N,1,,,,,2,1.90,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000633,1,2017-12-15 00:48:50,2017-12-15 00:52:32,N,1,,,,,1,0.50,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000634,2,2017-12-15 00:10:46,2017-12-15 00:39:54,N,1,,,,,2,4.28,21,0.5,0.5,4.46,0,,0.3,26.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000635,1,2017-12-15 00:03:20,2017-12-15 00:09:54,N,1,,,,,5,1.00,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000636,1,2017-12-15 00:11:44,2017-12-15 00:13:57,N,1,,,,,2,0.50,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000637,1,2017-12-15 00:14:52,2017-12-15 00:31:37,N,1,,,,,1,3.60,14.5,0.5,0.5,3.15,0,,0.3,18.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000638,1,2017-12-15 00:28:49,2017-12-15 00:35:36,N,1,,,,,1,1.30,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000639,1,2017-12-15 00:46:13,2017-12-15 01:11:12,N,1,,,,,1,4.90,19,0.5,0.5,0,0,,0.3,20.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000640,1,2017-12-15 00:52:19,2017-12-15 
01:28:24,N,1,,,,,1,6.60,29,0.5,0.5,6.05,0,,0.3,36.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000641,1,2017-12-15 00:09:02,2017-12-15 00:15:32,N,1,,,,,2,1.30,7,0.5,0.5,0,0,,0.3,8.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000642,1,2017-12-15 00:32:12,2017-12-15 00:38:02,N,1,,,,,1,1.30,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000643,1,2017-12-15 00:57:40,2017-12-15 01:05:08,N,1,,,,,1,1.40,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000644,2,2017-12-15 00:07:59,2017-12-15 00:37:06,N,1,,,,,1,3.73,19,0.5,0.5,1,0,,0.3,21.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000645,2,2017-12-15 00:49:50,2017-12-15 00:56:39,N,1,,,,,1,1.95,8.5,0.5,0.5,0,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000646,2,2017-12-15 00:01:20,2017-12-15 00:25:50,N,1,,,,,5,5.33,20.5,0.5,0.5,4.36,0,,0.3,26.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000647,2,2017-12-15 00:28:14,2017-12-15 00:29:15,N,1,,,,,5,0.03,3,0.5,0.5,0,0,,0.3,4.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000648,2,2017-12-15 00:39:06,2017-12-15 00:53:21,N,1,,,,,5,1.21,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000649,2,2017-12-15 00:54:22,2017-12-15 01:20:16,N,1,,,,,5,4.22,18.5,0.5,0.5,4.95,0,,0.3,24.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000650,2,2017-12-15 00:21:40,2017-12-15 00:42:49,N,1,,,,,2,3.51,15.5,0.5,0.5,0,0,,0.3,16.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000651,2,2017-12-15 01:00:21,2017-12-15 01:20:15,N,1,,,,,2,8.83,26.5,0.5,0.5,0,5.76,,0.3,33.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000652,1,2017-12-15 00:51:00,2017-12-15 01:03:07,N,1,,,,,1,2.10,10,0.5,0.5,2.3,0,,0.3,13.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000653,1,2017-12-15 00:00:05,2017-12-15 
00:14:23,N,1,,,,,1,2.60,12,0.5,0.5,3.95,0,,0.3,17.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000654,1,2017-12-15 00:26:10,2017-12-15 00:30:41,N,1,,,,,1,0.70,5,0.5,0.5,0.95,0,,0.3,7.25,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000655,1,2017-12-15 00:32:06,2017-12-15 00:42:31,N,1,,,,,1,1.70,9,0.5,0.5,0,0,,0.3,10.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000656,1,2017-12-15 00:43:47,2017-12-15 01:04:09,N,1,,,,,1,3.90,15.5,0.5,0.5,0,0,,0.3,16.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000657,2,2017-12-15 00:00:11,2017-12-15 00:29:57,N,1,,,,,3,5.36,24,0.5,0.5,7.59,0,,0.3,32.89,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000658,2,2017-12-15 00:55:15,2017-12-15 01:12:55,N,1,,,,,4,2.32,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000659,2,2017-12-15 00:05:39,2017-12-15 00:32:38,N,1,,,,,1,3.69,19,0.5,0.5,6.09,0,,0.3,26.39,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000660,2,2017-12-15 00:36:55,2017-12-15 01:15:17,N,1,,,,,1,10.12,36,0.5,0.5,7.46,0,,0.3,44.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000661,2,2017-12-15 00:11:13,2017-12-15 00:26:46,N,1,,,,,3,4.16,15.5,0.5,0.5,3.36,0,,0.3,20.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000662,2,2017-12-15 00:29:43,2017-12-15 01:09:14,N,1,,,,,3,4.33,25.5,0.5,0.5,3,0,,0.3,29.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000663,2,2017-12-15 00:04:03,2017-12-15 00:18:22,N,1,,,,,2,1.95,11,0.5,0.5,0,0,,0.3,12.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000664,2,2017-12-15 00:23:19,2017-12-15 00:25:03,N,1,,,,,2,0.55,3.5,0.5,0.5,1.2,0,,0.3,6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000665,2,2017-12-15 00:28:30,2017-12-15 00:48:26,N,1,,,,,2,3.86,16.5,0.5,0.5,2,0,,0.3,19.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000666,2,2017-12-15 00:54:59,2017-12-16 
00:48:26,N,1,,,,,2,6.51,27,0.5,0.5,0,0,,0.3,28.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000667,1,2017-12-15 00:00:19,2017-12-15 00:07:29,N,1,,,,,1,1.30,6.5,0.5,0.5,0.5,0,,0.3,8.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000668,1,2017-12-15 00:14:16,2017-12-15 00:24:38,N,1,,,,,1,1.70,8.5,0.5,0.5,1.95,0,,0.3,11.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000669,1,2017-12-15 00:28:03,2017-12-15 00:48:26,N,1,,,,,1,3.70,15.5,0.5,0.5,1.5,0,,0.3,18.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000670,2,2017-12-15 00:04:25,2017-12-15 00:10:06,N,1,,,,,1,0.92,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000671,2,2017-12-15 00:11:22,2017-12-15 00:36:07,N,1,,,,,1,4.65,20,0.5,0.5,5.32,0,,0.3,26.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000672,2,2017-12-15 00:49:56,2017-12-15 00:56:19,N,1,,,,,1,1.85,7.5,0.5,0.5,1.76,0,,0.3,10.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000673,1,2017-12-15 00:54:32,2017-12-15 01:06:37,N,1,,,,,2,4.40,15,0.5,0.5,0,0,,0.3,16.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000674,2,2017-12-15 00:10:56,2017-12-15 00:49:42,N,2,,,,,1,19.22,52,0,0.5,0,5.76,,0.3,58.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000675,2,2017-12-15 00:53:14,2017-12-15 01:10:10,N,1,,,,,1,2.44,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000676,1,2017-12-15 00:24:30,2017-12-15 00:27:49,N,1,,,,,1,0.80,5,0.5,0.5,1,0,,0.3,7.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000677,1,2017-12-15 00:35:28,2017-12-15 00:43:21,N,1,,,,,1,1.20,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000678,1,2017-12-15 00:45:35,2017-12-15 00:55:48,N,1,,,,,1,1.60,8.5,0.5,0.5,2.9,0,,0.3,12.7,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000679,2,2017-12-15 00:07:29,2017-12-15 
00:38:34,N,1,,,,,1,13.41,39.5,0.5,0.5,9.31,5.76,,0.3,55.87,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000680,2,2017-12-15 00:48:16,2017-12-15 01:03:17,N,1,,,,,1,3.08,13.5,0.5,0.5,0,0,,0.3,14.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000681,1,2017-12-15 00:22:27,2017-12-15 00:38:33,N,1,,,,,1,1.20,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000682,1,2017-12-15 00:41:29,2017-12-15 00:46:11,N,1,,,,,1,0.80,5.5,0.5,0.5,12.3,0,,0.3,19.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000683,1,2017-12-15 00:57:43,2017-12-15 01:22:11,N,1,,,,,1,6.00,21.5,0.5,0.5,0,0,,0.3,22.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000684,2,2017-12-15 00:36:07,2017-12-15 01:08:34,N,1,,,,,1,5.41,22.5,0.5,0.5,0,0,,0.3,23.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000685,2,2017-12-15 00:32:43,2017-12-15 00:38:26,N,1,,,,,1,1.00,6.5,0.5,0.5,1,0,,0.3,8.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000686,2,2017-12-15 00:57:05,2017-12-15 01:05:25,N,1,,,,,1,1.67,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000687,2,2017-12-15 00:19:35,2017-12-15 00:31:13,N,1,,,,,1,2.63,11,0.5,0.5,0,0,,0.3,12.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000688,1,2017-12-15 00:04:18,2017-12-15 00:12:26,N,1,,,,,1,0.90,7,0.5,0.5,1.65,0,,0.3,9.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000689,1,2017-12-15 00:14:22,2017-12-15 00:59:18,N,1,,,,,1,6.60,29.5,0.5,0.5,6.15,0,,0.3,36.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000690,1,2017-12-15 00:00:52,2017-12-15 00:49:27,N,1,,,,,1,8.70,40,0.5,0.5,0,5.76,,0.3,47.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000691,1,2017-12-15 00:17:13,2017-12-15 00:37:07,N,1,,,,,1,3.10,14.5,0.5,0.5,3.15,0,,0.3,18.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000692,1,2017-12-15 00:38:56,2017-12-15 
00:49:41,N,1,,,,,1,1.10,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000693,1,2017-12-15 00:52:01,2017-12-15 01:17:31,N,1,,,,,1,4.70,19.5,0.5,0.5,4,0,,0.3,24.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000694,2,2017-12-15 00:03:29,2017-12-15 00:03:31,N,5,,,,,1,0.06,75,0,0,15.06,0,,0.3,90.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000873,1,2017-12-15 00:48:42,2017-12-15 00:51:50,Y,1,,,,,1,0.40,4,0.5,0.5,0.79,0,,0.3,6.09,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000695,2,2017-12-15 00:22:23,2017-12-15 00:29:43,N,1,,,,,2,1.23,6.5,0.5,0.5,2.34,0,,0.3,10.14,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000696,2,2017-12-15 00:44:53,2017-12-15 00:48:01,N,1,,,,,2,0.38,4,0.5,0.5,1,0,,0.3,6.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000697,1,2017-12-15 00:19:26,2017-12-15 00:28:01,N,1,,,,,1,1.40,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000698,1,2017-12-15 00:39:46,2017-12-15 01:14:14,N,1,,,,,1,7.80,28,0.5,0.5,3,0,,0.3,32.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000699,1,2017-12-15 00:16:44,2017-12-15 00:45:52,N,2,,,,,3,17.30,52,0,0.5,0,5.76,,0.3,58.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000700,2,2017-12-15 00:02:11,2017-12-15 00:29:09,N,1,,,,,1,6.58,24,0.5,0.5,5.06,0,,0.3,30.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000701,2,2017-12-15 00:30:14,2017-12-15 00:46:04,N,1,,,,,1,2.80,12.5,0.5,0.5,0,0,,0.3,13.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000702,1,2017-12-15 00:10:46,2017-12-15 00:40:01,N,1,,,,,1,9.90,31,0.5,0.5,7.6,5.76,,0.3,45.66,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000703,1,2017-12-15 00:48:41,2017-12-15 01:06:29,N,1,,,,,1,3.50,15,0.5,0.5,4.05,0,,0.3,20.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000704,2,2017-12-15 00:22:48,2017-12-15 
00:42:03,N,1,,,,,1,3.05,15,0.5,0.5,2.44,0,,0.3,18.74,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000705,2,2017-12-15 00:46:30,2017-12-15 01:13:28,N,1,,,,,1,3.78,19,0.5,0.5,5.08,0,,0.3,25.38,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000706,2,2017-12-15 00:01:57,2017-12-15 00:25:18,N,1,,,,,1,4.73,19.5,0.5,0.5,6.24,0,,0.3,27.04,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000707,2,2017-12-15 00:01:06,2017-12-15 00:05:42,N,1,,,,,2,1.25,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000708,2,2017-12-15 00:28:01,2017-12-15 00:35:18,N,1,,,,,2,1.25,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000709,2,2017-12-15 00:36:39,2017-12-15 00:50:41,N,1,,,,,2,3.16,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000710,2,2017-12-15 00:02:19,2017-12-15 00:22:36,N,1,,,,,1,3.68,15.5,0.5,0.5,3.36,0,,0.3,22.11,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000711,2,2017-12-15 00:23:44,2017-12-15 00:33:57,N,1,,,,,1,1.77,9,0.5,0.5,2.58,0,,0.3,12.88,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000712,2,2017-12-15 00:37:09,2017-12-15 00:56:53,N,1,,,,,1,2.92,15,0.5,0.5,2,0,,0.3,18.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000713,2,2017-12-15 00:12:13,2017-12-15 00:29:34,N,1,,,,,1,12.02,33,0.5,0.5,8.58,0,,0.3,42.88,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000714,2,2017-12-15 00:11:36,2017-12-15 00:40:50,N,1,,,,,3,5.63,23,0.5,0.5,4.86,0,,0.3,29.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000715,2,2017-12-15 00:43:02,2017-12-15 00:49:08,N,1,,,,,3,1.68,7,0.5,0.5,0,0,,0.3,8.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000716,1,2017-12-15 00:05:41,2017-12-15 00:32:29,N,1,,,,,2,4.20,20,0.5,0.5,5.3,0,,0.3,26.6,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000717,1,2017-12-15 00:44:27,2017-12-15 
01:04:20,N,1,,,,,1,4.90,17,0.5,0.5,5.45,0,,0.3,23.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000718,1,2017-12-15 00:30:44,2017-12-15 00:40:04,N,1,,,,,1,1.50,8.5,0.5,0.5,1.5,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000719,1,2017-12-15 00:41:48,2017-12-15 00:50:54,N,1,,,,,1,1.50,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000720,1,2017-12-15 00:59:42,2017-12-15 01:12:12,N,1,,,,,1,4.20,13.5,0.5,0.5,2,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000721,2,2017-12-15 00:08:17,2017-12-15 00:23:49,N,1,,,,,2,6.81,21.5,0.5,0.5,3.5,0,,0.3,26.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000722,2,2017-12-15 00:54:07,2017-12-15 01:12:23,N,1,,,,,2,8.71,25,0.5,0.5,0,5.76,,0.3,32.06,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000723,1,2017-12-15 00:10:46,2017-12-15 00:23:46,N,1,,,,,1,1.80,10.5,0.5,0.5,1,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000724,1,2017-12-15 00:25:04,2017-12-15 00:40:45,N,1,,,,,1,3.10,13.5,0.5,0.5,3.7,0,,0.3,18.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000725,1,2017-12-15 00:46:29,2017-12-15 01:22:33,N,1,,,,,1,5.80,26.5,0.5,0.5,5.55,0,,0.3,33.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000726,2,2017-12-15 00:07:51,2017-12-15 00:14:32,N,1,,,,,1,0.81,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000727,2,2017-12-15 00:17:00,2017-12-15 00:37:28,N,1,,,,,1,1.28,13,0.5,0.5,2.86,0,,0.3,17.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000728,2,2017-12-15 00:19:24,2017-12-15 00:34:41,N,1,,,,,1,4.52,15.5,0.5,0.5,0,0,,0.3,16.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000729,2,2017-12-15 00:01:16,2017-12-15 00:17:43,N,1,,,,,1,1.05,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000730,2,2017-12-15 00:18:40,2017-12-15 
00:31:35,N,1,,,,,1,1.22,9.5,0.5,0.5,0,0,,0.3,10.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000731,1,2017-12-15 00:31:52,2017-12-15 00:38:45,N,1,,,,,1,1.30,6.5,0.5,0.5,1.55,0,,0.3,9.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000732,1,2017-12-15 00:18:16,2017-12-15 00:43:11,N,1,,,,,1,7.00,25,0.5,0.5,7,0,,0.3,33.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000733,2,2017-12-15 00:05:53,2017-12-15 00:35:02,N,1,,,,,1,6.85,26,0.5,0.5,0,0,,0.3,27.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000734,2,2017-12-15 00:53:47,2017-12-15 01:24:10,N,1,,,,,1,8.55,29.5,0.5,0.5,3,5.76,,0.3,39.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000735,1,2017-12-15 00:18:57,2017-12-15 00:36:41,N,1,,,,,1,2.40,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000736,1,2017-12-15 00:39:32,2017-12-15 00:51:27,N,1,,,,,1,2.20,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000737,1,2017-12-15 00:09:35,2017-12-15 00:10:30,N,4,,,,,0,0.00,3,0.5,0.5,0,0,,0.3,4.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000738,1,2017-12-15 00:10:38,2017-12-15 00:50:50,N,2,,,,,2,18.30,52,0,0.5,10,0,,0.3,62.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000739,1,2017-12-15 00:59:40,2017-12-15 01:12:42,N,1,,,,,1,1.90,10,0.5,0.5,2.25,0,,0.3,13.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000740,2,2017-12-15 00:10:27,2017-12-15 00:32:38,N,1,,,,,1,15.36,41.5,0.5,0.5,8.56,0,,0.3,51.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000741,2,2017-12-15 00:43:57,2017-12-15 01:14:33,N,1,,,,,1,7.44,28,0.5,0.5,8.76,5.76,,0.3,43.82,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000742,1,2017-12-15 00:23:14,2017-12-15 00:40:55,N,1,,,,,1,3.10,14,0.5,0.5,1.5,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000743,1,2017-12-15 00:45:14,2017-12-15 
01:06:11,N,1,,,,,1,4.60,19.5,0.5,0.5,4.15,0,,0.3,24.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000744,1,2017-12-15 00:05:13,2017-12-15 00:31:01,N,1,,,,,1,4.90,20.5,0.5,0.5,4.35,0,,0.3,26.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000745,1,2017-12-15 00:55:03,2017-12-15 01:00:44,N,1,,,,,1,0.90,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000746,1,2017-12-15 00:08:52,2017-12-15 00:44:05,N,1,,,,,1,3.80,23,0.5,0.5,6.05,0,,0.3,30.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000747,2,2017-12-15 00:33:05,2017-12-15 00:48:27,N,1,,,,,1,2.99,13.5,0.5,0.5,2.96,0,,0.3,17.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000748,2,2017-12-15 00:15:29,2017-12-15 00:54:51,N,1,,,,,1,4.36,26,0.5,0.5,5.46,0,,0.3,32.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000749,1,2017-12-15 00:01:36,2017-12-15 00:04:58,N,1,,,,,1,0.80,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000750,1,2017-12-15 00:15:24,2017-12-15 00:21:09,N,1,,,,,1,1.80,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000751,1,2017-12-15 00:40:23,2017-12-15 00:46:29,N,1,,,,,1,1.00,6,0.5,0.5,0,0,,0.3,7.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000752,1,2017-12-15 00:49:17,2017-12-15 01:00:42,N,1,,,,,1,1.30,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000753,2,2017-12-15 00:11:14,2017-12-15 00:23:43,N,1,,,,,3,2.00,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000754,2,2017-12-15 00:31:31,2017-12-15 00:45:43,N,1,,,,,2,2.13,11.5,0.5,0.5,1,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000755,2,2017-12-15 00:40:55,2017-12-15 01:00:47,N,1,,,,,1,2.88,14,0.5,0.5,0,0,,0.3,15.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000756,2,2017-12-15 00:08:22,2017-12-15 
00:10:21,N,1,,,,,1,0.68,4,0.5,0.5,0,0,,0.3,5.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000757,2,2017-12-15 00:11:45,2017-12-15 00:32:57,N,1,,,,,1,4.32,18,0.5,0.5,3.86,0,,0.3,23.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000758,2,2017-12-15 00:45:14,2017-12-15 01:04:07,N,1,,,,,1,4.99,18,0.5,0.5,5.79,0,,0.3,25.09,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000759,1,2017-12-15 00:31:29,2017-12-15 00:31:58,N,5,,,,,1,0.00,18,0,0,3.65,0,,0.3,21.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000760,1,2017-12-15 00:34:33,2017-12-15 00:39:54,N,1,,,,,1,1.60,7,0.5,0.5,2.05,0,,0.3,10.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000761,1,2017-12-15 00:42:43,2017-12-15 00:51:05,N,1,,,,,1,1.50,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000762,1,2017-12-15 00:17:16,2017-12-15 00:26:19,N,1,,,,,1,2.30,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000763,1,2017-12-15 00:35:40,2017-12-15 00:49:25,N,1,,,,,1,3.20,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000764,2,2017-12-15 00:23:04,2017-12-15 00:42:31,N,1,,,,,1,8.98,26.5,0.5,0.5,2,0,,0.3,29.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000765,1,2017-12-15 00:46:24,2017-12-15 01:01:21,N,1,,,,,1,2.80,12,0.5,0.5,0,0,,0.3,13.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000766,2,2017-12-14 23:57:15,2017-12-15 00:15:42,N,1,,,,,1,5.55,20,0.5,0.5,4.26,0,,0.3,25.56,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000767,2,2017-12-15 00:23:19,2017-12-15 00:34:54,N,1,,,,,1,1.26,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000768,2,2017-12-15 00:41:52,2017-12-15 00:48:13,N,1,,,,,1,0.82,6,0.5,0.5,1.82,0,,0.3,9.12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000769,2,2017-12-15 00:49:53,2017-12-15 
01:02:43,N,1,,,,,1,2.31,11,0.5,0.5,3.69,0,,0.3,15.99,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000770,2,2017-12-15 00:29:46,2017-12-15 01:15:02,N,1,,,,,5,9.27,37.5,0.5,0.5,8.91,5.76,,0.3,53.47,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000771,2,2017-12-15 00:03:21,2017-12-15 00:16:01,N,1,,,,,5,2.26,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000772,2,2017-12-15 00:16:49,2017-12-15 00:24:40,N,1,,,,,5,1.39,7.5,0.5,0.5,1,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000773,2,2017-12-15 00:29:56,2017-12-15 00:42:19,N,1,,,,,5,1.61,9.5,0.5,0.5,2.16,0,,0.3,12.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000774,2,2017-12-15 00:43:51,2017-12-15 00:54:08,N,1,,,,,5,1.04,7,0.5,0.5,0.7,0,,0.3,9,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000775,2,2017-12-15 00:55:15,2017-12-15 00:59:45,N,1,,,,,5,0.38,4.5,0.5,0.5,0,0,,0.3,5.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000776,2,2017-12-15 00:15:43,2017-12-15 00:43:10,N,1,,,,,3,3.34,19,0.5,0.5,4.06,0,,0.3,24.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000777,2,2017-12-15 00:54:12,2017-12-15 01:05:13,N,1,,,,,3,2.16,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000778,2,2017-12-15 00:22:21,2017-12-15 00:48:52,N,1,,,,,1,9.82,30.5,0.5,0.5,0,0,,0.3,31.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000779,2,2017-12-15 00:53:35,2017-12-15 00:55:32,N,1,,,,,1,0.36,3.5,0.5,0.5,0,0,,0.3,4.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000780,2,2017-12-15 00:33:11,2017-12-15 00:39:57,N,1,,,,,1,1.35,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000781,2,2017-12-15 00:40:32,2017-12-15 00:59:34,N,1,,,,,1,4.29,16,0.5,0.5,0,0,,0.3,17.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000782,2,2017-12-15 00:21:31,2017-12-15 
00:25:39,N,1,,,,,1,0.27,4.5,0.5,0.5,1.16,0,,0.3,6.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000783,2,2017-12-15 00:22:28,2017-12-15 00:57:26,N,1,,,,,1,5.58,26,0.5,0.5,1,0,,0.3,28.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000784,1,2017-12-15 00:44:31,2017-12-15 00:48:39,N,1,,,,,3,0.60,4.5,0.5,0.5,1.7,0,,0.3,7.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000785,1,2017-12-15 00:54:03,2017-12-15 01:08:58,N,1,,,,,1,2.60,12,0.5,0.5,2.65,0,,0.3,15.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000786,1,2017-12-15 00:17:08,2017-12-15 00:38:33,N,1,,,,,1,9.00,27.5,0.5,0.5,6,5.76,,0.3,40.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000787,1,2017-12-15 00:40:42,2017-12-15 00:58:49,N,1,,,,,1,3.70,15,0.5,0.5,0.02,0,,0.3,16.32,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000788,1,2017-12-15 00:07:49,2017-12-15 00:32:05,N,1,,,,,1,3.90,17.5,0.5,0.5,1.88,0,,0.3,20.68,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000789,1,2017-12-15 00:36:09,2017-12-15 00:37:14,N,1,,,,,1,0.30,3,0.5,0.5,0.85,0,,0.3,5.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000790,1,2017-12-15 00:41:57,2017-12-15 01:13:05,N,1,,,,,1,8.30,29.5,0.5,0.5,7.7,0,,0.3,38.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000791,2,2017-12-15 00:33:18,2017-12-15 00:39:28,N,1,,,,,2,1.87,8,0.5,0.5,1,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000792,2,2017-12-15 00:41:31,2017-12-15 01:14:47,N,1,,,,,2,9.43,32.5,0.5,0.5,6.76,0,,0.3,40.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000793,2,2017-12-15 00:51:28,2017-12-15 01:18:40,N,1,,,,,3,15.96,45,0.5,0.5,0,0,,0.3,46.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000794,2,2017-12-15 00:24:56,2017-12-15 00:57:40,N,1,,,,,1,5.40,23.5,0.5,0.5,0,5.76,,0.3,30.56,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000795,2,2017-12-15 00:50:36,2017-12-15 
01:11:29,N,1,,,,,1,14.03,38,0.5,0.5,7.86,0,,0.3,47.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000796,2,2017-12-15 00:16:48,2017-12-15 00:25:42,N,1,,,,,1,3.31,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000797,2,2017-12-15 00:28:58,2017-12-15 00:45:19,N,1,,,,,1,2.56,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000798,2,2017-12-15 00:47:35,2017-12-15 01:25:59,N,1,,,,,1,5.91,26,0.5,0.5,2,5.76,,0.3,35.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000799,1,2017-12-15 00:04:08,2017-12-15 00:15:31,N,1,,,,,1,1.90,9,0.5,0.5,2.55,0,,0.3,12.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000800,1,2017-12-15 00:17:18,2017-12-15 00:31:04,N,1,,,,,1,2.50,11.5,0.5,0.5,1.5,0,,0.3,14.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000801,2,2017-12-15 00:17:03,2017-12-15 00:54:29,N,1,,,,,2,6.26,25.5,0.5,0.5,0,0,,0.3,26.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000802,1,2017-12-15 00:18:30,2017-12-15 00:32:04,N,1,,,,,2,1.40,9.5,0.5,0.5,1,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000803,1,2017-12-15 00:40:55,2017-12-15 00:52:32,N,1,,,,,2,1.70,9.5,0.5,0.5,2.15,0,,0.3,12.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000804,1,2017-12-15 00:58:53,2017-12-15 01:14:54,N,1,,,,,1,3.00,13,0.5,0.5,3.55,0,,0.3,17.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000805,1,2017-12-15 00:21:00,2017-12-15 00:39:48,N,1,,,,,1,4.40,16.5,0.5,0.5,0,0,,0.3,17.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000806,1,2017-12-15 00:41:50,2017-12-15 00:54:39,N,1,,,,,1,5.00,16.5,0.5,0.5,0,0,,0.3,17.8,4,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000807,1,2017-12-15 00:16:11,2017-12-15 00:23:24,N,1,,,,,1,1.10,6.5,0.5,0.5,1.4,0,,0.3,9.2,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000808,1,2017-12-15 00:36:58,2017-12-15 
01:23:38,N,1,,,,,3,4.50,29.5,0.5,0.5,6.15,0,,0.3,36.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000809,1,2017-12-15 00:02:18,2017-12-15 00:27:13,N,1,,,,,1,5.20,20,0.5,0.5,4.25,0,,0.3,25.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000810,1,2017-12-15 00:45:21,2017-12-15 00:53:02,N,1,,,,,1,1.40,7,0.5,0.5,1,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000811,1,2017-12-15 00:54:15,2017-12-15 01:08:34,N,1,,,,,1,2.90,12,0.5,0.5,0,0,,0.3,13.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000812,1,2017-12-15 00:05:06,2017-12-15 00:10:53,N,1,,,,,1,1.60,7,0.5,0.5,3,0,,0.3,11.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000813,1,2017-12-15 00:13:19,2017-12-15 00:26:11,N,1,,,,,1,2.40,11,0.5,0.5,2.45,0,,0.3,14.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000814,1,2017-12-15 00:56:51,2017-12-15 01:03:21,N,1,,,,,1,0.50,6,0.5,0.5,0,0,,0.3,7.3,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000815,1,2017-12-15 00:05:46,2017-12-15 00:43:24,N,1,,,,,1,3.20,23,0.5,0.5,0,0,,0.3,24.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000816,1,2017-12-15 00:49:45,2017-12-15 01:01:15,N,1,,,,,1,1.80,9.5,0.5,0.5,2.15,0,,0.3,12.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000817,1,2017-12-15 00:06:47,2017-12-15 00:19:20,N,1,,,,,1,2.60,11,0.5,0.5,2.45,0,,0.3,14.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000818,1,2017-12-15 00:21:08,2017-12-15 00:39:13,N,1,,,,,1,4.20,16,0.5,0.5,2,0,,0.3,19.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000819,1,2017-12-15 00:41:09,2017-12-15 00:46:56,N,1,,,,,1,0.70,5.5,0.5,0.5,1.35,0,,0.3,8.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000820,1,2017-12-15 00:49:48,2017-12-15 00:56:43,N,1,,,,,4,0.80,6,0.5,0.5,1.8,0,,0.3,9.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000821,1,2017-12-15 00:57:42,2017-12-15 
01:16:36,N,1,,,,,1,3.00,14,0.5,0.5,2.5,0,,0.3,17.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000822,1,2017-12-15 00:04:26,2017-12-15 00:19:40,N,1,,,,,2,1.40,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000823,1,2017-12-15 00:26:21,2017-12-15 00:34:11,N,1,,,,,1,1.10,7,0.5,0.5,2,0,,0.3,10.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000824,1,2017-12-15 00:39:20,2017-12-15 00:56:50,N,1,,,,,1,5.00,17,0.5,0.5,3.65,0,,0.3,21.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000825,2,2017-12-15 00:12:02,2017-12-15 00:26:03,N,1,,,,,1,2.27,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000826,2,2017-12-15 00:32:43,2017-12-15 00:43:57,N,1,,,,,1,3.13,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000827,2,2017-12-15 00:45:55,2017-12-15 00:54:31,N,1,,,,,1,1.89,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000828,2,2017-12-15 00:04:28,2017-12-15 00:48:05,N,1,,,,,1,7.08,31,0.5,0.5,6.46,0,,0.3,38.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000829,1,2017-12-15 00:01:24,2017-12-15 00:38:04,N,1,,,,,2,6.60,27.5,0.5,0.5,5.75,0,,0.3,34.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000830,1,2017-12-15 00:17:14,2017-12-15 00:26:11,N,1,,,,,1,2.00,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000831,1,2017-12-15 00:30:59,2017-12-15 00:47:45,N,1,,,,,1,3.00,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000832,1,2017-12-15 00:51:48,2017-12-15 01:05:41,N,1,,,,,1,6.70,20.5,0.5,0.5,4.35,0,,0.3,26.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000833,2,2017-12-15 00:04:55,2017-12-15 00:17:37,N,1,,,,,1,2.77,12,0.5,0.5,0,0,,0.3,13.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000834,2,2017-12-15 00:27:10,2017-12-15 
00:40:45,N,1,,,,,1,4.32,15,0.5,0.5,3.26,0,,0.3,19.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000835,2,2017-12-15 00:47:01,2017-12-15 01:00:18,N,1,,,,,1,1.12,10,0.5,0.5,2.26,0,,0.3,13.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000836,2,2017-12-15 00:34:54,2017-12-15 00:51:54,N,1,,,,,1,1.76,11.5,0.5,0.5,2.56,0,,0.3,15.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000837,1,2017-12-15 00:41:18,2017-12-15 00:54:16,N,1,,,,,1,2.70,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000838,1,2017-12-15 00:16:37,2017-12-15 00:25:14,N,1,,,,,1,1.00,7.5,0.5,0.5,2.2,0,,0.3,11,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000839,1,2017-12-15 00:29:29,2017-12-15 00:35:48,N,1,,,,,1,0.80,6,0.5,0.5,1.45,0,,0.3,8.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000840,1,2017-12-15 00:44:11,2017-12-15 00:54:00,N,1,,,,,1,1.30,8,0.5,0.5,1.85,0,,0.3,11.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000841,2,2017-12-15 00:03:15,2017-12-15 00:12:39,N,1,,,,,1,1.44,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000842,2,2017-12-15 00:13:34,2017-12-15 00:25:05,N,1,,,,,1,0.70,8,0.5,0.5,0,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000843,2,2017-12-15 00:29:16,2017-12-15 00:32:52,N,1,,,,,1,0.63,4.5,0.5,0.5,1.74,0,,0.3,7.54,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000844,2,2017-12-15 00:44:44,2017-12-15 00:55:18,N,1,,,,,1,2.22,10,0.5,0.5,2.26,0,,0.3,13.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000845,2,2017-12-15 00:56:56,2017-12-15 01:08:12,N,1,,,,,1,2.25,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000846,1,2017-12-15 00:05:27,2017-12-15 00:40:39,N,1,,,,,1,4.10,23,0.5,0.5,3,0,,0.3,27.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000847,1,2017-12-15 00:50:51,2017-12-15 
00:55:54,N,1,,,,,1,1.00,5.5,0.5,0.5,1.35,0,,0.3,8.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000848,1,2017-12-15 00:59:52,2017-12-15 01:11:28,N,1,,,,,2,1.30,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000849,2,2017-12-15 00:43:01,2017-12-15 01:04:17,N,1,,,,,1,4.68,18,0.5,0.5,2.5,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000850,1,2017-12-15 00:52:23,2017-12-15 01:16:27,N,1,,,,,1,4.00,17.5,0.5,0.5,1.2,0,,0.3,20,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000851,1,2017-12-15 00:50:17,2017-12-15 01:18:00,N,1,,,,,1,7.70,26,0.5,0.5,5.45,0,,0.3,32.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000852,2,2017-12-15 00:29:48,2017-12-15 00:47:32,N,1,,,,,5,4.13,15.5,0.5,0.5,4.2,0,,0.3,21,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000853,1,2017-12-15 00:20:05,2017-12-15 00:28:06,N,1,,,,,1,0.60,6.5,0.5,0.5,2,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000854,1,2017-12-15 00:30:22,2017-12-15 01:10:08,N,1,,,,,1,5.80,27,0.5,0.5,7.05,0,,0.3,35.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000855,2,2017-12-15 00:30:08,2017-12-15 00:36:48,N,1,,,,,1,0.72,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000856,2,2017-12-15 00:38:28,2017-12-15 00:57:44,N,1,,,,,1,1.65,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000857,1,2017-12-15 00:11:26,2017-12-15 00:33:14,N,1,,,,,1,4.30,18.5,0.5,0.5,2,0,,0.3,21.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000858,1,2017-12-15 00:53:08,2017-12-15 01:24:53,N,1,,,,,2,8.90,31,0.5,0.5,5,5.76,,0.3,43.06,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000859,1,2017-12-15 00:53:46,2017-12-15 01:07:14,N,1,,,,,2,1.90,10.5,0.5,0.5,1.2,0,,0.3,13,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000860,2,2017-12-15 00:31:50,2017-12-15 
00:37:17,N,1,,,,,1,1.77,7,0.5,0.5,1,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000861,1,2017-12-15 00:08:40,2017-12-15 00:58:36,N,1,,,,,1,19.60,59.5,0.5,0.5,13,0,,0.3,73.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000862,2,2017-12-15 00:02:27,2017-12-15 00:06:04,N,1,,,,,5,0.60,4.5,0.5,0.5,1.16,0,,0.3,6.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000863,2,2017-12-15 00:17:42,2017-12-15 01:05:29,N,1,,,,,5,6.86,30,0.5,0.5,0,0,,0.3,31.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000864,2,2017-12-15 00:22:02,2017-12-15 00:40:51,N,1,,,,,1,3.48,15.5,0.5,0.5,0,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000865,2,2017-12-15 00:42:18,2017-12-15 00:49:35,N,1,,,,,1,1.42,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000866,2,2017-12-15 00:02:19,2017-12-15 00:24:09,N,1,,,,,1,10.38,31,0.5,0.5,6.46,0,,0.3,38.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000867,2,2017-12-15 00:21:42,2017-12-15 01:25:06,N,1,,,,,1,11.80,45.5,0.5,0.5,0,0,,0.3,46.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000868,2,2017-12-15 00:03:13,2017-12-15 00:11:15,N,1,,,,,1,1.05,7,0.5,0.5,1.66,0,,0.3,9.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000869,2,2017-12-15 00:13:04,2017-12-15 00:43:07,N,1,,,,,1,5.54,22,0.5,0.5,0,0,,0.3,23.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000870,2,2017-12-15 00:03:01,2017-12-15 00:17:43,N,1,,,,,1,1.14,10,0.5,0.5,0,0,,0.3,11.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000871,2,2017-12-15 00:19:26,2017-12-15 00:36:55,N,1,,,,,1,3.51,15,0.5,0.5,20,0,,0.3,36.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000872,1,2017-12-15 00:42:05,2017-12-15 00:46:23,Y,1,,,,,2,0.70,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000874,1,2017-12-15 00:32:08,2017-12-15 
00:38:53,N,1,,,,,1,0.60,6,0.5,0.5,0,0,,0.3,7.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000875,1,2017-12-15 00:52:04,2017-12-15 01:04:53,N,1,,,,,1,2.00,10.5,0.5,0.5,2.35,0,,0.3,14.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000876,2,2017-12-15 00:39:40,2017-12-15 01:22:10,N,2,,,,,6,21.28,52,0,0.5,6,5.76,,0.3,64.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000877,1,2017-12-15 00:04:14,2017-12-15 00:13:52,N,1,,,,,2,2.10,9.5,0.5,0.5,1.2,0,,0.3,12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000878,1,2017-12-15 00:44:57,2017-12-15 00:57:59,N,1,,,,,2,1.40,9.5,0.5,0.5,2,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000879,2,2017-12-15 00:12:01,2017-12-15 00:18:24,N,1,,,,,1,1.32,7,0.5,0.5,0.1,0,,0.3,8.4,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000880,2,2017-12-15 00:22:58,2017-12-15 00:40:06,N,1,,,,,1,4.69,16.5,0.5,0.5,3.56,0,,0.3,21.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000881,2,2017-12-15 00:47:43,2017-12-15 00:54:11,N,1,,,,,1,1.30,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000882,2,2017-12-15 00:16:46,2017-12-15 00:24:04,N,1,,,,,1,1.86,8,0.5,0.5,1.86,0,,0.3,11.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000883,2,2017-12-15 00:29:13,2017-12-15 00:43:55,N,1,,,,,1,2.95,12.5,0.5,0.5,2.2,0,,0.3,16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000884,2,2017-12-15 00:48:04,2017-12-15 00:49:49,N,1,,,,,1,0.34,3.5,0.5,0.5,0,0,,0.3,4.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000885,2,2017-12-15 00:51:53,2017-12-15 00:55:13,N,1,,,,,1,0.86,5,0.5,0.5,1.89,0,,0.3,8.19,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000886,2,2017-12-15 00:21:27,2017-12-15 01:14:34,N,1,,,,,1,10.62,42.5,0.5,0.5,10.95,0,,0.3,54.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000887,1,2017-12-15 00:04:08,2017-12-15 
00:36:58,N,1,,,,,2,5.10,23,0.5,0.5,0,0,,0.3,24.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000888,1,2017-12-15 00:40:51,2017-12-15 00:57:22,N,1,,,,,2,3.10,14,0.5,0.5,3.8,0,,0.3,19.1,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000889,1,2017-12-15 00:08:02,2017-12-15 00:29:01,N,1,,,,,1,2.50,14.5,0.5,0.5,1,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000890,1,2017-12-15 00:48:54,2017-12-15 01:33:00,N,1,,,,,1,15.60,48.5,0.5,0.5,5.2,0,,0.3,55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000891,2,2017-12-14 23:59:19,2017-12-15 00:23:03,N,1,,,,,1,3.84,17.5,0.5,0.5,1,0,,0.3,19.8,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000892,2,2017-12-15 00:42:48,2017-12-15 01:03:15,N,1,,,,,1,2.45,14.5,0.5,0.5,1,0,,0.3,16.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000893,2,2017-12-15 00:16:33,2017-12-15 00:26:08,N,1,,,,,1,1.67,8.5,0.5,0.5,1.96,0,,0.3,11.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000894,2,2017-12-15 00:38:41,2017-12-15 00:56:22,N,1,,,,,1,2.95,13.5,0.5,0.5,0,0,,0.3,14.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000895,2,2017-12-15 00:57:11,2017-12-15 01:14:48,N,1,,,,,1,0.82,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000896,2,2017-12-15 00:02:45,2017-12-15 00:36:35,N,2,,,,,1,18.85,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000897,2,2017-12-15 00:43:23,2017-12-15 01:00:33,N,1,,,,,1,2.82,13.5,0.5,0.5,2.96,0,,0.3,17.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000898,2,2017-12-15 00:08:07,2017-12-15 00:41:12,N,1,,,,,1,5.36,24,0.5,0.5,6.32,0,,0.3,31.62,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000899,2,2017-12-15 00:48:01,2017-12-15 00:54:37,N,1,,,,,1,0.87,6.5,0.5,0.5,1.56,0,,0.3,9.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000900,2,2017-12-15 01:00:05,2017-12-15 
01:04:23,N,1,,,,,1,1.04,5.5,0.5,0.5,2.04,0,,0.3,8.84,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000901,1,2017-12-15 00:43:08,2017-12-15 00:50:53,N,1,,,,,1,2.10,8.5,0.5,0.5,1.95,0,,0.3,11.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000902,1,2017-12-15 00:51:34,2017-12-15 00:54:56,N,1,,,,,1,1.10,5.5,0.5,0.5,0,0,,0.3,6.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000903,2,2017-12-15 00:53:48,2017-12-15 01:04:17,N,1,,,,,1,1.66,9,0.5,0.5,0,0,,0.3,10.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000904,1,2017-12-15 00:22:51,2017-12-15 00:50:12,N,2,,,,,1,18.40,52,0,0.5,10.55,0,,0.3,63.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000905,1,2017-12-15 00:02:29,2017-12-15 00:27:44,N,1,,,,,1,4.20,18,0.5,0.5,0,0,,0.3,19.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000906,1,2017-12-15 00:31:33,2017-12-15 00:50:28,N,1,,,,,1,3.40,15,0.5,0.5,4.05,0,,0.3,20.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000907,1,2017-12-15 00:53:36,2017-12-15 00:57:11,N,1,,,,,1,0.90,5,0.5,0.5,1.25,0,,0.3,7.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000908,2,2017-12-15 00:01:47,2017-12-15 00:35:22,N,1,,,,,1,6.78,26.5,0.5,0.5,2,0,,0.3,29.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000909,2,2017-12-15 00:55:50,2017-12-15 01:21:26,N,1,,,,,1,4.70,19,0.5,0.5,0,0,,0.3,20.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000910,1,2017-12-15 00:15:27,2017-12-15 00:20:38,N,1,,,,,1,1.40,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000911,1,2017-12-15 00:34:52,2017-12-15 01:01:21,N,1,,,,,1,5.10,21,0.5,0.5,3.35,0,,0.3,25.65,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000912,1,2017-12-15 00:15:57,2017-12-15 00:22:46,N,1,,,,,1,1.80,7.5,0.5,0.5,1.75,0,,0.3,10.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000913,1,2017-12-15 00:31:27,2017-12-15 
01:11:20,N,1,,,,,1,5.30,26.5,0.5,0.5,1,0,,0.3,28.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000914,1,2017-12-15 00:14:15,2017-12-15 00:29:52,N,1,,,,,1,3.00,13,0.5,0.5,1,0,,0.3,15.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000915,1,2017-12-15 00:48:36,2017-12-15 01:24:10,N,1,,,,,1,4.90,26,0.5,0.5,8.15,0,,0.3,35.45,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000916,1,2017-12-15 00:22:04,2017-12-15 00:32:41,Y,1,,,,,2,1.90,9,0.5,0.5,2.05,0,,0.3,12.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000917,1,2017-12-15 00:33:58,2017-12-15 00:47:41,N,1,,,,,1,3.30,12.5,0.5,0.5,0,0,,0.3,13.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000918,1,2017-12-15 00:54:47,2017-12-15 00:55:21,N,5,,,,,1,0.00,14,0,0,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000919,1,2017-12-15 00:35:43,2017-12-15 01:08:42,N,1,,,,,1,11.70,37.5,0.5,0.5,9.7,0,,0.3,48.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000920,2,2017-12-15 00:13:56,2017-12-15 00:41:08,N,1,,,,,1,3.73,19,0.5,0.5,3,0,,0.3,23.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000921,2,2017-12-15 00:50:32,2017-12-15 01:06:42,N,1,,,,,1,2.81,12,0.5,0.5,2.66,0,,0.3,15.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000922,1,2017-12-15 00:39:58,2017-12-15 01:01:50,N,1,,,,,1,4.50,18,0.5,0.5,3.85,0,,0.3,23.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000923,1,2017-12-15 00:08:23,2017-12-15 01:07:29,N,1,,,,,1,7.70,38,0.5,0.5,0,0,,0.3,39.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000924,1,2017-12-15 00:00:04,2017-12-15 00:14:22,N,1,,,,,2,2.30,11.5,0.5,0.5,2.55,0,,0.3,15.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000925,1,2017-12-15 00:15:32,2017-12-15 00:39:24,N,1,,,,,2,3.10,16.5,0.5,0.5,2,0,,0.3,19.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000926,1,2017-12-15 00:58:47,2017-12-15 
01:01:23,N,1,,,,,1,0.60,4,0.5,0.5,1.05,0,,0.3,6.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000927,2,2017-12-15 00:19:03,2017-12-15 00:26:33,N,1,,,,,1,1.07,7,0.5,0.5,1,0,,0.3,9.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000928,2,2017-12-15 00:27:47,2017-12-15 01:05:07,N,1,,,,,1,5.63,26,0.5,0.5,5.46,0,,0.3,32.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000929,1,2017-12-15 00:02:07,2017-12-15 00:28:16,N,1,,,,,0,6.70,23.5,0.5,0.5,0,0,,0.3,24.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000930,1,2017-12-15 00:33:18,2017-12-15 01:03:18,N,1,,,,,1,4.80,21.5,0.5,0.5,5.7,5.76,,0.3,34.26,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000931,1,2017-12-15 00:38:52,2017-12-15 00:38:57,N,2,,,,,1,4.60,52,0,0.5,0,0,,0.3,52.8,3,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000932,1,2017-12-15 00:43:53,2017-12-15 01:18:44,N,1,,,,,1,6.40,26.5,0,0.5,0,0,,0.3,27.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000933,1,2017-12-15 00:04:13,2017-12-15 00:51:06,N,1,,,,,2,9.20,36,0.5,0.5,7.45,0,,0.3,44.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000934,2,2017-12-15 00:09:49,2017-12-15 00:23:32,N,1,,,,,1,2.51,11.5,0.5,0.5,0,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000935,2,2017-12-15 00:28:33,2017-12-15 01:04:59,N,1,,,,,1,6.32,25.5,0.5,0.5,5.36,0,,0.3,32.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000936,2,2017-12-15 00:06:03,2017-12-15 00:22:31,N,1,,,,,5,3.26,14,0.5,0.5,3.06,0,,0.3,18.36,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000937,2,2017-12-15 00:37:08,2017-12-15 01:03:40,N,1,,,,,5,2.83,17.5,0.5,0.5,0,0,,0.3,18.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000938,2,2017-12-15 00:57:16,2017-12-15 01:16:42,N,1,,,,,1,2.93,14,0.5,0.5,1,0,,0.3,16.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000939,2,2017-12-15 00:10:53,2017-12-15 
00:22:45,N,1,,,,,2,1.44,9,0.5,0.5,2.5,0,,0.3,12.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000940,2,2017-12-15 00:50:00,2017-12-15 01:26:28,N,1,,,,,2,10.03,35.5,0.5,0.5,0,0,,0.3,36.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000941,2,2017-12-15 00:07:21,2017-12-15 00:20:07,N,1,,,,,1,1.85,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000942,2,2017-12-15 00:29:10,2017-12-15 00:36:06,N,1,,,,,1,1.68,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000943,2,2017-12-15 00:47:04,2017-12-15 01:20:23,N,1,,,,,1,15.51,45,0.5,0.5,9.26,0,,0.3,55.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000944,1,2017-12-15 00:19:02,2017-12-15 00:29:05,N,1,,,,,1,1.90,9.5,0.5,0.5,1,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000945,1,2017-12-15 00:31:52,2017-12-15 00:44:24,N,1,,,,,1,2.20,10.5,0.5,0.5,0,0,,0.3,11.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000946,1,2017-12-15 00:46:42,2017-12-15 01:05:12,N,1,,,,,1,6.20,21,0.5,0.5,3.35,0,,0.3,25.65,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000947,2,2017-12-15 00:11:42,2017-12-15 00:43:20,N,1,,,,,1,6.04,25.5,0.5,0.5,5.36,0,,0.3,34.11,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000948,1,2017-12-15 00:21:34,2017-12-15 00:41:16,N,1,,,,,1,2.30,14,0.5,0.5,0,0,,0.3,15.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000949,1,2017-12-15 00:42:11,2017-12-15 00:55:13,N,1,,,,,2,3.50,13,0.5,0.5,2.85,0,,0.3,17.15,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000950,2,2017-12-15 00:00:16,2017-12-15 00:17:07,N,1,,,,,2,4.05,15.5,0.5,0.5,0,0,,0.3,16.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000951,2,2017-12-15 00:47:25,2017-12-15 01:08:08,N,1,,,,,2,4.73,18,0.5,0.5,0,0,,0.3,19.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000952,2,2017-12-14 23:58:52,2017-12-15 
23:48:39,N,1,,,,,2,0.27,3.5,0.5,0.5,0,0,,0.3,4.8,1,,,,yellow,0.09,1,1.2,32,22,5.82,,,,,,,,,,,,,,,,,,,, 1460000953,2,2017-12-15 00:02:43,2017-12-15 00:31:33,N,1,,,,,1,3.81,19.5,0.5,0.5,3,0,,0.3,23.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000954,2,2017-12-15 00:08:07,2017-12-15 00:22:27,N,1,,,,,1,1.58,10.5,0.5,0.5,1.5,0,,0.3,13.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000955,2,2017-12-15 00:23:59,2017-12-15 00:43:51,N,1,,,,,1,3.09,14.5,0.5,0.5,4.74,0,,0.3,20.54,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000956,2,2017-12-15 00:53:01,2017-12-15 00:53:05,N,5,,,,,1,0.00,5,0,0.5,0,0,,0.3,5.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000957,2,2017-12-15 00:55:09,2017-12-15 01:07:32,N,1,,,,,1,2.52,11.5,0.5,0.5,0,0,,0.3,12.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000958,2,2017-12-15 00:22:57,2017-12-15 00:36:58,N,1,,,,,1,2.75,12,0.5,0.5,0.66,0,,0.3,13.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000959,2,2017-12-15 00:03:25,2017-12-15 00:13:02,N,1,,,,,1,0.99,7.5,0.5,0.5,0,0,,0.3,8.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000960,2,2017-12-15 00:17:06,2017-12-15 00:31:31,N,1,,,,,1,1.79,10.5,0.5,0.5,0,0,,0.3,11.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000961,2,2017-12-15 00:37:42,2017-12-15 01:37:47,N,5,,,,,1,16.84,80,0,0.5,0,0,,0.3,80.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000962,1,2017-12-15 00:24:46,2017-12-15 00:34:31,N,1,,,,,1,1.60,8.5,0.5,0.5,1.95,0,,0.3,11.75,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000963,1,2017-12-15 00:47:28,2017-12-15 01:07:04,N,1,,,,,1,4.20,16.5,0.5,0.5,5,0,,0.3,22.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000964,2,2017-12-15 00:01:11,2017-12-15 00:09:14,N,1,,,,,1,1.57,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000965,2,2017-12-15 00:30:12,2017-12-15 
01:01:05,N,1,,,,,1,4.68,22.5,0.5,0.5,4.76,0,,0.3,28.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000966,1,2017-12-15 00:24:00,2017-12-15 00:38:13,N,1,,,,,1,3.20,12.5,0.5,0.5,2.75,0,,0.3,16.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000967,2,2017-12-15 00:49:53,2017-12-15 01:17:25,N,1,,,,,3,6.37,22.5,0.5,0.5,4.76,0,,0.3,28.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000968,2,2017-12-15 00:05:30,2017-12-15 00:37:30,N,2,,,,,4,18.09,52,0,0.5,11.71,5.76,,0.3,70.27,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000969,2,2017-12-15 00:47:11,2017-12-15 01:02:37,N,1,,,,,4,3.35,13,0.5,0.5,0,0,,0.3,14.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000970,2,2017-12-15 00:32:31,2017-12-15 01:15:13,N,1,,,,,6,10.56,38.5,0.5,0.5,4,5.76,,0.3,49.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000971,1,2017-12-15 00:55:25,2017-12-15 01:00:03,N,1,,,,,1,0.50,5,0.5,0.5,0,0,,0.3,6.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000972,1,2017-12-15 00:00:54,2017-12-15 00:04:07,N,1,,,,,1,0.90,5,0.5,0.5,1.55,0,,0.3,7.85,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000973,1,2017-12-15 00:11:51,2017-12-15 00:24:10,N,1,,,,,1,1.70,10,0.5,0.5,2,0,,0.3,13.3,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000974,1,2017-12-15 00:33:26,2017-12-15 01:19:20,N,1,,,,,1,8.50,34.5,0.5,0.5,7.15,0,,0.3,42.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000975,1,2017-12-15 00:11:27,2017-12-15 00:54:09,N,1,,,,,2,7.90,31.5,0.5,0.5,6.55,0,,0.3,39.35,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000976,2,2017-12-15 00:10:38,2017-12-15 00:50:52,N,1,,,,,1,6.88,29.5,0.5,0.5,7.7,0,,0.3,38.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000977,2,2017-12-15 00:04:00,2017-12-15 00:21:02,N,1,,,,,6,1.51,12,0.5,0.5,3.99,0,,0.3,17.29,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000978,2,2017-12-15 00:21:43,2017-12-15 
00:33:56,N,1,,,,,6,0.71,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000979,2,2017-12-15 00:36:45,2017-12-15 00:48:13,N,1,,,,,6,1.00,8.5,0.5,0.5,2.94,0,,0.3,12.74,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000980,2,2017-12-15 00:50:35,2017-12-15 01:14:51,N,1,,,,,6,4.34,18,0.5,0.5,4.82,0,,0.3,24.12,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000981,2,2017-12-15 00:35:10,2017-12-15 00:40:56,N,1,,,,,1,1.21,6,0.5,0.5,1.46,0,,0.3,8.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000982,2,2017-12-15 00:53:17,2017-12-15 01:09:14,N,1,,,,,1,3.38,13.5,0.5,0.5,3.7,0,,0.3,18.5,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000983,2,2017-12-15 00:48:43,2017-12-15 00:58:44,N,1,,,,,1,1.34,8.5,0.5,0.5,1,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000984,2,2017-12-15 00:37:51,2017-12-15 00:38:05,N,5,,,,,1,0.00,60,0,0,0,0,,0.3,60.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000985,2,2017-12-15 00:45:54,2017-12-15 00:52:31,N,1,,,,,1,1.92,7.5,0.5,0.5,2,0,,0.3,10.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000986,2,2017-12-15 00:07:43,2017-12-15 00:28:15,N,1,,,,,2,10.02,29.5,0.5,0.5,0,0,,0.3,30.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000987,2,2017-12-15 00:32:40,2017-12-15 00:41:01,N,1,,,,,2,1.29,7.5,0.5,0.5,1,0,,0.3,9.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000988,2,2017-12-15 00:41:52,2017-12-15 00:48:28,N,1,,,,,2,1.00,6.5,0.5,0.5,0,0,,0.3,7.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000989,2,2017-12-15 00:52:43,2017-12-15 01:37:15,N,5,,,,,2,4.32,70,0,0.5,0,0,,0.3,70.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000990,1,2017-12-15 00:03:36,2017-12-15 00:25:14,N,1,,,,,2,3.90,17.5,0.5,0.5,3.75,0,,0.3,22.55,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000991,1,2017-12-15 00:39:01,2017-12-15 
01:05:04,N,1,,,,,2,4.70,19.5,0.5,0.5,3,0,,0.3,23.8,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000992,2,2017-12-15 00:04:21,2017-12-15 00:09:29,N,1,,,,,1,0.69,5,0.5,0.5,1.89,0,,0.3,8.19,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000993,2,2017-12-15 00:19:51,2017-12-15 00:24:23,N,1,,,,,1,0.79,5,0.5,0.5,1.26,0,,0.3,7.56,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000994,2,2017-12-15 00:25:57,2017-12-15 00:55:44,N,1,,,,,1,3.62,19.5,0.5,0.5,4.16,0,,0.3,24.96,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000995,1,2017-12-15 00:00:00,2017-12-15 00:15:00,N,1,,,,,1,2.20,12,0.5,0.5,2.65,0,,0.3,15.95,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000996,1,2017-12-15 00:16:51,2017-12-15 00:49:40,N,1,,,,,2,8.50,29,0.5,0.5,7.2,5.76,,0.3,43.26,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000997,2,2017-12-15 00:42:37,2017-12-15 01:24:17,N,1,,,,,1,8.46,31.5,0.5,0.5,8.2,0,,0.3,41,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000998,2,2017-12-15 00:06:02,2017-12-15 00:18:45,N,1,,,,,1,2.28,11,0.5,0.5,2.46,0,,0.3,14.76,1,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460000999,2,2017-12-15 00:21:04,2017-12-15 00:31:14,N,1,,,,,1,1.01,8,0.5,0.5,0,0,,0.3,9.3,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, 1460001000,2,2017-12-15 00:33:21,2017-12-15 00:42:19,N,1,,,,,1,1.87,8.5,0.5,0.5,0,0,,0.3,9.8,2,,,,yellow,0.11,1,1.2,28,20,3.80,,,,,,,,,,,,,,,,,,,, ================================================ FILE: examples/data/plasticc_test_set_1k.csv ================================================ object_id,mjd,passband,flux,flux_err,detected 13,59798.3205,2,-1.299735,1.357315,0 13,59798.3281,1,-2.095392,1.148654,0 13,59798.3357,3,-0.923794,1.763655,0 13,59798.3466,4,-4.009815,2.602911,0 13,59798.3576,5,-3.403503,5.367328,0 13,59801.3553,2,-1.778855,2.448943,0 13,59801.3629,1,2.491993,3.540421,0 13,59801.3705,3,1.644129,2.284999,0 13,59801.3815,4,-0.158192,2.515900,0 
13,59801.3924,5,-6.457387,5.381231,0 13,59818.2740,0,1.962846,1.795587,0 13,59819.2541,0,-1.697929,2.433431,0 13,59820.2522,0,-1.698675,1.898612,0 13,59821.2478,0,-0.776626,2.435191,0 13,59822.2433,0,-3.826187,2.853957,0 13,59823.2659,0,-0.837001,2.690573,0 13,59826.3105,2,0.529480,0.920972,0 13,59826.3181,1,-0.702092,0.923219,0 13,59826.3258,3,-0.797231,1.508073,0 13,59826.3367,4,-3.898993,2.156529,0 13,59826.3477,5,-2.883462,5.535779,0 13,59842.2456,2,-0.838558,0.849664,0 13,59842.2532,1,-0.038403,0.813772,0 13,59842.2608,3,2.863780,1.322607,0 13,59842.2718,4,0.088068,2.166394,0 13,59842.2827,5,-1.672899,6.068114,0 13,59851.1792,0,-0.737309,2.574759,0 13,59854.1485,2,0.040905,1.224270,0 13,59854.1563,1,-0.265884,1.336087,0 13,59854.1640,3,-0.369843,1.901700,0 13,59854.1750,4,2.415236,2.771065,0 13,59854.1860,5,-4.745292,5.788470,0 13,59857.1408,2,4.814427,2.132096,0 13,59857.1485,1,-2.618018,2.915296,0 13,59857.1563,3,-2.852325,2.092946,0 13,59857.1673,4,-1.584260,2.676820,0 13,59857.1782,5,-12.249563,6.628258,0 13,59867.1112,2,-1.164210,1.083173,0 13,59867.1189,1,-1.176300,0.974351,0 13,59867.1267,3,0.451480,1.538879,0 13,59867.1377,4,0.359918,2.357359,0 13,59867.1487,5,-2.699421,5.830664,0 13,59870.1049,2,1.381781,0.971909,0 13,59870.1126,1,0.387448,0.822428,0 13,59870.1204,3,0.817313,1.373219,0 13,59870.1314,4,0.151035,2.283455,0 13,59870.1424,5,2.986818,5.280643,0 13,59873.0971,2,-1.240265,1.040900,0 13,59873.1049,1,0.931574,1.042814,0 13,59873.1126,3,2.134935,1.442823,0 13,59873.1236,4,-0.530458,2.068875,0 13,59873.1346,5,-0.314489,4.587787,0 13,59874.1461,0,-2.490195,1.611522,0 13,59875.0995,0,-1.276708,2.019888,0 13,59876.0980,0,3.425512,2.443817,0 13,59877.0976,0,2.518677,1.776579,0 13,59878.0964,0,1.872855,2.020787,0 13,59879.0895,0,0.522355,1.864210,0 13,59880.1017,0,-0.761489,1.371363,0 13,59884.1760,2,0.408623,0.768234,0 13,59884.1836,1,0.610256,0.887502,0 13,59884.1913,3,-0.134252,1.138571,0 13,59884.2022,4,-0.009705,1.609182,0 
13,59884.2132,5,6.633543,4.214399,0 13,59887.2856,2,1.784338,1.826830,0 13,59887.2933,1,1.591402,2.206727,0 13,59887.3009,3,0.109085,2.350564,0 13,59887.3118,4,-2.874674,3.145338,0 13,59887.3228,5,-2.586321,7.947824,0 13,59896.1307,2,1.084506,0.822793,0 13,59896.1384,1,-0.561922,0.703150,0 13,59896.1460,3,0.284255,1.393400,0 13,59896.1569,4,0.466466,2.192684,0 13,59896.1679,5,0.433727,5.948182,0 13,59899.1519,2,0.102152,1.365073,0 13,59899.1595,1,1.624047,1.388812,0 13,59899.1672,3,1.820581,2.378815,0 13,59899.1781,4,-3.816161,3.709328,0 13,59899.1891,5,-8.805643,7.905914,0 13,59902.1384,2,0.196845,1.186146,0 13,59902.1460,1,0.993784,1.079431,0 13,59902.1537,3,1.891725,1.943785,0 13,59902.1646,4,-1.455820,2.762865,0 13,59902.1755,5,-0.622898,7.024046,0 13,59904.1053,0,-4.067552,2.281505,0 13,59905.0555,0,0.633017,2.504281,0 13,59906.0562,0,1.127558,2.030226,0 13,59907.0567,0,0.456890,1.749502,0 13,59908.0681,0,-1.925056,2.683702,0 13,59909.0582,0,-0.678187,1.834134,0 13,59910.0503,0,1.194135,1.869489,0 13,59914.0526,2,-1.644144,1.438881,0 13,59914.0602,1,-1.866351,1.956770,0 13,59914.0678,3,-0.710913,1.686324,0 13,59914.0788,4,5.825469,2.269697,0 13,59914.0897,5,3.513655,5.356808,0 13,59924.1060,2,-0.028742,1.131780,0 13,59924.1136,1,-1.108478,1.574127,0 13,59924.1212,3,-1.323133,1.728779,0 13,59924.1322,4,-5.513097,2.527147,0 13,59924.1431,5,4.266651,5.633382,0 13,59927.1074,2,-0.400232,1.190879,0 13,59927.1151,1,1.286833,1.135341,0 13,59927.1227,3,2.155155,1.593354,0 13,59927.1336,4,2.415218,2.299777,0 13,59927.1446,5,3.778454,5.891465,0 13,59930.1236,2,1.373614,0.942084,0 13,59930.1312,1,-1.140287,0.828822,0 13,59930.1388,3,0.832613,1.283629,0 13,59930.1498,4,2.345280,2.033006,0 13,59930.1607,5,-4.211092,5.330056,0 13,59933.1249,2,0.395000,0.691634,0 13,59933.1325,1,-0.423847,0.693980,0 13,59933.1401,3,-1.108444,1.268825,0 13,59933.1511,4,-0.328867,1.913908,0 13,59933.1620,5,7.551563,5.429599,0 13,59934.0638,0,-1.113758,1.141001,0 
13,59935.0646,0,-0.818122,1.122888,0 13,59936.0642,0,2.141036,1.531045,0 13,59937.0650,0,-1.277371,1.639949,0 13,59938.0647,0,4.045178,2.015162,0 13,59939.0650,0,-0.050647,2.194245,0 13,60165.3032,2,-0.244917,1.195023,0 13,60165.3109,1,-0.051452,1.347307,0 13,60165.3186,3,-1.446614,1.518360,0 13,60165.3295,4,-1.239533,2.022870,0 13,60165.3405,5,-2.233357,4.782032,0 13,60168.2892,2,-0.272922,0.834000,0 13,60168.2970,1,-0.551885,0.787909,0 13,60168.3047,3,0.215221,1.300493,0 13,60168.3157,4,-0.354714,2.065893,0 13,60168.3267,5,9.800998,4.821743,0 13,60176.2820,0,-2.711282,2.363240,0 13,60177.2726,0,2.907485,2.777568,0 13,60181.4088,2,-0.778592,0.924176,0 13,60181.4164,1,0.776635,1.262891,0 13,60181.4232,3,5.557160,3.239328,0 13,60183.2660,2,0.008992,1.915637,0 13,60183.2736,1,3.495092,2.936934,0 13,60183.2812,3,-4.313775,2.548015,0 13,60183.2922,4,4.681122,2.996080,0 13,60183.3031,5,-0.644280,6.178194,0 13,60195.2812,2,-0.992008,0.933265,0 13,60195.2888,1,-0.199896,1.020470,0 13,60195.2964,3,1.175473,1.626211,0 13,60195.3073,4,-0.928975,2.339218,0 13,60195.3183,5,3.537798,6.276374,0 13,60198.2690,0,0.879443,2.172960,0 13,60199.2186,0,1.393119,2.161035,0 13,60200.2139,0,0.226138,2.589170,0 13,60201.2072,0,0.024500,2.490959,0 13,60202.2089,0,-1.152337,2.483400,0 13,60209.1811,2,-0.534034,1.565458,0 13,60209.1888,1,0.822128,1.410793,0 13,60209.1965,3,-0.684217,1.831758,0 13,60209.2075,4,-3.066642,2.909104,0 13,60209.2184,5,0.602669,6.696602,0 13,60212.1675,2,1.117115,1.936030,0 13,60212.1753,1,1.782217,2.752773,0 13,60212.1830,3,0.321750,2.295181,0 13,60212.1941,4,-1.936185,2.590936,0 13,60212.2050,5,11.339810,5.666715,0 13,60223.2416,2,0.123572,1.182341,0 13,60223.2493,1,0.568967,1.241492,0 13,60223.2569,3,0.040453,1.648758,0 13,60223.2678,4,0.459117,2.472937,0 13,60223.2788,5,-7.159738,5.794432,0 13,60226.3337,2,0.034159,0.944593,0 13,60226.3413,1,1.465278,0.946326,0 13,60226.3489,3,0.247305,1.793391,0 13,60226.3599,4,4.326116,3.071592,0 
13,60226.3708,5,1.558216,8.109918,0 13,60238.3197,2,-0.738155,1.053325,0 13,60238.3273,1,-0.990783,1.152318,0 13,60238.3349,3,1.786266,2.171606,0 13,60238.3459,4,3.042687,3.829919,0 13,60238.3568,5,8.879921,11.257108,0 13,60241.0870,2,-0.147198,1.999162,0 13,60241.0948,1,-1.383542,3.066069,0 13,60241.1025,3,-0.347256,2.444988,0 13,60241.1136,4,2.242954,2.917921,0 13,60241.1245,5,4.122897,5.941785,0 13,60250.1708,2,2.348668,2.103043,0 13,60250.1957,1,1.762817,2.811031,0 13,60250.2034,3,-4.162498,2.927869,0 13,60250.2143,4,3.563345,3.983845,0 13,60250.2253,5,5.870506,7.937796,0 13,60261.1296,0,2.993462,2.083503,0 13,60262.0550,0,1.952810,2.311998,0 13,60263.0556,0,1.620923,1.977922,0 13,60264.0559,0,3.218953,2.187246,0 13,60265.0780,0,-4.046216,2.407065,0 13,60268.0449,2,1.923153,1.404630,0 13,60268.0525,1,-0.380431,1.849085,0 13,60268.0601,3,1.653331,2.130050,0 13,60268.0711,4,-2.881836,2.734059,0 13,60268.0820,5,7.530805,6.747447,0 13,60278.0993,2,-1.574151,1.643888,0 13,60278.1069,1,-4.148108,2.273913,0 13,60278.1145,3,-1.136458,2.169661,0 13,60278.1255,4,-4.712543,3.059077,0 13,60278.1364,5,-4.843711,7.638536,0 13,60281.1023,2,-0.579702,0.798074,0 13,60281.1099,1,0.559643,0.795803,0 13,60281.1175,3,0.985490,1.369862,0 13,60281.1285,4,-0.125175,1.990803,0 13,60281.1394,5,-6.211294,5.176151,0 13,60284.1027,2,0.920098,0.837512,0 13,60284.1104,1,-0.531105,0.761165,0 13,60284.1180,3,-1.038009,1.160178,0 13,60284.1289,4,-0.543047,1.702779,0 13,60284.1399,5,-5.946274,4.435276,0 13,60287.1047,2,-0.388991,1.250426,0 13,60287.1123,1,0.622866,1.083712,0 13,60287.1200,3,-1.365885,1.796073,0 13,60287.1309,4,-1.566869,2.963949,0 13,60287.1418,5,-12.680235,7.013420,0 13,60290.0761,0,-0.996401,1.912932,0 13,60291.0689,0,-1.639395,1.661116,0 13,60292.0699,0,-0.087568,1.855572,0 13,60293.0699,0,3.384405,1.578842,0 13,60294.0708,0,0.056788,1.397227,0 13,60532.3019,2,24.529644,1.046373,1 13,60532.3097,1,23.404964,0.992504,1 13,60532.3173,3,36.069386,1.568371,1 
13,60532.3282,4,42.765503,2.305008,1 13,60532.3392,5,36.567162,5.439748,1 13,60535.2802,2,23.662449,1.422315,1 13,60535.2879,1,20.202259,1.361849,1 13,60535.2957,3,39.966290,2.106815,1 13,60535.3068,4,39.323189,3.034410,1 13,60535.3177,5,25.412567,7.170496,1 13,60538.2826,2,19.899044,1.856537,1 13,60538.2903,1,14.983138,2.683123,1 13,60538.2980,3,32.445835,1.936947,1 13,60538.3089,4,37.566696,2.327296,1 13,60538.3199,5,36.802090,5.025538,1 13,60554.2651,0,2.890699,2.258298,0 13,60555.2411,0,1.421973,2.038275,0 13,60556.2370,0,5.816270,2.553418,0 13,60557.2322,0,2.976124,1.849058,0 13,60558.2332,0,3.465247,2.503530,0 13,60559.2274,0,2.022449,2.213130,0 13,60560.2268,0,3.503388,3.087183,0 13,60567.3291,2,14.211590,0.945795,1 13,60567.3368,1,8.278180,1.052801,1 13,60567.3444,3,24.237070,1.263879,1 13,60567.3553,4,31.986134,1.675546,1 13,60567.3663,5,26.146296,4.267163,1 13,60580.1736,2,10.443220,1.455966,1 13,60580.1813,1,4.004329,1.395011,0 13,60580.1889,3,21.601105,2.202044,1 13,60580.1999,4,33.927616,3.210988,1 13,60580.2108,5,31.062853,7.647927,0 13,60582.1681,0,1.094513,2.405975,0 13,60583.1640,0,-1.259893,2.979695,0 13,60584.1591,0,1.617939,2.344479,0 13,60585.1601,0,1.637716,1.874226,0 13,60586.1564,0,0.117864,1.815223,0 13,60587.1540,0,0.302183,2.121386,0 13,60588.1461,0,3.094076,1.792624,0 13,60593.1209,2,10.800429,1.153115,1 13,60593.1287,1,4.406773,1.308495,0 13,60593.1365,3,16.092447,1.545535,1 13,60593.1476,4,24.343342,2.181189,1 13,60593.1585,5,19.054943,5.111032,1 13,60596.1351,2,12.383826,2.473228,0 13,60596.1427,1,0.372005,3.515397,0 13,60596.1504,3,16.204580,2.651616,1 13,60596.1613,4,26.244993,3.235833,1 13,60596.1723,5,20.389833,7.186456,0 13,60605.0908,2,7.542615,0.977457,1 13,60605.0986,1,3.585346,0.917699,0 13,60605.1063,3,15.296724,1.463083,1 13,60605.1174,4,21.654158,2.154012,1 13,60605.1283,5,12.261124,5.138537,1 13,60608.0836,2,7.212741,0.886919,1 13,60608.0913,1,3.103603,0.823795,0 13,60608.0991,3,15.016788,1.333418,1 
13,60608.1101,4,22.194675,1.964862,1 13,60608.1211,5,16.844475,4.664538,0 13,60611.0756,2,5.219585,0.772730,1 13,60611.0833,1,2.315308,0.703447,0 13,60611.0911,3,13.987851,1.177967,1 13,60611.1021,4,23.238731,1.752712,1 13,60611.1130,5,13.764248,4.170574,0 13,60612.0813,0,-0.500034,1.440229,0 13,60613.0818,0,1.002482,1.524789,0 13,60614.0803,0,-1.957333,1.736466,0 13,60615.0761,0,-1.864921,1.981273,0 13,60616.0769,0,-0.823935,2.007182,0 13,60617.0737,0,4.029193,1.584846,0 13,60620.1444,0,1.135807,1.713108,0 13,60621.2673,2,5.227184,0.942700,1 13,60621.2749,1,0.979372,0.945305,0 13,60621.2825,3,10.753699,1.650229,1 13,60621.2934,4,15.025331,2.743479,1 13,60621.3044,5,20.601212,7.516092,0 13,60624.1290,2,4.608896,1.191301,0 13,60624.1366,1,3.302126,1.565634,0 13,60624.1442,3,13.692765,1.502649,1 13,60624.1551,4,22.563152,2.025444,1 13,60624.1661,5,17.466129,4.765408,0 13,60633.0541,2,5.740388,1.090800,1 13,60633.0617,1,-0.248447,1.055601,0 13,60633.0693,3,11.721886,1.678442,1 13,60633.0803,4,21.894281,2.489034,1 13,60633.0912,5,8.198497,5.988672,0 13,60636.0482,2,4.445689,1.029320,0 13,60636.0558,1,0.510712,0.992636,0 13,60636.0635,3,8.838101,1.589973,0 13,60636.0744,4,18.146425,2.370814,1 13,60636.0854,5,20.537880,5.701680,0 13,60640.0972,2,2.708286,0.871447,0 13,60640.1049,1,0.387127,0.847005,0 13,60640.1125,3,11.844381,1.395034,1 13,60640.1234,4,16.336960,2.129130,1 13,60640.1344,5,22.784824,5.287734,0 13,60642.0643,0,-0.045857,2.075920,0 13,60643.0521,0,-0.339986,1.668015,0 13,60644.0621,0,-0.061904,1.665856,0 13,60645.0625,0,-1.683454,1.768579,0 13,60646.0636,0,-1.090810,1.782456,0 13,60647.0635,0,-0.917836,1.968023,0 13,60648.0642,0,-0.471162,1.443392,0 13,60652.1289,2,2.063019,0.939241,0 13,60652.1365,1,0.914091,1.117558,0 13,60652.1441,3,8.505517,1.381162,1 13,60652.1550,4,20.247869,2.050198,1 13,60652.1660,5,4.584575,5.200393,0 14,59798.3205,2,14.465278,1.364599,1 14,59798.3281,1,13.748290,1.165200,1 14,59798.3357,3,8.555202,1.766522,1 
14,59798.3466,4,7.253281,2.603756,0 14,59798.3576,5,1.134257,5.365180,0 14,59801.3553,2,9.855556,2.449258,0 14,59801.3629,1,10.256726,3.538008,0 14,59801.3705,3,10.469539,2.286622,0 14,59801.3815,4,4.839510,2.516017,0 14,59801.3924,5,6.016302,5.381228,0 14,59818.2740,0,-1.523922,1.781903,0 14,59819.2541,0,6.043163,2.455132,0 14,59820.2522,0,-3.440181,1.895149,0 14,59821.2478,0,-1.079028,2.430641,0 14,59822.2433,0,-0.827454,2.848530,0 14,59823.2659,0,5.701578,2.706588,0 14,59826.3105,2,2.735408,0.921596,0 14,59826.3181,1,2.361817,0.925133,0 14,59826.3258,3,4.882393,1.509568,1 14,59826.3367,4,5.898125,2.157517,0 14,59826.3477,5,7.172944,5.536250,0 14,59842.2456,2,1.210632,0.849698,0 14,59842.2532,1,0.480856,0.813325,0 14,59842.2608,3,3.207553,1.321817,0 14,59842.2718,4,4.451898,2.166812,0 14,59842.2827,5,4.518466,6.066945,0 14,59851.1792,0,4.186630,2.586060,0 14,59854.1485,2,0.507158,1.223279,0 14,59854.1563,1,1.509088,1.335596,0 14,59854.1640,3,3.079645,1.901563,0 14,59854.1750,4,2.476080,2.769468,0 14,59854.1860,5,5.571058,5.787987,0 14,59857.1408,2,2.902585,2.129320,0 14,59857.1485,1,0.076045,2.911045,0 14,59857.1563,3,5.179635,2.093322,0 14,59857.1673,4,4.421298,2.676636,0 14,59857.1782,5,4.276851,6.626348,0 14,59867.1112,2,-0.034593,1.082066,0 14,59867.1189,1,-1.146521,0.972931,0 14,59867.1267,3,4.685015,1.540005,0 14,59867.1377,4,3.080778,2.356951,0 14,59867.1487,5,-3.318216,5.827800,0 14,59870.1049,2,0.554243,0.970361,0 14,59870.1126,1,0.375991,0.821212,0 14,59870.1204,3,4.517774,1.374431,0 14,59870.1314,4,1.906460,2.282754,0 14,59870.1424,5,-0.145795,5.276641,0 14,59873.0971,2,2.108685,1.041195,0 14,59873.1049,1,0.095400,1.040191,0 14,59873.1126,3,3.909108,1.442765,0 14,59873.1236,4,9.230726,2.071694,0 14,59873.1346,5,5.649127,4.588614,0 14,59874.1461,0,-3.214277,1.608558,0 14,59875.0995,0,2.425352,2.028404,0 14,59876.0980,0,3.646141,2.440229,0 14,59877.0976,0,-1.473760,1.759196,0 14,59878.0964,0,0.576515,2.010597,0 14,59879.0895,0,0.572945,1.861002,0 
14,59880.1017,0,-0.224757,1.368790,0 14,59884.1760,2,-0.132164,0.767148,0 14,59884.1836,1,-0.991678,0.885456,0 14,59884.1913,3,2.358108,1.139322,0 14,59884.2022,4,3.557226,1.610064,0 14,59884.2132,5,1.323540,4.209188,0 14,59887.2856,2,-2.655194,1.824434,0 14,59887.2933,1,-3.118680,2.202677,0 14,59887.3009,3,4.665102,2.350384,0 14,59887.3118,4,1.135563,3.143775,0 14,59887.3228,5,4.491710,7.945457,0 14,59896.1307,2,-0.963295,0.821164,0 14,59896.1384,1,-0.071138,0.702119,0 14,59896.1460,3,2.277665,1.393450,0 14,59896.1569,4,7.248254,2.194077,0 14,59896.1679,5,4.658720,5.946997,0 14,59899.1519,2,-1.774250,1.363683,0 14,59899.1595,1,-1.174699,1.385324,0 14,59899.1672,3,-0.953666,2.376407,0 14,59899.1781,4,8.315199,3.709089,0 14,59899.1891,5,-5.444651,7.901953,0 14,59902.1384,2,0.053353,1.184878,0 14,59902.1460,1,-0.247240,1.076702,0 14,59902.1537,3,3.547660,1.943007,0 14,59902.1646,4,3.456085,2.762274,0 14,59902.1755,5,-2.950504,7.020600,0 14,59904.1053,0,4.177117,2.292928,0 14,59905.0555,0,-1.216755,2.497270,0 14,59906.0562,0,2.410480,2.032158,0 14,59907.0567,0,-1.391141,1.743943,0 14,59908.0681,0,3.657772,2.691214,0 14,59909.0582,0,0.521167,1.833233,0 14,59910.0503,0,-0.100852,1.860389,0 14,59914.0526,2,1.198847,1.437903,0 14,59914.0602,1,-4.574095,1.954059,0 14,59914.0678,3,0.367522,1.685196,0 14,59914.0788,4,2.276051,2.267016,0 14,59914.0897,5,10.662569,5.357585,0 14,59924.1060,2,0.898902,1.131097,0 14,59924.1136,1,-0.533751,1.571826,0 14,59924.1212,3,0.148105,1.727512,0 14,59924.1322,4,4.691455,2.527090,0 14,59924.1431,5,-2.120271,5.628747,0 14,59927.1074,2,-0.466719,1.189681,0 14,59927.1151,1,-0.370304,1.132207,0 14,59927.1227,3,2.069784,1.592095,0 14,59927.1336,4,2.988137,2.298649,0 14,59927.1446,5,2.182713,5.887877,0 14,59930.1236,2,0.240297,0.940370,0 14,59930.1312,1,-0.920183,0.827606,0 14,59930.1388,3,1.243185,1.282906,0 14,59930.1498,4,3.796952,2.032441,0 14,59930.1607,5,4.695318,5.329674,0 14,59933.1249,2,0.006509,0.690589,0 
14,59933.1325,1,1.157228,0.695202,0 14,59933.1401,3,2.223172,1.269250,0 14,59933.1511,4,2.050447,1.913679,0 14,59933.1620,5,6.551545,5.426427,0 14,59934.0638,0,-1.038826,1.138865,0 14,59935.0646,0,-0.855022,1.120789,0 14,59936.0642,0,4.664287,1.541755,0 14,59937.0650,0,0.068496,1.637206,0 14,59938.0647,0,-0.037584,1.994424,0 14,59939.0650,0,0.482500,2.191997,0 14,60165.3032,2,-0.497132,1.193823,0 14,60165.3109,1,2.598282,1.347751,0 14,60165.3186,3,-2.635130,1.517240,0 14,60165.3295,4,5.500406,2.024001,0 14,60165.3405,5,-1.459476,4.779670,0 14,60168.2892,2,-0.132310,0.833159,0 14,60168.2970,1,1.305878,0.789044,0 14,60168.3047,3,1.487449,1.300279,0 14,60168.3157,4,2.190609,2.065560,0 14,60168.3267,5,-2.313916,4.814189,0 14,60176.2820,0,-2.258721,2.358804,0 14,60177.2726,0,-0.240173,2.761198,0 14,60181.4088,2,-0.118050,0.923237,0 14,60181.4164,1,-1.757872,1.260326,0 14,60181.4232,3,-0.566288,3.235162,0 14,60183.2660,2,-0.732747,1.913709,0 14,60183.2736,1,0.012732,2.931251,0 14,60183.2812,3,1.402672,2.546405,0 14,60183.2922,4,-1.725346,2.993038,0 14,60183.3031,5,0.494266,6.175346,0 14,60195.2812,2,3.029382,0.934420,0 14,60195.2888,1,-0.670324,1.018990,0 14,60195.2964,3,-0.662677,1.624419,0 14,60195.3073,4,-3.872562,2.337922,0 14,60195.3183,5,2.712977,6.272959,0 14,60198.2690,0,2.956521,2.177715,0 14,60199.2186,0,1.331341,2.156719,0 14,60200.2139,0,1.351745,2.588871,0 14,60201.2072,0,2.853797,2.498285,0 14,60202.2089,0,1.906982,2.486693,0 14,60209.1811,2,-1.602512,1.563901,0 14,60209.1888,1,-0.671917,1.407978,0 14,60209.1965,3,0.374204,1.830516,0 14,60209.2075,4,2.568392,2.908100,0 14,60209.2184,5,6.678216,6.695596,0 14,60212.1675,2,3.776216,1.935000,0 14,60212.1753,1,0.021062,2.747988,0 14,60212.1830,3,-1.807372,2.293340,0 14,60212.1941,4,2.568171,2.590221,0 14,60212.2050,5,10.840719,5.663691,0 14,60223.2416,2,-0.194212,1.181089,0 14,60223.2493,1,-0.036469,1.239146,0 14,60223.2569,3,1.530641,1.648199,0 14,60223.2678,4,1.801575,2.471934,0 
14,60223.2788,5,4.744702,5.793488,0 14,60226.3337,2,-0.744695,0.943627,0 14,60226.3413,1,-0.106145,0.942899,0 14,60226.3489,3,2.442077,1.793019,0 14,60226.3599,4,1.509347,3.068994,0 14,60226.3708,5,-6.627614,8.105484,0 14,60238.3197,2,-0.863180,1.052267,0 14,60238.3273,1,1.557186,1.152551,0 14,60238.3349,3,-4.336288,2.169387,0 14,60238.3459,4,4.612921,3.828061,0 14,60238.3568,5,-2.021824,11.249375,0 14,60241.0870,2,0.738195,1.997376,0 14,60241.0948,1,-1.971425,3.061613,0 14,60241.1025,3,-2.025038,2.443142,0 14,60241.1136,4,-0.312927,2.915593,0 14,60241.1245,5,3.056638,5.938406,0 14,60250.1708,2,-2.482518,2.100286,0 14,60250.1957,1,-0.029834,2.806254,0 14,60250.2034,3,4.678575,2.926827,0 14,60250.2143,4,-3.449543,3.980825,0 14,60250.2253,5,-3.656875,7.932066,0 14,60261.1296,0,-3.352771,2.067494,0 14,60262.0550,0,-3.894481,2.299585,0 14,60263.0556,0,0.061202,1.966700,0 14,60264.0559,0,-0.441779,2.168793,0 14,60265.0780,0,-0.070926,2.402439,0 14,60268.0449,2,-2.115148,1.402396,0 14,60268.0525,1,-1.156284,1.846402,0 14,60268.0601,3,-0.248761,2.127820,0 14,60268.0711,4,1.514031,2.732877,0 14,60268.0820,5,-11.142164,6.741534,0 14,60278.0993,2,0.058940,1.642234,0 14,60278.1069,1,1.004120,2.270966,0 14,60278.1145,3,-0.427660,2.168000,0 14,60278.1255,4,0.148053,3.057248,0 14,60278.1364,5,14.839427,7.639682,0 14,60281.1023,2,0.190632,0.797410,0 14,60281.1099,1,-0.334674,0.793767,0 14,60281.1175,3,-1.839893,1.368291,0 14,60281.1285,4,-1.943368,1.989664,0 14,60281.1394,5,3.423336,5.175146,0 14,60284.1027,2,0.880585,0.836640,0 14,60284.1104,1,-1.071148,0.760074,0 14,60284.1180,3,-0.493422,1.159284,0 14,60284.1289,4,-1.769790,1.701805,0 14,60284.1399,5,10.491048,4.439059,0 14,60287.1047,2,-2.440768,1.249228,0 14,60287.1123,1,-0.587990,1.081420,0 14,60287.1200,3,-0.658860,1.794693,0 14,60287.1309,4,6.071679,2.963972,0 14,60287.1418,5,-0.912052,7.009711,0 14,60290.0761,0,0.791311,1.912837,0 14,60291.0689,0,-0.916465,1.657989,0 14,60292.0699,0,-0.940145,1.852110,0 
14,60293.0699,0,-0.065888,1.558097,0 14,60294.0708,0,-0.362213,1.394291,0 14,60532.3019,2,-0.128229,1.022716,0 14,60532.3097,1,-0.763027,0.950918,0 14,60532.3173,3,-0.986068,1.539248,0 14,60532.3282,4,-0.845234,2.279716,0 14,60532.3392,5,8.809470,5.421995,0 14,60535.2802,2,-0.482045,1.404915,0 14,60535.2879,1,1.358840,1.336113,0 14,60535.2957,3,-0.531306,2.081220,0 14,60535.3068,4,-3.251427,3.016502,0 14,60535.3177,5,-1.253254,7.157213,0 14,60538.2826,2,1.802223,1.846149,0 14,60538.2903,1,-0.026881,2.671491,0 14,60538.2980,3,1.533429,1.916902,0 14,60538.3089,4,-0.856327,2.306201,0 14,60538.3199,5,-1.663799,5.002286,0 14,60554.2651,0,-1.160009,2.241721,0 14,60555.2411,0,-2.432893,2.027424,0 14,60556.2370,0,2.398724,2.534231,0 14,60557.2322,0,-1.568709,1.829169,0 14,60558.2332,0,-4.116055,2.484439,0 14,60559.2274,0,0.190265,2.200325,0 14,60560.2268,0,1.347325,3.073875,0 14,60567.3291,2,0.910389,0.934006,0 14,60567.3368,1,-1.444133,1.041686,0 14,60567.3444,3,-1.500671,1.243064,0 14,60567.3553,4,2.684830,1.653721,0 14,60567.3663,5,0.345307,4.248466,0 14,60580.1736,2,0.807077,1.449346,0 14,60580.1813,1,2.870951,1.391824,0 14,60580.1889,3,-0.262873,2.190222,0 14,60580.1999,4,-6.388680,3.196808,0 14,60580.2108,5,9.181289,7.635880,0 14,60582.1681,0,3.604234,2.412230,0 14,60583.1640,0,-2.491946,2.974151,0 14,60584.1591,0,0.669646,2.335869,0 14,60585.1601,0,0.574023,1.865054,0 14,60586.1564,0,-0.297618,1.811183,0 14,60587.1540,0,-1.218754,2.115987,0 14,60588.1461,0,-1.992268,1.771815,0 14,60593.1209,2,-0.511945,1.144706,0 14,60593.1287,1,-0.914986,1.302187,0 14,60593.1365,3,-1.629705,1.534286,0 14,60593.1476,4,-3.500163,2.167956,0 14,60593.1585,5,-7.225812,5.098632,0 14,60596.1351,2,1.942903,2.467501,0 14,60596.1427,1,-3.828583,3.510221,0 14,60596.1504,3,-0.379595,2.643848,0 14,60596.1613,4,-4.746652,3.225241,0 14,60596.1723,5,-0.740908,7.175334,0 14,60605.0908,2,-0.240784,0.970734,0 14,60605.0986,1,1.228715,0.912676,0 14,60605.1063,3,-0.173487,1.451932,0 
14,60605.1174,4,-3.157050,2.142289,0 14,60605.1283,5,-4.353776,5.129833,0 14,60608.0836,2,-1.290518,0.880033,0 14,60608.0913,1,0.897031,0.818780,0 14,60608.0991,3,-1.789510,1.321652,0 14,60608.1101,4,-0.283683,1.951819,0 14,60608.1211,5,-1.071998,4.652730,0 14,60611.0756,2,0.428420,0.767492,0 14,60611.0833,1,-0.175538,0.697855,0 14,60611.0911,3,-1.351069,1.165860,0 14,60611.1021,4,3.187355,1.739082,0 14,60611.1130,5,-2.026892,4.159950,0 14,60612.0813,0,1.714202,1.449054,0 14,60613.0818,0,-1.208979,1.515573,0 14,60614.0803,0,2.366692,1.746634,0 14,60615.0761,0,0.562962,1.980418,0 14,60616.0769,0,-0.108613,2.003422,0 14,60617.0737,0,1.100774,1.564082,0 14,60620.1444,0,-3.990349,1.704574,0 14,60621.2673,2,-1.006580,0.937842,0 14,60621.2749,1,-0.619488,0.942508,0 14,60621.2825,3,1.860522,1.643903,0 14,60621.2934,4,1.823893,2.737007,0 14,60621.3044,5,-2.549210,7.504488,0 14,60624.1290,2,-1.234065,1.187694,0 14,60624.1366,1,0.769079,1.561521,0 14,60624.1442,3,-1.651142,1.493277,0 14,60624.1551,4,-1.015730,2.012733,0 14,60624.1661,5,-1.555369,4.753550,0 14,60633.0541,2,0.136783,1.086145,0 14,60633.0617,1,-0.152341,1.054063,0 14,60633.0693,3,-0.510173,1.670818,0 14,60633.0803,4,0.315384,2.478524,0 14,60633.0912,5,-6.652580,5.982395,0 14,60636.0482,2,-0.451914,1.025376,0 14,60636.0558,1,0.286247,0.990897,0 14,60636.0635,3,0.255769,1.584027,0 14,60636.0744,4,-0.340420,2.361790,0 14,60636.0854,5,9.500138,5.693579,0 14,60640.0972,2,0.332550,0.868813,0 14,60640.1049,1,-0.473409,0.845206,0 14,60640.1125,3,0.729877,1.386701,0 14,60640.1234,4,-1.934210,2.120461,0 14,60640.1344,5,-4.023204,5.273717,0 14,60642.0643,0,6.371965,2.098372,0 14,60643.0521,0,0.069668,1.665262,0 14,60644.0621,0,-0.031334,1.662740,0 14,60645.0625,0,0.388892,1.767081,0 14,60646.0636,0,-0.740859,1.779115,0 14,60647.0635,0,3.793745,1.980666,0 14,60648.0642,0,-2.506170,1.440771,0 14,60652.1289,2,2.536718,0.938627,0 14,60652.1365,1,-0.345612,1.114977,0 14,60652.1441,3,-0.218325,1.374793,0 
14,60652.1550,4,3.384219,2.040273,0 14,60652.1660,5,5.932649,5.198509,0 17,59750.4229,2,0.384775,1.502702,0 17,59750.4306,1,2.970657,2.100801,0 17,59750.4383,3,-3.890317,2.298941,0 17,59750.4450,4,-7.424517,10.310197,0 17,59752.4070,2,1.180267,1.063318,0 17,59752.4147,1,-0.427451,1.103617,0 17,59752.4224,3,0.804619,1.459435,0 17,59752.4334,4,0.082026,2.345975,0 17,59752.4435,5,-12.127212,9.469489,0 17,59767.2968,2,0.300545,0.879810,0 17,59767.3045,1,-0.789039,0.757829,0 17,59767.3122,3,-2.516992,1.335309,0 17,59767.3233,4,-0.305087,2.137903,0 17,59767.3343,5,-2.361575,4.981940,0 17,59770.2179,2,2.652607,2.031536,0 17,59770.2256,1,0.104747,2.751110,0 17,59770.2334,3,-1.251170,2.138434,0 17,59770.2445,4,1.627103,2.665143,0 17,59770.2557,5,3.234555,6.275570,0 17,59779.3188,2,-1.560282,2.314641,0 17,59779.3265,1,1.138921,2.800010,0 17,59779.3342,3,1.399184,2.246128,0 17,59779.3452,4,-4.086601,2.966733,0 17,59779.3562,5,3.600588,7.075419,0 17,59782.1897,2,-0.205182,1.441666,0 17,59782.1974,1,-0.008989,1.378359,0 17,59782.2051,3,-3.370768,2.336834,0 17,59782.2162,4,-1.506593,3.089562,0 17,59782.2274,5,-1.340868,7.552328,0 17,59797.2861,2,0.574704,1.059202,0 17,59797.2938,1,-1.925344,1.095736,0 17,59797.3015,3,-1.303792,1.916743,0 17,59797.3126,4,-0.918545,2.786953,0 17,59797.3237,5,-7.784748,6.963860,0 17,59800.3168,2,0.594011,2.277387,0 17,59800.3244,1,6.002035,3.310603,0 17,59800.3320,3,0.828272,2.636445,0 17,59800.3429,4,-2.143990,3.045394,0 17,59800.3539,5,2.182031,6.526192,0 17,59807.1738,2,0.345367,1.600768,0 17,59807.1815,1,1.497795,2.061084,0 17,59807.1892,3,-1.738689,1.721755,0 17,59807.2003,4,-2.008608,2.187723,0 17,59807.2114,5,-4.591490,5.073585,0 17,59810.1045,2,-0.190427,1.024519,0 17,59810.1122,1,1.409750,0.987204,0 17,59810.1200,3,2.097727,1.766024,0 17,59810.1311,4,4.661858,2.723413,0 17,59810.1422,5,1.852001,6.541757,0 17,59813.1044,2,-0.609008,0.979608,0 17,59813.1122,1,-1.283370,0.947661,0 17,59813.1199,3,0.443231,1.563090,0 
17,59813.1310,4,0.463331,2.578759,0 17,59813.1422,5,-4.206830,5.891186,0 17,59819.1532,0,-0.111748,1.900487,0 17,59820.1047,0,-1.127300,1.845100,0 17,59821.1026,0,-2.983045,2.653126,0 17,59822.1105,0,-3.786463,3.135153,0 17,59823.1505,0,-6.251946,2.924953,0 17,59835.0600,2,-0.105554,1.651957,0 17,59835.0678,1,-3.813735,2.461054,0 17,59835.0755,3,-5.183056,2.248735,0 17,59835.0866,4,-3.032220,2.768835,0 17,59835.0978,5,-1.849971,5.832443,0 17,59839.0306,2,1.615393,1.178528,0 17,59839.0384,1,-0.088986,1.136387,0 17,59839.0461,3,-3.522223,1.587615,0 17,59839.0573,4,-2.323542,2.134739,0 17,59839.0684,5,2.161493,5.221899,0 17,59842.0207,2,1.318505,1.033401,0 17,59842.0285,1,0.922532,1.039005,0 17,59842.0362,3,1.569138,1.599077,0 17,59842.0473,4,3.807324,2.315629,0 17,59842.0585,5,-1.946610,5.253097,0 17,59851.1114,0,0.466809,1.833261,0 17,59854.0796,2,-0.432977,1.106505,0 17,59854.0873,1,-0.349620,1.186553,0 17,59854.0950,3,-1.028248,1.475277,0 17,59854.1061,4,0.624902,2.189718,0 17,59854.1172,5,-6.577788,5.223208,0 17,59857.0453,2,1.048279,1.770880,0 17,59857.0531,1,4.293002,2.442868,0 17,59857.0608,3,0.670081,1.885174,0 17,59857.0719,4,-1.393756,2.439591,0 17,59857.0830,5,7.867962,5.441516,0 17,59864.0162,2,1.446581,0.850881,0 17,59864.0239,1,1.881013,0.881634,0 17,59864.0316,3,1.176724,1.403741,0 17,59864.0428,4,1.061918,2.333516,0 17,59864.0539,5,7.476207,5.917978,0 17,59867.0178,2,0.380713,1.309081,0 17,59867.0255,1,-3.510653,1.302897,0 17,59867.0332,3,-2.578208,2.183999,0 17,59867.0443,4,1.046922,3.227464,0 17,59867.0554,5,-14.202744,9.457583,0 17,59870.0194,2,-1.554325,0.927003,0 17,59870.0272,1,-0.831158,0.920545,0 17,59870.0349,3,-1.187623,1.500118,0 17,59870.0459,4,2.125045,2.528746,0 17,59870.0571,5,2.077787,5.911781,0 17,59873.0212,2,-0.457801,0.738598,0 17,59873.0289,1,2.796051,0.760300,0 17,59873.0366,3,0.138800,1.232345,0 17,59873.0477,4,0.273784,1.886916,0 17,59873.0588,5,-5.356641,4.801971,0 17,59874.0599,0,-2.398834,1.584590,0 
17,59875.0311,0,-0.540180,1.315395,0 17,59876.0231,0,0.411913,2.120848,0 17,59877.0238,0,-0.891288,1.592397,0 17,59878.0246,0,0.252658,1.625712,0 17,59879.0248,0,0.494818,1.668736,0 17,59880.0258,0,1.609941,1.357399,0 17,59884.0823,2,1.431002,1.091537,0 17,59884.0900,1,-0.669697,1.493729,0 17,59884.0976,3,0.632236,1.522321,0 17,59884.1085,4,2.388964,2.008811,0 17,59884.1195,5,-3.279281,4.883777,0 17,59887.0298,2,-1.951323,1.872626,0 17,59887.0375,1,-0.140049,2.947258,0 17,59887.0451,3,-0.987463,2.277290,0 17,59887.0562,4,-1.669648,2.256846,0 17,59887.0673,5,4.776433,5.354571,0 17,60118.4163,0,1.237189,1.560750,0 17,60124.2541,2,-0.027002,2.279922,0 17,60124.2618,1,-1.582832,2.932222,0 17,60124.2695,3,-2.243939,2.501337,0 17,60124.2807,4,-2.010008,2.976800,0 17,60124.2918,5,-3.812106,6.686803,0 17,60140.2290,0,2.889142,1.885088,0 17,60141.2225,0,-2.675623,3.532677,0 17,60142.2202,0,3.701951,3.092139,0 17,60143.2212,0,0.107055,2.421978,0 17,60144.2186,0,2.121859,1.934957,0 17,60145.2123,0,3.519005,2.431906,0 17,60153.2274,2,1.331808,1.381732,0 17,60153.2351,1,1.678943,1.884794,0 17,60153.2428,3,-0.669065,1.646313,0 17,60153.2539,4,-2.336847,2.069594,0 17,60153.2650,5,-3.040281,5.269622,0 17,60162.1477,2,0.274822,1.883733,0 17,60162.1554,1,-1.090832,2.591505,0 17,60162.1631,3,-4.857473,2.765956,0 17,60162.1742,4,-10.275881,3.141578,0 17,60162.1853,5,-9.252888,7.983027,0 17,60165.1369,2,-0.468279,0.846974,0 17,60165.1446,1,-0.430437,0.708165,0 17,60165.1524,3,-0.581242,1.364931,0 17,60165.1635,4,3.187243,2.182051,0 17,60165.1746,5,6.814413,5.636336,0 17,60168.1260,2,1.795461,0.881347,0 17,60168.1337,1,-0.350579,0.854263,0 17,60168.1414,3,-2.100255,1.659393,0 17,60168.1525,4,-1.526452,2.377539,0 17,60168.1637,5,2.514493,5.409767,0 17,60176.1332,0,-1.022222,2.365174,0 17,60177.1370,0,-3.289955,2.790762,0 17,60181.3147,2,-1.503265,1.258997,0 17,60181.3223,1,1.620242,1.237350,0 17,60181.3299,3,2.305993,2.212692,0 17,60181.3409,4,-0.697285,4.016526,0 
17,60181.3518,5,3.620666,8.753998,0 17,60184.3625,2,-0.132555,2.135256,0 17,60184.3701,1,-0.614091,2.560603,0 17,60184.3777,3,0.653611,2.486900,0 17,60184.3887,4,1.677764,2.891815,0 17,60184.3996,5,7.855544,6.898280,0 17,60194.1575,2,1.309505,1.067185,0 17,60194.1652,1,-0.309205,1.053532,0 17,60194.1729,3,-0.334617,1.786434,0 17,60194.1839,4,0.655740,3.139240,0 17,60194.1926,5,5.044827,11.278636,0 17,60197.1181,2,0.552177,1.097758,0 17,60197.1258,1,0.541816,0.909236,0 17,60197.1335,3,0.639729,1.336890,0 17,60197.1446,4,-0.978055,2.037800,0 17,60197.1557,5,4.619338,4.987423,0 17,60198.1077,0,2.149141,2.452106,0 17,60199.0914,0,5.486719,3.143868,0 17,60200.0650,0,1.228997,2.524841,0 17,60201.0680,0,6.056491,2.897650,0 17,60202.0552,0,3.771713,2.094126,0 17,60206.1107,0,1.964452,1.559410,0 17,60207.1469,0,1.315419,2.819893,0 17,60208.0229,2,-0.600505,1.423023,0 17,60208.0307,1,0.017527,1.343573,0 17,60208.0384,3,-1.550505,1.831250,0 17,60208.0495,4,3.882280,2.726225,0 17,60208.0606,5,9.559813,7.414577,0 17,60211.0124,2,1.860704,2.250480,0 17,60211.0202,1,-2.090354,3.034887,0 17,60211.0279,3,0.410192,2.599820,0 17,60211.0390,4,-1.062310,3.468099,0 17,60211.0502,5,8.321412,9.169344,0 17,60221.0153,2,0.448097,0.997295,0 17,60221.0230,1,0.082163,0.883182,0 17,60221.0308,3,1.515744,1.365631,0 17,60221.0419,4,-0.955922,2.171834,0 17,60221.0530,5,6.189006,5.659595,0 17,60224.0140,2,-0.977404,1.146213,0 17,60224.0217,1,-1.527085,1.044619,0 17,60224.0294,3,-2.026714,1.896492,0 17,60224.0405,4,-2.270058,2.563563,0 17,60224.0516,5,1.532559,5.857242,0 17,60227.0151,2,0.269555,1.054280,0 17,60227.0228,1,1.115057,0.999069,0 17,60227.0305,3,1.552577,1.598891,0 17,60227.0416,4,0.899977,2.226238,0 17,60227.0527,5,-13.314797,5.586868,0 17,60228.0187,0,1.561344,2.716171,0 17,60229.0162,0,2.115411,2.732387,0 17,60234.0265,0,-0.632177,2.254489,0 17,60237.2206,2,0.991648,0.921157,0 17,60237.2283,1,1.011999,0.870146,0 17,60237.2359,3,1.932343,1.516595,0 17,60237.2468,4,0.683186,2.506894,0 
17,60237.2578,5,-2.942844,7.042232,0 17,60240.0223,2,0.455927,1.997859,0 17,60240.0300,1,2.247968,2.647892,0 17,60240.0377,3,6.028008,2.647428,0 17,60240.0488,4,2.535219,3.012484,0 17,60240.0598,5,8.448805,5.811503,0 17,60249.0338,2,0.460854,0.866548,0 17,60249.0415,1,0.452554,0.772836,0 17,60249.0492,3,-1.191506,1.286252,0 17,60249.0602,4,2.830095,2.116775,0 17,60249.0712,5,-4.378712,5.177276,0 17,60260.0423,0,-7.464523,2.644202,0 17,60261.0361,0,0.930443,1.914282,0 17,60262.0367,0,4.964463,1.854471,0 17,60263.0373,0,0.783741,2.196650,0 17,60264.0465,0,-3.080768,1.916776,0 17,60490.2647,2,-0.977129,1.482041,0 17,60490.2725,1,-2.611819,1.794772,0 17,60490.2802,3,1.078630,2.071858,0 17,60490.2913,4,-0.492853,2.955702,0 17,60490.3024,5,3.792550,7.104066,0 17,60493.2372,2,0.021979,1.054925,0 17,60493.2450,1,-1.376170,0.980580,0 17,60493.2527,3,1.015628,1.668951,0 17,60493.2639,4,-0.002133,2.546342,0 17,60493.2750,5,-11.551120,6.173368,0 17,60499.2467,0,0.438197,2.600874,0 17,60500.2437,0,-1.686091,2.074696,0 17,60501.2385,0,0.020948,2.299457,0 17,60502.2355,0,1.068030,2.629417,0 17,60508.2638,2,-2.790141,2.436066,0 17,60508.2715,1,5.700086,3.490426,0 17,60508.2792,3,0.113332,2.677060,0 17,60508.2903,4,-1.245703,3.280095,0 17,60508.3014,5,16.609587,7.427779,0 17,60524.2390,0,-0.490797,2.080026,0 17,60525.1736,0,2.666769,2.941305,0 17,60532.3489,2,-0.207433,0.864004,0 17,60532.3565,1,-0.134437,0.845488,0 17,60532.3641,3,-0.515472,1.462743,0 17,60532.3751,4,0.897895,2.323136,0 17,60532.3860,5,1.356515,5.973135,0 17,60535.1253,2,3.357187,1.895133,0 17,60535.1330,1,1.788090,2.447693,0 17,60535.1408,3,-0.746670,2.393641,0 17,60535.1519,4,1.761451,3.298109,0 17,60535.1630,5,-3.134825,7.769337,0 17,60538.2351,2,7.960939,1.803240,0 17,60538.2428,1,6.926830,2.602727,0 17,60538.2505,3,3.353295,1.972350,0 17,60538.2615,4,5.710919,2.318584,0 17,60538.2725,5,0.716567,5.113697,0 17,60546.3406,2,12.973317,1.903937,1 17,60546.3482,1,11.756248,2.837219,1 
17,60546.3558,3,10.891992,2.308349,0 17,60546.3668,4,10.706098,3.126808,0 17,60546.3777,5,1.998101,7.646282,0 17,60549.0879,2,8.457420,1.114208,1 17,60549.0956,1,10.614448,1.048073,1 17,60549.1034,3,9.131392,1.747332,1 17,60549.1145,4,12.109291,2.651627,0 17,60549.1256,5,-0.454903,6.443316,0 17,60554.0964,0,3.385612,2.492159,0 17,60555.0951,0,-0.653454,2.008215,0 17,60556.0879,0,4.359657,2.500103,0 17,60557.0831,0,0.208935,1.804377,0 17,60558.1093,0,2.002906,2.298758,0 17,60559.1097,0,-1.713623,1.974990,0 17,60560.1065,0,-4.030066,2.814673,0 17,60567.2821,2,5.086202,1.698282,0 17,60567.2897,1,4.862064,2.316888,0 17,60567.2973,3,2.821583,1.933260,0 17,60567.3083,4,8.017381,2.382588,0 17,60567.3192,5,5.673804,5.440264,0 17,60574.1118,2,4.409955,1.550580,0 17,60574.1195,1,2.795052,2.196265,0 17,60574.1272,3,4.197219,1.987329,0 17,60574.1383,4,10.014592,2.528254,0 17,60574.1493,5,12.793771,5.745250,0 17,60577.0186,2,4.357023,1.470498,0 17,60577.0263,1,1.377891,1.400257,0 17,60577.0340,3,3.097299,2.332945,0 17,60577.0451,4,5.029422,3.520210,0 17,60577.0563,5,9.703295,8.530192,0 17,60580.0095,2,0.080061,1.449560,0 17,60580.0173,1,0.636153,1.387539,0 17,60580.0250,3,1.814262,2.309473,0 17,60580.0361,4,2.268791,3.480996,0 17,60580.0472,5,3.766253,8.433430,0 17,60582.0840,0,2.160855,2.057876,0 17,60583.0169,0,-2.377121,2.935992,0 17,60584.0117,0,-7.030778,2.272231,0 17,60585.0117,0,-2.432227,1.843772,0 17,60586.0123,0,0.942960,1.782520,0 17,60587.0127,0,-0.601708,2.076788,0 17,60588.0131,0,-2.069814,1.647215,0 17,60593.0636,2,0.346279,0.950896,0 17,60593.0713,1,1.045972,1.126506,0 17,60593.0790,3,1.954885,1.349871,0 17,60593.0901,4,4.170722,1.979371,0 17,60593.1012,5,5.737381,4.821476,0 17,60596.0304,2,1.884329,2.578470,0 17,60596.0381,1,-2.693286,3.627853,0 17,60596.0458,3,0.705422,2.812787,0 17,60596.0569,4,1.202975,3.396812,0 17,60596.0680,5,16.761280,7.451248,0 17,60603.0208,2,0.386061,0.711484,0 17,60603.0286,1,-0.007518,0.663680,0 17,60603.0363,3,0.688574,1.169908,0 
17,60603.0473,4,4.215033,1.826377,0 17,60603.0584,5,4.616831,4.548802,0 17,60606.0225,2,-0.884437,1.024675,0 17,60606.0303,1,-0.808401,0.984848,0 17,60606.0379,3,1.421415,1.656912,0 17,60606.0490,4,-0.165550,2.551519,0 17,60606.0601,5,-0.249700,6.335219,0 17,60609.0247,2,0.342698,0.744517,0 17,60609.0323,1,0.365452,0.700571,0 17,60609.0400,3,3.474475,1.220512,0 17,60609.0510,4,5.966081,1.906627,0 17,60609.0621,5,-4.503197,4.756433,0 17,60612.0266,0,-1.850411,1.158724,0 17,60613.0269,0,-1.095945,1.252873,0 17,60614.0276,0,1.658578,1.438634,0 17,60615.0375,0,-1.868202,1.603315,0 ================================================ FILE: examples/data/plasticc_test_set_metadata_1k.csv ================================================ object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv 13,34.453125,-5.229529,169.987075,-59.956185,1,0.3048,0.3193,0.0542,41.1123,0.019 14,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.6323,0.0179,42.8774,0.018 17,348.529419,-61.755440,321.293980,-51.763351,1,nan,0.8297,0.0605,43.6000,0.016 23,34.804688,-5.829153,171.307861,-60.174401,1,nan,0.6533,0.1479,42.9640,0.023 34,351.321442,-64.198746,317.458993,-50.429931,1,0.4557,0.4617,0.0122,42.0540,0.023 35,35.332031,-5.979157,172.286722,-59.931743,1,nan,0.8388,0.0375,43.6290,0.022 43,0.574468,-45.981140,327.041068,-68.778764,1,nan,0.6669,0.0546,43.0186,0.006 50,0.574468,-45.981140,327.041068,-68.778764,1,nan,1.4663,0.0529,45.1281,0.006 60,346.562500,-63.448284,320.824720,-49.866957,1,nan,0.9462,0.0116,43.9519,0.021 69,349.160583,-64.760857,318.219706,-49.458924,1,nan,1.0432,0.1092,44.2138,0.020 88,349.160583,-64.760857,318.219706,-49.458924,1,0.1608,0.1650,0.0053,39.4929,0.020 96,151.171875,2.537361,237.288526,43.169764,1,0.3277,0.3680,0.0340,41.4711,0.024 106,1.666667,-44.399834,327.519190,-70.529554,1,nan,0.8532,0.0602,43.6747,0.009 114,351.259003,-64.386185,317.344860,-50.255113,1,nan,0.7996,0.2747,43.5011,0.020 
115,151.347656,4.181528,235.568369,44.259942,1,nan,0.8979,0.0515,43.8114,0.016 116,150.468750,1.641510,237.714575,42.075234,1,nan,1.1244,0.0363,44.4151,0.017 130,34.277344,-5.679190,170.314930,-60.410322,1,0.3395,0.3368,0.0728,41.2464,0.020 142,1.694561,-45.191612,326.278557,-69.858253,1,nan,1.2710,0.0796,44.7444,0.011 147,150.820312,1.641510,237.994507,42.358984,1,nan,0.2904,0.1155,40.8738,0.020 151,151.171875,1.342993,238.602520,42.464379,1,nan,0.5090,0.0122,42.3075,0.026 168,349.429535,-62.508568,320.039643,-51.393745,1,nan,0.0000,0.0000,nan,0.020 171,52.910156,-27.953188,223.774083,-54.639214,1,nan,0.8623,0.0583,43.7031,0.007 173,150.996094,4.181528,235.291975,43.970869,1,nan,0.4490,0.0219,41.9820,0.015 176,52.910156,-27.953188,223.774083,-54.639214,1,0.3775,0.3642,0.0064,41.4450,0.007 184,352.711273,-63.823658,316.922299,-51.059403,1,nan,0.9112,0.0513,43.8508,0.024 186,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.9302,0.0118,43.9062,0.011 195,152.050781,2.985506,237.495952,44.143927,1,nan,0.4658,0.0250,42.0768,0.019 198,1.694561,-45.191612,326.278557,-69.858253,1,0.4060,0.3959,0.0146,41.6579,0.011 204,349.046051,-61.943836,320.796530,-51.753706,1,0.5584,0.4997,0.0312,42.2594,0.017 211,53.613281,-27.953188,223.929533,-54.024772,1,0.5469,0.5644,0.0113,42.5781,0.007 216,150.820312,1.641510,237.994507,42.358984,1,nan,0.4056,0.0489,41.7202,0.020 236,2.457983,-45.389202,324.632685,-69.945696,1,0.3436,0.2885,0.0162,40.8574,0.011 240,151.171875,2.537361,237.288526,43.169764,1,nan,1.0936,0.0318,44.3405,0.024 260,150.820312,3.732834,235.666318,43.572109,1,nan,0.7554,0.0425,43.3496,0.016 268,149.589844,3.583322,234.885369,42.474696,1,nan,0.6234,0.0184,42.8401,0.024 272,149.414062,3.433834,234.919132,42.245550,1,nan,0.7059,0.0220,43.1693,0.027 277,348.595886,-63.072620,320.023289,-50.713060,1,nan,0.8751,0.0187,43.7426,0.021 289,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.7442,0.0682,43.3099,0.006 
306,148.886719,2.686724,235.347248,41.389003,1,0.7180,0.7265,0.0182,43.2458,0.028 316,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.5615,0.1166,42.5647,0.017 337,150.117188,2.836105,236.124718,42.483719,1,nan,1.4098,0.0499,45.0228,0.016 349,34.453125,-5.229529,169.987075,-59.956185,1,nan,0.7679,0.0318,43.3934,0.019 357,349.966217,-62.696659,319.542989,-51.376556,1,nan,0.8937,0.0213,43.7988,0.021 366,53.613281,-26.944359,222.237403,-53.863858,1,nan,1.3577,0.2274,44.9217,0.009 384,359.816315,-44.003082,331.451340,-70.123054,1,nan,0.8134,0.0374,43.5469,0.013 402,349.891296,-64.573555,317.972107,-49.786192,1,nan,0.5684,0.0804,42.5965,0.023 406,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.8989,0.0967,43.8145,0.018 409,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.9110,0.0551,43.8503,0.017 413,349.429535,-62.508568,320.039643,-51.393745,1,0.6430,0.6411,0.0083,42.9139,0.020 443,150.996094,2.388015,237.313912,42.939977,1,0.3682,0.3649,0.0113,41.4497,0.021 451,349.615387,-63.636005,318.927246,-50.506542,1,nan,0.8853,0.0298,43.7734,0.018 455,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.4671,0.0117,42.0842,0.011 466,34.277344,-5.079716,169.526841,-59.956640,1,0.4986,0.5527,0.0171,42.5229,0.019 467,358.665253,-45.783966,330.353593,-68.203652,1,nan,0.6573,0.0463,42.9800,0.009 478,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.7283,0.0517,43.2524,0.008 483,348.529419,-61.755440,321.293980,-51.763351,1,nan,0.7835,0.0575,43.4469,0.016 489,0.574468,-45.981140,327.041068,-68.778764,1,nan,0.5798,0.0174,42.6490,0.006 524,34.277344,-5.079716,169.526841,-59.956640,1,0.4067,0.3530,0.0854,41.3660,0.019 561,349.891296,-64.573555,317.972107,-49.786192,1,nan,0.9267,0.0135,43.8959,0.023 565,152.050781,3.284369,237.157374,44.318466,1,nan,0.8253,0.0201,43.5857,0.019 568,53.789062,-27.784405,223.685697,-53.845803,1,nan,1.0649,0.2186,44.2692,0.009 583,149.414062,2.238686,236.239766,41.565558,1,0.7070,0.7129,0.0137,43.1952,0.017 
607,349.160583,-64.760857,318.219706,-49.458924,1,0.2424,0.4948,0.3294,42.2339,0.020 611,149.589844,3.583322,234.885369,42.474696,1,0.5068,0.5222,0.0459,42.3744,0.024 613,150.820312,1.641510,237.994507,42.358984,1,0.3014,0.6138,0.3121,42.7991,0.020 622,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.9127,0.3417,43.8552,0.009 639,52.207031,-28.630989,224.800211,-55.343637,1,nan,0.6900,0.0255,43.1086,0.009 662,33.750000,-4.630479,168.146242,-59.949072,1,0.4181,0.4889,0.0244,42.2029,0.019 670,51.855469,-28.630989,224.733260,-55.649872,1,nan,1.3168,0.1915,44.8395,0.009 672,350.230255,-61.943836,320.053946,-52.070537,1,nan,0.9496,0.0789,43.9616,0.017 674,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.6032,0.0466,42.7531,0.018 680,32.871094,-4.780192,166.959493,-60.615132,1,nan,1.4883,0.2383,45.1681,0.017 683,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.8370,0.0432,43.6235,0.011 686,358.665253,-45.783966,330.353593,-68.203652,1,nan,1.3769,0.2502,44.9594,0.009 687,150.468750,3.732834,235.392208,43.283244,1,nan,0.3617,0.2373,41.4273,0.020 694,34.453125,-5.229529,169.987075,-59.956185,1,0.4544,0.4391,0.0361,41.9241,0.019 699,0.589520,-47.161343,325.385896,-67.769893,1,0.5659,0.5592,0.0064,42.5537,0.009 721,358.665253,-45.783966,330.353593,-68.203652,1,0.4664,0.4355,0.3217,41.9033,0.009 725,359.446716,-44.201530,331.730015,-69.805709,1,nan,0.7186,0.0156,43.2165,0.010 729,52.207031,-26.610098,221.298836,-55.042928,1,nan,0.7994,0.0124,43.5005,0.014 731,346.276581,-64.011238,320.448031,-49.344136,1,0.5315,0.5418,0.0087,42.4710,0.019 734,34.804688,-5.829153,171.307861,-60.174401,1,nan,0.9281,0.0119,43.9001,0.023 747,54.667969,-27.615883,223.610785,-53.050840,1,nan,1.3670,0.0656,44.9399,0.009 759,151.347656,3.583322,236.252362,43.918627,1,nan,1.5633,0.2334,45.2997,0.015 779,347.861847,-61.943836,321.519104,-51.424048,1,0.7469,0.7381,0.0118,43.2877,0.017 793,359.058563,-45.191612,330.695783,-68.844915,1,nan,0.6826,0.0160,43.0802,0.011 
810,35.332031,-5.979157,172.286722,-59.931743,1,nan,0.9306,0.0189,43.9073,0.022 830,1.694561,-45.191612,326.278557,-69.858253,1,nan,0.6893,0.0081,43.1060,0.011 833,150.292969,2.686724,236.427488,42.541447,1,nan,0.4998,0.0202,42.2604,0.016 834,359.058563,-45.191612,330.695783,-68.844915,1,0.5767,0.5866,0.0128,42.6794,0.011 843,53.789062,-27.784405,223.685697,-53.845803,1,nan,0.8523,0.0367,43.6717,0.009 868,35.332031,-5.979157,172.286722,-59.931743,1,0.3881,0.3855,1.3203,41.5900,0.022 883,53.261719,-27.615883,223.280041,-54.281374,1,nan,0.8622,0.0603,43.7027,0.006 886,33.574219,-4.780192,168.064587,-60.175886,1,0.4615,0.4476,0.7778,41.9738,0.019 887,358.648071,-46.375080,329.462659,-67.716008,1,nan,1.2642,0.2378,44.7300,0.009 888,359.814819,-44.399834,330.775011,-69.801007,1,nan,1.2159,0.1488,44.6254,0.009 905,0.189873,-45.586655,328.254458,-68.969298,1,nan,0.6630,0.0275,43.0028,0.007 916,150.292969,2.686724,236.427488,42.541447,1,0.4052,0.4393,0.0217,41.9253,0.016 917,1.666667,-44.399834,327.519190,-70.529554,1,nan,1.3559,0.2676,44.9180,0.009 943,2.457983,-45.389202,324.632685,-69.945696,1,0.8582,0.8663,0.0291,43.7153,0.011 946,359.446716,-44.201530,331.730015,-69.805709,1,nan,1.5205,0.0959,45.2254,0.010 960,351.382965,-64.011238,317.574052,-50.604657,1,nan,0.6480,0.0129,42.9424,0.023 962,150.820312,1.641510,237.994507,42.358984,1,nan,0.6779,0.0117,43.0616,0.020 965,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.6636,0.0081,43.0054,0.011 968,52.910156,-27.953188,223.774083,-54.639214,1,nan,0.2472,0.0116,40.4743,0.007 978,34.101562,-5.829153,170.247753,-60.638325,1,0.4938,0.4954,0.0349,42.2373,0.019 979,148.710938,2.836105,235.050801,41.328739,1,nan,1.1407,0.1072,44.4539,0.031 983,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.3106,0.0065,41.0429,0.014 1017,351.299988,-62.320400,319.038597,-52.026867,1,nan,1.4323,0.0481,45.0652,0.018 1018,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.9262,0.0111,43.8945,0.018 
1020,149.238281,3.882372,234.283829,42.351155,1,nan,1.0264,0.2278,44.1701,0.033 1030,352.711273,-63.823658,316.922299,-51.059403,1,nan,1.2671,0.0235,44.7362,0.024 1039,150.644531,3.583322,235.698235,43.342784,1,nan,0.7124,0.0358,43.1936,0.018 1049,1.723404,-45.981140,325.117958,-69.180825,1,nan,1.2628,0.1822,44.7271,0.010 1059,53.085938,-27.784405,223.525509,-54.460748,1,nan,1.0223,0.0247,44.1595,0.007 1063,53.789062,-27.784405,223.685697,-53.845803,1,nan,0.0000,0.0000,nan,0.009 1065,53.613281,-26.944359,222.237403,-53.863858,1,0.4795,0.4443,0.0390,41.9546,0.009 1067,2.071130,-45.191612,325.606223,-69.989264,1,0.7417,0.8350,0.0544,43.6171,0.011 1084,0.965665,-46.375080,325.845907,-68.579427,1,0.2126,0.5647,0.4351,42.5797,0.007 1087,352.132874,-63.636005,317.424173,-51.095855,1,nan,1.1039,0.1661,44.3657,0.021 1088,34.277344,-5.679190,170.314930,-60.410322,1,0.7550,0.7142,0.0477,43.2001,0.020 1100,34.101562,-5.829153,170.247753,-60.638325,1,0.9332,0.8409,0.0631,43.6357,0.019 1106,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.9258,0.0219,43.8935,0.022 1108,51.855469,-26.276812,220.627031,-55.293792,1,0.6459,0.7378,0.0318,43.2867,0.014 1111,53.085938,-27.111860,222.384291,-54.355086,1,nan,0.3981,0.7822,41.6724,0.007 1114,33.398438,-4.331149,167.226341,-59.936551,1,0.5401,0.5514,0.0091,42.5169,0.018 1115,150.820312,3.732834,235.666318,43.572109,1,nan,1.1354,0.0545,44.4412,0.016 1123,151.171875,2.238686,237.619933,42.994783,1,0.5819,0.5516,0.3972,42.5177,0.024 1127,0.965665,-46.375080,325.845907,-68.579427,1,nan,1.0761,0.1787,44.2973,0.007 1128,149.414062,1.940072,236.565366,41.393323,1,nan,0.3618,0.0387,41.4282,0.018 1138,359.816315,-44.003082,331.451340,-70.123054,1,nan,1.4331,0.2745,45.0668,0.013 1151,33.574219,-4.780192,168.064587,-60.175886,1,nan,0.9335,0.2471,43.9155,0.019 1168,347.861847,-61.943836,321.519104,-51.424048,1,nan,1.5435,0.1436,45.2656,0.017 1174,0.949367,-45.586655,326.991548,-69.251686,1,nan,1.0215,0.1042,44.1572,0.013 
1193,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.7156,0.0493,43.2055,0.017 1216,53.964844,-28.630989,225.142950,-53.813613,1,nan,0.9028,0.0148,43.8260,0.009 1245,53.085938,-28.122234,224.100909,-54.509752,1,0.4937,0.5333,0.0166,42.4294,0.007 1254,34.980469,-6.279288,172.180075,-60.389399,1,nan,1.5079,0.3209,45.2031,0.023 1265,54.667969,-27.615883,223.610785,-53.050840,1,0.5310,0.6159,0.0352,42.8082,0.009 1266,149.414062,1.940072,236.565366,41.393323,1,nan,0.6497,0.0183,42.9492,0.018 1271,33.750000,-4.630479,168.146242,-59.949072,1,nan,1.0088,0.0104,44.1239,0.019 1274,0.189873,-45.586655,328.254458,-68.969298,1,nan,1.4221,0.2779,45.0460,0.007 1288,33.222656,-4.780192,167.515653,-60.396584,1,nan,0.9169,0.1056,43.8676,0.018 1289,346.276581,-64.011238,320.448031,-49.344136,1,nan,1.1128,0.0552,44.3872,0.019 1304,347.846710,-64.760857,318.929827,-49.143596,1,0.3102,0.3081,0.0119,41.0226,0.019 1321,151.523438,3.134927,236.900695,43.803170,1,nan,1.2033,0.1759,44.5975,0.019 1347,349.429535,-62.508568,320.039643,-51.393745,1,nan,0.7305,0.0458,43.2603,0.020 1354,53.085938,-28.122234,224.100909,-54.509752,1,nan,0.8265,0.0527,43.5896,0.007 1365,1.708861,-45.586655,325.688716,-69.520253,1,0.6978,0.7233,0.0170,43.2338,0.011 1380,351.259003,-64.386185,317.344860,-50.255113,1,nan,0.9291,0.0148,43.9029,0.020 1388,52.910156,-26.276812,220.926149,-54.363918,1,nan,0.8363,0.0411,43.6210,0.008 1393,152.050781,2.985506,237.495952,44.143927,1,0.4202,2.8977,0.8121,46.9354,0.019 1415,51.328125,-27.784405,223.130589,-55.999499,1,0.7371,0.7551,0.0216,43.3483,0.013 1421,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.6717,0.0415,43.0374,0.009 1439,34.101562,-5.829153,170.247753,-60.638325,1,nan,1.0442,0.1545,44.2165,0.019 1450,53.964844,-28.630989,225.142950,-53.813613,1,nan,0.8016,0.0528,43.5077,0.009 1463,1.363636,-46.768478,324.669342,-68.371416,1,0.2333,0.5070,0.1130,42.2975,0.008 1467,151.523438,3.134927,236.900695,43.803170,1,nan,0.8282,0.1183,43.5952,0.019 
1468,346.562500,-63.448284,320.824720,-49.866957,1,nan,0.8677,0.0290,43.7199,0.021 1469,149.414062,2.238686,236.239766,41.565558,1,nan,1.0850,0.0150,44.3193,0.017 1487,150.468750,3.732834,235.392208,43.283244,1,0.6295,0.5829,0.0311,42.6627,0.020 1492,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.7396,0.0157,43.2934,0.021 1498,152.050781,3.284369,237.157374,44.318466,1,nan,0.8187,0.0238,43.5644,0.019 1500,149.414062,2.238686,236.239766,41.565558,1,nan,1.0392,0.0139,44.2035,0.017 1522,352.132874,-63.636005,317.424173,-51.095855,1,nan,1.2524,0.0277,44.7049,0.021 1523,150.468750,1.641510,237.714575,42.075234,1,0.2669,0.2407,0.0254,40.4086,0.017 1536,33.574219,-5.079716,168.448505,-60.407218,1,nan,0.5789,0.1445,42.6449,0.016 1545,346.562500,-63.448284,320.824720,-49.866957,1,0.6731,0.7236,0.0459,43.2351,0.021 1567,152.050781,2.985506,237.495952,44.143927,1,0.5581,0.5870,0.0381,42.6815,0.019 1570,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.7967,0.0516,43.4914,0.017 1578,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.5831,0.1709,42.6637,0.009 1589,51.679688,-27.447618,222.618229,-55.642263,1,0.3366,0.3400,0.0077,41.2708,0.010 1593,150.996094,4.181528,235.291975,43.970869,1,nan,1.2366,0.1482,44.6708,0.015 1597,351.299988,-62.320400,319.038597,-52.026867,1,0.3161,0.3182,0.0159,41.1033,0.018 1599,52.207031,-26.610098,221.298836,-55.042928,1,nan,1.0088,0.2519,44.1237,0.014 1600,34.101562,-5.829153,170.247753,-60.638325,1,0.5596,0.5490,0.0474,42.5054,0.019 1601,53.085938,-28.122234,224.100909,-54.509752,1,0.7155,0.7376,0.0244,43.2861,0.007 1619,2.097458,-45.783966,324.737840,-69.478613,1,nan,1.1271,0.0568,44.4216,0.011 1630,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.7373,0.0283,43.2851,0.014 1636,0.965665,-46.375080,325.845907,-68.579427,1,nan,0.9029,0.0862,43.8264,0.007 1660,359.446716,-44.201530,331.730015,-69.805709,1,nan,0.8022,0.0194,43.5100,0.010 1667,33.574219,-5.079716,168.448505,-60.407218,1,nan,1.0792,0.0660,44.3050,0.016 
1676,0.965665,-46.375080,325.845907,-68.579427,1,nan,0.9666,0.2691,44.0090,0.007 1678,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.7378,0.0146,43.2866,0.011 1687,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.9107,0.0140,43.8494,0.006 1691,152.050781,2.985506,237.495952,44.143927,1,0.7346,0.7037,0.0529,43.1609,0.019 1695,351.259003,-64.386185,317.344860,-50.255113,1,nan,0.9741,0.2405,44.0299,0.020 1702,150.996094,2.388015,237.313912,42.939977,1,nan,0.7806,0.0916,43.4371,0.021 1720,1.694561,-45.191612,326.278557,-69.858253,1,nan,0.6746,0.0276,43.0488,0.011 1729,51.328125,-27.447618,222.535046,-55.950727,1,0.6110,0.5932,0.0122,42.7092,0.013 1730,150.117188,3.732834,235.120533,42.993809,1,nan,0.6999,0.0471,43.1464,0.020 1754,148.710938,2.836105,235.050801,41.328739,1,nan,0.9109,0.0431,43.8500,0.031 1763,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.8860,0.0855,43.7757,0.017 1770,0.190678,-45.783966,327.956322,-68.803772,1,0.2562,0.2577,0.0118,40.5774,0.005 1780,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.3954,0.0285,41.6551,0.011 1781,351.382965,-64.011238,317.574052,-50.604657,1,0.4976,0.4790,0.0223,42.1493,0.023 1830,51.328125,-27.784405,223.130589,-55.999499,1,nan,0.7349,0.0181,43.2763,0.013 1833,32.871094,-4.780192,166.959493,-60.615132,1,0.3202,0.3262,0.0567,41.1659,0.017 1841,0.589520,-47.161343,325.385896,-67.769893,1,nan,0.9916,0.0165,44.0776,0.009 1871,33.574219,-4.780192,168.064587,-60.175886,1,nan,0.5952,0.0260,42.7180,0.019 1894,150.468750,1.641510,237.714575,42.075234,1,nan,1.2779,0.0607,44.7590,0.017 1914,359.816315,-44.003082,331.451340,-70.123054,1,0.6648,0.6345,0.0370,42.8867,0.013 1933,150.117188,2.836105,236.124718,42.483719,1,nan,0.4788,0.2350,42.1486,0.016 1934,351.734680,-62.884678,318.284128,-51.651217,1,0.6909,0.7567,0.0533,43.3540,0.019 1941,52.207031,-26.610098,221.298836,-55.042928,1,nan,0.9255,0.0135,43.8926,0.014 1948,359.814819,-44.399834,330.775011,-69.801007,1,nan,1.2610,0.0630,44.7233,0.009 
1949,33.398438,-3.732834,166.492280,-59.466614,1,0.5070,0.5068,0.0084,42.2963,0.022 1971,149.414062,1.940072,236.565366,41.393323,1,nan,1.1763,0.1964,44.5365,0.018 1978,54.667969,-27.615883,223.610785,-53.050840,1,nan,0.8494,0.0510,43.6626,0.009 1999,349.160583,-64.760857,318.219706,-49.458924,1,0.6580,0.5147,0.5022,42.3368,0.020 2000,349.046051,-61.943836,320.796530,-51.753706,1,nan,0.6559,0.0240,42.9743,0.017 2017,51.679688,-27.447618,222.618229,-55.642263,1,nan,0.3325,0.1725,41.2145,0.010 2021,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.6515,0.0269,42.9566,0.014 2046,150.644531,3.583322,235.698235,43.342784,1,0.7425,0.7898,0.0207,43.4684,0.018 2054,152.050781,2.985506,237.495952,44.143927,1,nan,0.7912,0.0208,43.4730,0.019 2074,359.446716,-44.201530,331.730015,-69.805709,1,0.5175,0.4921,0.0570,42.2198,0.010 2093,150.996094,4.181528,235.291975,43.970869,1,nan,0.3252,0.0452,41.1578,0.015 2097,151.347656,4.181528,235.568369,44.259942,1,nan,0.6327,0.0348,42.8789,0.016 2106,1.694561,-45.191612,326.278557,-69.858253,1,nan,1.1768,0.0524,44.5376,0.011 2109,358.665253,-45.783966,330.353593,-68.203652,1,0.4692,0.4797,0.0133,42.1532,0.009 2133,349.891296,-64.573555,317.972107,-49.786192,1,nan,0.6612,0.0157,42.9957,0.023 2145,51.328125,-27.447618,222.535046,-55.950727,1,nan,1.2076,0.0605,44.6070,0.013 2147,0.589520,-47.161343,325.385896,-67.769893,1,nan,1.5213,0.2618,45.2269,0.009 2160,34.277344,-5.079716,169.526841,-59.956640,1,nan,0.5716,0.0242,42.6114,0.019 2168,359.446716,-44.201530,331.730015,-69.805709,1,0.2214,0.2308,0.2438,40.3055,0.010 2172,0.189873,-45.586655,328.254458,-68.969298,1,0.6266,0.6681,0.0264,43.0232,0.007 2182,151.523438,3.134927,236.900695,43.803170,1,nan,0.8239,0.0320,43.5812,0.019 2183,148.710938,2.836105,235.050801,41.328739,1,0.2383,0.2618,0.0166,40.6165,0.031 2187,34.101562,-5.829153,170.247753,-60.638325,1,0.3599,0.3656,0.5424,41.4546,0.019 2193,347.846710,-64.760857,318.929827,-49.143596,1,nan,1.4268,0.0800,45.0550,0.019 
2195,150.468750,3.732834,235.392208,43.283244,1,nan,0.4598,0.0125,42.0435,0.020 2198,351.382965,-64.011238,317.574052,-50.604657,1,nan,0.9277,0.2414,43.8988,0.023 2206,52.910156,-25.944481,220.366350,-54.301439,1,nan,1.5014,0.0616,45.1915,0.010 2208,352.398651,-62.696659,318.017427,-51.967966,1,0.7188,0.7126,0.0121,43.1942,0.020 2223,358.648071,-46.375080,329.462659,-67.716008,1,nan,1.1672,0.0321,44.5157,0.009 2228,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.6837,0.0267,43.0844,0.014 2243,150.820312,3.134927,236.341348,43.230123,1,nan,1.3813,0.1932,44.9680,0.016 2246,348.529419,-61.755440,321.293980,-51.763351,1,0.6467,0.7143,0.0413,43.2008,0.016 2252,53.437500,-29.142223,225.908120,-54.336118,1,nan,0.5851,0.0120,42.6728,0.008 2265,359.058563,-45.191612,330.695783,-68.844915,1,0.2786,0.2719,0.0249,40.7105,0.011 2270,151.171875,2.238686,237.619933,42.994783,1,nan,0.0000,0.0000,nan,0.024 2276,346.655182,-63.260487,320.952196,-50.040935,1,nan,0.6827,0.0726,43.0806,0.019 2281,346.655182,-63.260487,320.952196,-50.040935,1,0.3442,0.5476,0.1871,42.4986,0.019 2287,351.953644,-62.132156,318.777388,-52.347124,1,nan,0.5387,0.0152,42.4558,0.019 2292,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.9430,0.0086,43.9427,0.014 2293,51.679688,-27.447618,222.618229,-55.642263,1,nan,0.9148,0.0376,43.8614,0.010 2296,1.694561,-45.191612,326.278557,-69.858253,1,nan,0.8033,0.0310,43.5136,0.011 2299,352.132874,-63.636005,317.424173,-51.095855,1,nan,0.8109,0.0190,43.5386,0.021 2304,1.708861,-45.586655,325.688716,-69.520253,1,nan,1.3102,0.0895,44.8261,0.011 2333,349.966217,-62.696659,319.542989,-51.376556,1,0.4284,0.4611,0.0224,42.0509,0.021 2348,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.5904,0.0092,42.6966,0.018 2351,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.7537,0.1688,43.3435,0.018 2354,51.679688,-27.447618,222.618229,-55.642263,1,0.3829,0.3935,0.0180,41.6425,0.010 2370,35.683594,-5.379379,171.992947,-59.253501,1,nan,0.6099,0.0096,42.7820,0.020 
2372,34.453125,-5.229529,169.987075,-59.956185,1,0.3353,0.4757,0.6757,42.1315,0.019 2388,54.667969,-27.615883,223.610785,-53.050840,1,nan,1.0298,0.0768,44.1791,0.009 2395,359.814819,-44.399834,330.775011,-69.801007,1,0.6315,0.6359,0.0276,42.8924,0.009 2407,0.190678,-45.783966,327.956322,-68.803772,1,0.5859,0.5249,0.0829,42.3879,0.005 2410,52.207031,-26.610098,221.298836,-55.042928,1,0.3848,0.4258,0.0241,41.8449,0.014 2451,359.814819,-44.399834,330.775011,-69.801007,1,nan,0.7416,0.0343,43.3003,0.009 2476,149.414062,1.940072,236.565366,41.393323,1,nan,0.4730,0.1009,42.1166,0.018 2538,346.130127,-63.072620,321.423103,-50.042305,1,0.6846,0.6643,0.0319,43.0081,0.020 2550,351.259003,-64.386185,317.344860,-50.255113,1,nan,0.3640,0.1891,41.4439,0.020 2603,0.965665,-46.375080,325.845907,-68.579427,1,nan,1.2175,0.4670,44.6290,0.007 2615,52.910156,-26.276812,220.926149,-54.363918,1,nan,1.2323,0.0534,44.6613,0.008 2660,151.523438,3.134927,236.900695,43.803170,1,1.0412,1.3254,0.1765,44.8570,0.019 2661,346.130127,-63.072620,321.423103,-50.042305,1,0.5160,0.5185,0.0319,42.3560,0.020 2683,0.965665,-46.375080,325.845907,-68.579427,1,0.6609,0.6757,0.0091,43.0530,0.007 2687,348.529419,-61.755440,321.293980,-51.763351,1,0.6669,0.6531,0.0406,42.9628,0.016 2693,358.665253,-45.783966,330.353593,-68.203652,1,nan,1.5860,0.2426,45.3384,0.009 2701,349.891296,-64.573555,317.972107,-49.786192,1,0.4057,0.3811,0.0322,41.5609,0.023 2702,35.683594,-5.379379,171.992947,-59.253501,1,0.5081,0.5489,0.0165,42.5049,0.020 2707,33.398438,-4.331149,167.226341,-59.936551,1,0.1061,0.1359,0.0193,39.0322,0.018 2744,348.595886,-63.072620,320.023289,-50.713060,1,nan,0.5735,0.0245,42.6203,0.021 2753,51.328125,-27.784405,223.130589,-55.999499,1,0.4302,0.3267,0.0389,41.1697,0.013 2759,148.886719,2.686724,235.347248,41.389003,1,nan,0.5974,0.0069,42.7274,0.028 2760,149.414062,3.433834,234.919132,42.245550,1,nan,0.3415,0.2635,41.2821,0.027 2766,150.468750,3.732834,235.392208,43.283244,1,nan,1.0362,0.0118,44.1956,0.020 
2786,349.615387,-63.636005,318.927246,-50.506542,1,nan,0.3010,0.0504,40.9634,0.018 2790,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.0000,0.0000,nan,0.018 2794,359.415588,-46.768478,327.729895,-67.686097,1,0.6983,0.7077,0.0266,43.1760,0.009 2814,150.117188,2.836105,236.124718,42.483719,1,nan,0.7215,0.0782,43.2272,0.016 2818,52.207031,-28.291550,224.208534,-55.300157,1,nan,0.8251,0.0216,43.5851,0.007 2856,53.964844,-28.630989,225.142950,-53.813613,1,nan,0.6348,0.0515,42.8877,0.009 2858,34.980469,-6.279288,172.180075,-60.389399,1,nan,1.1145,0.0322,44.3914,0.023 2860,348.529419,-61.755440,321.293980,-51.763351,1,nan,0.4398,0.2606,41.9285,0.016 2871,346.655182,-63.260487,320.952196,-50.040935,1,nan,0.4647,0.0229,42.0709,0.019 2882,151.171875,1.342993,238.602520,42.464379,1,nan,2.7824,0.2944,46.8290,0.026 2886,148.710938,2.836105,235.050801,41.328739,1,nan,0.3475,0.0073,41.3259,0.031 2909,359.446716,-44.201530,331.730015,-69.805709,1,nan,0.3229,0.3149,41.1406,0.010 2932,352.711273,-63.823658,316.922299,-51.059403,1,nan,0.1911,0.0104,39.8466,0.024 2933,348.586945,-64.573555,318.693903,-49.477869,1,0.6336,0.6157,0.0140,42.8070,0.018 2943,0.589520,-47.161343,325.385896,-67.769893,1,0.7816,0.7511,0.0242,43.3344,0.009 2945,52.910156,-25.944481,220.366350,-54.301439,1,nan,1.1553,0.0720,44.4880,0.010 2949,52.910156,-25.944481,220.366350,-54.301439,1,nan,1.2125,0.0257,44.6178,0.010 2958,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.5145,0.0383,42.3356,0.009 2961,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.3771,0.0426,41.5336,0.019 2962,149.414062,3.433834,234.919132,42.245550,1,nan,0.7453,0.0637,43.3138,0.027 2970,53.085938,-27.111860,222.384291,-54.355086,1,0.4506,0.5123,0.0647,42.3247,0.007 2975,53.085938,-28.122234,224.100909,-54.509752,1,nan,1.4981,0.2615,45.1857,0.007 3004,349.966217,-62.696659,319.542989,-51.376556,1,0.4053,0.4978,0.0481,42.2498,0.021 3008,358.312500,-44.993881,332.185785,-68.685906,1,nan,0.1344,0.7139,39.0065,0.009 
3012,150.117188,2.238686,236.784618,42.139082,1,nan,1.4853,0.0818,45.1627,0.016 3022,52.207031,-28.630989,224.800211,-55.343637,1,0.4266,0.3630,0.0443,41.4365,0.009 3025,359.415588,-46.768478,327.729895,-67.686097,1,0.6636,0.6827,0.0129,43.0806,0.009 3028,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.6075,0.2450,42.7719,0.017 3029,347.617462,-62.508568,321.121462,-50.904708,1,nan,0.7168,0.0344,43.2100,0.019 3037,150.820312,3.732834,235.666318,43.572109,1,0.3807,0.3856,0.0770,41.5907,0.016 3052,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.9395,0.0475,43.9327,0.019 3073,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.5457,0.7707,42.4896,0.014 3077,0.189873,-45.586655,328.254458,-68.969298,1,0.2853,0.3011,0.0497,40.9649,0.007 3133,0.574468,-45.981140,327.041068,-68.778764,1,nan,1.6167,0.0739,45.3897,0.006 3163,34.277344,-5.079716,169.526841,-59.956640,1,0.6388,0.6322,0.0085,42.8770,0.019 3170,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.4754,0.0150,42.1298,0.018 3171,33.925781,-5.979157,170.179895,-60.866303,1,0.5550,0.5525,0.0096,42.5220,0.022 3175,347.617462,-62.508568,321.121462,-50.904708,1,nan,1.0217,0.0243,44.1579,0.019 3176,150.820312,3.134927,236.341348,43.230123,1,nan,0.7794,0.0377,43.4329,0.016 3190,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.7705,0.0220,43.4023,0.022 3193,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.8700,0.0495,43.7269,0.021 3196,358.648071,-46.375080,329.462659,-67.716008,1,0.3112,0.2901,0.0115,40.8716,0.009 3206,53.085938,-27.111860,222.384291,-54.355086,1,0.6472,0.6860,0.0225,43.0931,0.007 3209,1.363636,-46.768478,324.669342,-68.371416,1,0.4369,0.4994,1.2314,42.2578,0.008 3210,53.437500,-29.142223,225.908120,-54.336118,1,nan,0.7133,0.0739,43.1968,0.008 3214,348.908447,-63.823658,319.169886,-50.176186,1,0.2600,0.2876,0.3195,40.8500,0.018 3217,151.347656,4.181528,235.568369,44.259942,1,0.1826,0.4846,0.3727,42.1795,0.016 3247,1.708861,-45.586655,325.688716,-69.520253,1,nan,1.3152,0.1148,44.8363,0.011 
3252,349.615387,-63.636005,318.927246,-50.506542,1,nan,0.8013,0.0111,43.5070,0.018 3278,53.613281,-27.953188,223.929533,-54.024772,1,nan,1.5343,0.1417,45.2497,0.007 3284,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.4121,1.2419,41.7612,0.011 3286,33.574219,-4.780192,168.064587,-60.175886,1,nan,0.7985,0.0224,43.4974,0.019 3288,51.855469,-26.276812,220.627031,-55.293792,1,nan,1.2928,0.0725,44.7902,0.014 3325,53.613281,-28.630989,225.073365,-54.119461,1,0.4825,0.5158,0.0204,42.3423,0.006 3336,34.453125,-5.229529,169.987075,-59.956185,1,nan,0.7542,0.0326,43.3453,0.019 3347,52.910156,-27.953188,223.774083,-54.639214,1,nan,0.5940,0.0536,42.7127,0.007 3351,359.811707,-45.191612,329.485675,-69.150905,1,0.5478,0.5828,0.0191,42.6626,0.010 3362,347.617462,-62.508568,321.121462,-50.904708,1,nan,0.6700,0.1306,43.0306,0.019 3373,352.398651,-62.696659,318.017427,-51.967966,1,nan,0.4474,0.2471,41.9728,0.020 3395,53.613281,-27.953188,223.929533,-54.024772,1,0.3963,0.3657,0.8633,41.4551,0.007 3398,34.804688,-5.829153,171.307861,-60.174401,1,nan,1.0070,0.0407,44.1188,0.023 3410,349.615387,-63.636005,318.927246,-50.506542,1,nan,1.1667,0.0446,44.5144,0.018 3418,0.190678,-45.783966,327.956322,-68.803772,1,nan,1.0109,0.0153,44.1294,0.005 3441,0.189873,-45.586655,328.254458,-68.969298,1,nan,0.4974,0.0111,42.2477,0.007 3457,151.347656,4.181528,235.568369,44.259942,1,nan,0.7204,0.0318,43.2232,0.016 3475,149.414062,3.433834,234.919132,42.245550,1,nan,0.7214,0.0251,43.2270,0.027 3498,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.9131,0.0795,43.8565,0.019 3499,150.996094,2.985506,236.647967,43.287350,1,0.9971,1.0190,0.0171,44.1508,0.020 3500,52.207031,-28.630989,224.800211,-55.343637,1,0.2306,0.4849,0.6744,42.1816,0.009 3504,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.5829,0.0306,42.6630,0.019 3509,53.261719,-27.615883,223.280041,-54.281374,1,0.5035,0.5169,0.2812,42.3478,0.006 3522,52.910156,-27.279613,222.625192,-54.536648,1,0.8636,0.9140,0.0250,43.8590,0.007 
3529,348.586945,-64.573555,318.693903,-49.477869,1,nan,0.9039,0.0247,43.8293,0.018 3535,359.814819,-44.399834,330.775011,-69.801007,1,nan,1.4756,0.1618,45.1450,0.009 3538,150.996094,2.985506,236.647967,43.287350,1,0.4615,0.4664,0.0103,42.0806,0.020 3547,2.097458,-45.783966,324.737840,-69.478613,1,nan,0.9467,0.0338,43.9533,0.011 3551,358.636353,-46.768478,328.890146,-67.388837,1,0.2573,0.2393,0.0145,40.3948,0.008 3566,151.171875,2.238686,237.619933,42.994783,1,0.6331,0.5448,0.6502,42.4852,0.024 3581,51.855469,-26.276812,220.627031,-55.293792,1,0.4844,0.4938,0.0183,42.2287,0.014 3585,150.820312,3.732834,235.666318,43.572109,1,nan,0.5231,0.1034,42.3791,0.016 3597,2.457983,-45.389202,324.632685,-69.945696,1,0.8409,0.8742,0.0306,43.7396,0.011 3620,35.859375,-4.630479,171.270769,-58.580806,1,nan,0.7190,0.0186,43.2181,0.022 3629,51.855469,-28.630989,224.733260,-55.649872,1,nan,1.3951,0.1453,44.9947,0.009 3641,149.414062,3.433834,234.919132,42.245550,1,nan,0.5965,0.0131,42.7237,0.027 3645,346.562500,-63.448284,320.824720,-49.866957,1,0.5351,0.5274,0.0205,42.4003,0.021 3652,149.238281,3.882372,234.283829,42.351155,1,nan,0.8499,0.0477,43.6644,0.033 3657,34.277344,-5.679190,170.314930,-60.410322,1,0.3050,2.7750,0.9745,46.8221,0.020 3661,347.812500,-63.448284,320.128971,-50.202348,1,nan,1.3297,0.0487,44.8658,0.021 3666,150.117188,2.836105,236.124718,42.483719,1,nan,1.1820,0.1059,44.5495,0.016 3672,348.529419,-61.755440,321.293980,-51.763351,1,0.6020,0.5785,0.0108,42.6431,0.016 3681,150.996094,2.985506,236.647967,43.287350,1,0.6459,0.6429,0.0113,42.9215,0.020 3702,151.171875,2.238686,237.619933,42.994783,1,nan,0.8082,0.1270,43.5299,0.024 3706,51.328125,-27.784405,223.130589,-55.999499,1,0.3284,0.3150,0.1745,41.0780,0.013 3709,151.171875,2.238686,237.619933,42.994783,1,nan,0.8753,0.0139,43.7432,0.024 3710,1.666667,-44.399834,327.519190,-70.529554,1,nan,0.8356,0.0335,43.6190,0.009 3735,0.574468,-45.981140,327.041068,-68.778764,1,nan,0.7982,0.1565,43.4967,0.006 
3737,150.292969,2.686724,236.427488,42.541447,1,nan,0.7567,0.0404,43.3543,0.016 3748,52.910156,-25.944481,220.366350,-54.301439,1,nan,1.0812,0.0621,44.3099,0.010 3749,359.415588,-46.768478,327.729895,-67.686097,1,0.7033,0.7840,0.0371,43.4487,0.009 3751,352.132874,-63.636005,317.424173,-51.095855,1,0.6527,0.6351,0.0190,42.8890,0.021 3753,346.562500,-63.448284,320.824720,-49.866957,1,0.1191,0.0761,0.0195,37.6870,0.021 3763,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.3349,0.0091,41.2320,0.017 3764,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.8530,0.0218,43.6740,0.021 3777,33.574219,-5.079716,168.448505,-60.407218,1,nan,0.5510,0.0652,42.5152,0.016 3779,348.595886,-63.072620,320.023289,-50.713060,1,nan,0.9939,0.2200,44.0838,0.021 3785,359.415588,-46.768478,327.729895,-67.686097,1,0.5052,0.5168,0.0229,42.3475,0.009 3794,52.910156,-26.276812,220.926149,-54.363918,1,0.6236,0.6316,0.0331,42.8743,0.008 3803,151.171875,1.342993,238.602520,42.464379,1,nan,0.7038,0.0688,43.1613,0.026 3821,52.910156,-27.953188,223.774083,-54.639214,1,nan,1.4631,0.2823,45.1222,0.007 3822,358.648071,-46.375080,329.462659,-67.716008,1,0.3455,2.9896,1.2696,47.0170,0.009 3824,1.753247,-46.768478,324.030235,-68.498041,1,0.8063,0.8352,0.0339,43.6178,0.014 3841,2.071130,-45.191612,325.606223,-69.989264,1,nan,0.5971,0.0990,42.7263,0.011 3844,2.457983,-45.389202,324.632685,-69.945696,1,0.4789,0.3748,0.4652,41.5181,0.011 3855,349.891296,-64.573555,317.972107,-49.786192,1,0.4507,0.3704,0.5053,41.4880,0.023 3857,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.7254,0.0296,43.2418,0.011 3875,52.207031,-28.630989,224.800211,-55.343637,1,0.6825,0.6468,0.0279,42.9374,0.009 3878,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.9110,0.2538,43.8503,0.008 3880,351.382965,-64.011238,317.574052,-50.604657,1,nan,1.5002,0.2529,45.1894,0.023 3889,53.964844,-28.630989,225.142950,-53.813613,1,0.1582,0.2015,1.1876,39.9743,0.009 
3925,150.996094,4.181528,235.291975,43.970869,1,nan,0.2421,1.1561,40.4235,0.015 3928,150.996094,2.985506,236.647967,43.287350,1,nan,0.7806,0.0538,43.4371,0.020 3929,149.589844,3.583322,234.885369,42.474696,1,nan,1.8897,0.1427,45.8062,0.024 3936,34.980469,-6.279288,172.180075,-60.389399,1,nan,1.0853,0.0192,44.3200,0.023 3950,352.132874,-63.636005,317.424173,-51.095855,1,0.7004,0.6924,0.0235,43.1180,0.021 3970,1.753247,-46.768478,324.030235,-68.498041,1,nan,1.4637,0.1090,45.1234,0.014 3972,33.574219,-5.079716,168.448505,-60.407218,1,nan,0.7985,0.0333,43.4976,0.016 4001,150.996094,4.181528,235.291975,43.970869,1,nan,1.5533,0.1631,45.2826,0.015 4007,150.820312,3.732834,235.666318,43.572109,1,nan,0.7855,0.0241,43.4536,0.016 4016,346.130127,-63.072620,321.423103,-50.042305,1,nan,1.2676,0.0738,44.7372,0.020 4023,0.589520,-47.161343,325.385896,-67.769893,1,nan,0.9574,0.1398,43.9835,0.009 4025,33.222656,-4.780192,167.515653,-60.396584,1,nan,0.0000,0.0000,nan,0.018 4038,35.859375,-4.630479,171.270769,-58.580806,1,nan,1.1349,0.0866,44.4401,0.022 4044,151.171875,2.238686,237.619933,42.994783,1,nan,0.6693,0.0222,43.0278,0.024 4054,34.453125,-5.229529,169.987075,-59.956185,1,0.5653,0.5478,0.0237,42.4998,0.019 4062,358.312500,-44.993881,332.185785,-68.685906,1,nan,1.1896,0.0988,44.5666,0.009 4063,348.586945,-64.573555,318.693903,-49.477869,1,nan,0.6406,0.1744,42.9119,0.018 4065,35.332031,-5.979157,172.286722,-59.931743,1,nan,1.3023,0.0984,44.8098,0.022 4077,150.468750,1.641510,237.714575,42.075234,1,nan,1.1662,0.1634,44.5133,0.017 4103,349.160583,-64.760857,318.219706,-49.458924,1,0.3113,0.3331,0.0201,41.2185,0.020 4109,52.910156,-26.276812,220.926149,-54.363918,1,nan,0.0000,0.0000,nan,0.008 4183,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.8906,0.0209,43.7897,0.009 4197,54.667969,-27.615883,223.610785,-53.050840,1,nan,0.7358,0.0471,43.2794,0.009 4201,34.453125,-5.229529,169.987075,-59.956185,1,0.2193,0.2338,0.1899,40.3374,0.019 
4202,33.398438,-3.732834,166.492280,-59.466614,1,0.6916,0.7529,0.0348,43.3408,0.022 4216,53.964844,-28.630989,225.142950,-53.813613,1,0.4807,0.4515,0.0246,41.9964,0.009 4224,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.6240,0.0370,42.8426,0.022 4225,346.562500,-63.448284,320.824720,-49.866957,1,0.2704,0.3546,0.2924,41.3770,0.021 4249,51.679688,-27.447618,222.618229,-55.642263,1,nan,1.3003,0.1720,44.8058,0.010 4262,346.130127,-63.072620,321.423103,-50.042305,1,nan,0.7108,0.0343,43.1877,0.020 4267,33.222656,-4.780192,167.515653,-60.396584,1,0.4186,0.3797,0.0298,41.5512,0.018 4272,53.613281,-26.944359,222.237403,-53.863858,1,0.2892,0.2718,0.0268,40.7089,0.009 4274,351.321442,-64.198746,317.458993,-50.429931,1,0.6479,0.6553,0.0091,42.9721,0.023 4278,149.414062,2.238686,236.239766,41.565558,1,nan,0.8546,0.0815,43.6790,0.017 4282,53.085938,-27.784405,223.525509,-54.460748,1,nan,0.4929,0.0100,42.2237,0.007 4283,52.910156,-27.279613,222.625192,-54.536648,1,0.3708,0.3947,0.0245,41.6505,0.007 4293,1.753247,-46.768478,324.030235,-68.498041,1,nan,1.6305,0.1495,45.4125,0.014 4294,150.117188,2.836105,236.124718,42.483719,1,nan,0.8510,0.1009,43.6678,0.016 4300,51.855469,-26.276812,220.627031,-55.293792,1,0.6127,0.6028,0.0088,42.7511,0.014 4303,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.9797,0.0098,44.0451,0.018 4304,34.277344,-5.679190,170.314930,-60.410322,1,nan,0.2127,0.0101,40.1063,0.020 4306,151.171875,2.537361,237.288526,43.169764,1,nan,1.4930,0.1628,45.1765,0.024 4313,149.414062,3.433834,234.919132,42.245550,1,nan,1.1395,0.1626,44.4510,0.027 4322,151.347656,4.181528,235.568369,44.259942,1,0.7816,0.7648,0.0184,43.3824,0.016 4325,34.980469,-6.279288,172.180075,-60.389399,1,nan,0.6802,0.0118,43.0708,0.023 4330,53.613281,-28.630989,225.073365,-54.119461,1,nan,1.1409,0.1366,44.4543,0.006 4337,34.804688,-5.829153,171.307861,-60.174401,1,nan,0.6082,0.0100,42.7747,0.023 4347,35.683594,-5.379379,171.992947,-59.253501,1,nan,1.5051,0.0376,45.1982,0.020 
4369,53.964844,-28.630989,225.142950,-53.813613,1,nan,1.1974,0.0630,44.5843,0.009 4372,351.259003,-64.386185,317.344860,-50.255113,1,0.8588,0.8990,0.0160,43.8146,0.020 4376,33.222656,-4.780192,167.515653,-60.396584,1,nan,0.9810,0.0139,44.0488,0.018 4390,53.437500,-29.142223,225.908120,-54.336118,1,nan,1.0152,0.3139,44.1407,0.008 4424,151.699219,3.583322,236.533224,44.205648,1,nan,1.0317,0.0256,44.1840,0.016 4467,53.261719,-27.615883,223.280041,-54.281374,1,0.3489,0.3023,0.1233,40.9750,0.006 4469,150.820312,1.641510,237.994507,42.358984,1,nan,1.0352,0.2064,44.1930,0.020 4475,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.5952,0.0666,42.7177,0.014 4489,35.332031,-5.979157,172.286722,-59.931743,1,nan,0.3478,0.0152,41.3280,0.022 4490,1.723404,-45.981140,325.117958,-69.180825,1,nan,1.1304,0.0573,44.4295,0.010 4492,358.636353,-46.768478,328.890146,-67.388837,1,0.2686,0.6256,0.1162,42.8492,0.008 4494,2.097458,-45.783966,324.737840,-69.478613,1,0.4329,0.4492,0.0147,41.9831,0.011 4507,35.683594,-5.379379,171.992947,-59.253501,1,nan,1.1883,0.0370,44.5636,0.020 4508,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.7777,0.0513,43.4272,0.021 4525,149.589844,3.583322,234.885369,42.474696,1,nan,0.7945,0.0729,43.4840,0.024 4528,359.811707,-45.191612,329.485675,-69.150905,1,0.6181,0.5632,0.0527,42.5723,0.010 4533,150.996094,2.985506,236.647967,43.287350,1,nan,0.9019,0.2483,43.8233,0.020 4551,358.312500,-44.993881,332.185785,-68.685906,1,0.2666,0.6085,0.0540,42.7761,0.009 4559,0.929752,-44.597992,328.531426,-70.083244,1,0.3229,0.3027,0.6578,40.9778,0.011 4561,351.953644,-62.132156,318.777388,-52.347124,1,nan,0.7697,0.0339,43.3995,0.019 4576,150.292969,2.686724,236.427488,42.541447,1,0.5613,0.5636,0.0277,42.5742,0.016 4586,2.071130,-45.191612,325.606223,-69.989264,1,0.3810,0.3822,0.0308,41.5678,0.011 4589,150.820312,1.641510,237.994507,42.358984,1,nan,1.2748,0.1135,44.7525,0.020 4592,52.207031,-26.610098,221.298836,-55.042928,1,0.7073,0.7753,0.0429,43.4190,0.014 
4597,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.6120,0.0070,42.7913,0.018 4611,53.085938,-27.111860,222.384291,-54.355086,1,nan,0.8291,0.0669,43.5981,0.007 4625,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.7723,0.0357,43.4086,0.021 4644,35.859375,-4.630479,171.270769,-58.580806,1,0.4138,0.4027,0.0331,41.7017,0.022 4645,358.665253,-45.783966,330.353593,-68.203652,1,nan,0.6546,0.1405,42.9690,0.009 4653,150.292969,2.686724,236.427488,42.541447,1,nan,0.9818,0.0169,44.0509,0.016 4673,352.711273,-63.823658,316.922299,-51.059403,1,nan,0.5303,0.0066,42.4145,0.024 4677,53.085938,-27.784405,223.525509,-54.460748,1,0.3093,0.1395,0.2071,39.0948,0.007 4695,349.966217,-62.696659,319.542989,-51.376556,1,0.6195,0.6168,0.0156,42.8120,0.021 4707,359.814819,-44.399834,330.775011,-69.801007,1,nan,0.8457,0.0478,43.6511,0.009 4712,0.190678,-45.783966,327.956322,-68.803772,1,0.1206,0.5167,0.0880,42.3468,0.005 4713,358.665253,-45.783966,330.353593,-68.203652,1,0.8042,0.7677,0.0483,43.3928,0.009 4719,0.965665,-46.375080,325.845907,-68.579427,1,0.3619,0.3727,0.0303,41.5035,0.007 4720,359.805206,-46.768478,327.135979,-67.829903,1,0.5530,0.5247,0.1218,42.3871,0.011 4724,352.398651,-62.696659,318.017427,-51.967966,1,0.5390,0.4826,0.1275,42.1692,0.020 4738,35.332031,-5.979157,172.286722,-59.931743,1,nan,1.0768,0.0572,44.2988,0.022 4739,51.855469,-26.276812,220.627031,-55.293792,1,nan,0.7661,0.0148,43.3871,0.014 4750,0.965665,-46.375080,325.845907,-68.579427,1,0.4822,0.2376,0.3459,40.3769,0.007 4759,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.7922,0.2472,43.4764,0.010 4760,350.230255,-61.943836,320.053946,-52.070537,1,nan,1.1776,0.0799,44.5394,0.017 4762,51.679688,-27.447618,222.618229,-55.642263,1,nan,1.0371,0.2322,44.1980,0.010 4768,349.285706,-62.884678,319.786163,-51.046461,1,nan,1.0012,0.0457,44.1033,0.018 4776,150.117188,2.238686,236.784618,42.139082,1,0.4602,0.4465,0.0429,41.9677,0.016 4812,151.171875,1.342993,238.602520,42.464379,1,nan,0.8278,0.0161,43.5940,0.026 
4822,149.589844,3.583322,234.885369,42.474696,1,nan,0.8752,0.0381,43.7427,0.024 4824,346.655182,-63.260487,320.952196,-50.040935,1,0.4116,0.3961,1.0457,41.6592,0.019 4825,150.468750,3.732834,235.392208,43.283244,1,nan,0.8466,0.0165,43.6540,0.020 4830,1.666667,-44.399834,327.519190,-70.529554,1,nan,0.8267,0.0561,43.5904,0.009 4833,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.6975,0.0469,43.1373,0.011 4834,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.6570,0.0487,42.9789,0.008 4840,151.171875,2.537361,237.288526,43.169764,1,nan,0.9905,0.0337,44.0747,0.024 4844,34.277344,-5.679190,170.314930,-60.410322,1,nan,1.0144,0.1816,44.1385,0.020 4853,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.8895,0.1178,43.7862,0.017 4854,0.949367,-45.586655,326.991548,-69.251686,1,nan,0.7034,0.0178,43.1597,0.013 4864,151.171875,1.342993,238.602520,42.464379,1,0.1623,0.1825,0.0221,39.7354,0.026 4866,351.953644,-62.132156,318.777388,-52.347124,1,nan,0.6113,0.0972,42.7883,0.019 4903,151.171875,2.238686,237.619933,42.994783,1,nan,0.4125,0.0223,41.7635,0.024 4910,1.723404,-45.981140,325.117958,-69.180825,1,nan,0.6558,0.0453,42.9739,0.010 4934,52.031250,-26.443335,220.963669,-55.168557,1,nan,1.2051,0.0888,44.6015,0.014 4937,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.5072,0.0257,42.2985,0.010 4948,151.171875,2.537361,237.288526,43.169764,1,nan,1.0614,0.0814,44.2604,0.024 4986,32.871094,-4.780192,166.959493,-60.615132,1,0.5554,0.4904,0.4394,42.2108,0.017 4990,149.414062,1.940072,236.565366,41.393323,1,nan,0.6203,0.0205,42.8268,0.018 4994,150.468750,1.641510,237.714575,42.075234,1,nan,0.8252,0.0451,43.5856,0.017 5024,352.711273,-63.823658,316.922299,-51.059403,1,0.3343,0.5197,0.1503,42.3618,0.024 5034,346.276581,-64.011238,320.448031,-49.344136,1,nan,0.2533,0.8554,40.5348,0.019 5057,150.468750,3.732834,235.392208,43.283244,1,0.8187,0.7582,0.0593,43.3595,0.020 5061,150.292969,2.686724,236.427488,42.541447,1,0.4812,0.4838,0.0111,42.1754,0.016 
5084,52.207031,-26.610098,221.298836,-55.042928,1,nan,1.0849,0.0363,44.3190,0.014 5088,349.160583,-64.760857,318.219706,-49.458924,1,0.7677,0.7828,0.0261,43.4447,0.020 5094,2.071130,-45.191612,325.606223,-69.989264,1,nan,0.3655,0.0077,41.4542,0.011 5099,1.753247,-46.768478,324.030235,-68.498041,1,nan,0.3400,0.2409,41.2708,0.014 5135,33.925781,-5.979157,170.179895,-60.866303,1,0.7377,0.7425,0.0053,43.3035,0.022 5145,150.468750,1.641510,237.714575,42.075234,1,nan,1.1364,0.0385,44.4437,0.017 5153,149.414062,2.238686,236.239766,41.565558,1,0.7765,0.7695,0.0155,43.3989,0.017 5158,359.415588,-46.768478,327.729895,-67.686097,1,0.4004,0.4266,0.0302,41.8498,0.009 5162,52.558594,-27.279613,222.538937,-54.845107,1,nan,0.5872,0.0094,42.6823,0.008 5166,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.7222,0.0331,43.2299,0.008 5176,359.814819,-44.399834,330.775011,-69.801007,1,nan,0.8147,0.0386,43.5513,0.009 5204,33.574219,-4.780192,168.064587,-60.175886,1,0.4879,0.5068,0.0211,42.2965,0.019 5210,33.574219,-4.780192,168.064587,-60.175886,1,0.4971,0.4954,0.0328,42.2371,0.019 5217,53.261719,-27.615883,223.280041,-54.281374,1,nan,1.6018,0.0726,45.3649,0.006 5234,51.328125,-27.784405,223.130589,-55.999499,1,0.7616,0.8177,0.0589,43.5609,0.013 5236,35.332031,-5.979157,172.286722,-59.931743,1,0.3651,0.4588,0.1365,42.0379,0.022 5249,348.586945,-64.573555,318.693903,-49.477869,1,nan,0.9021,0.0339,43.8239,0.018 5264,347.617462,-62.508568,321.121462,-50.904708,1,nan,0.6893,0.0103,43.1059,0.019 5278,53.789062,-27.784405,223.685697,-53.845803,1,0.4394,0.3893,0.1926,41.6151,0.009 5280,351.299988,-62.320400,319.038597,-52.026867,1,nan,1.2309,0.2137,44.6583,0.018 5283,347.013428,-62.508568,321.472056,-50.735330,1,nan,0.6296,0.1513,42.8661,0.018 5286,347.846710,-64.760857,318.929827,-49.143596,1,0.5384,0.4991,0.1472,42.2564,0.019 5307,2.457983,-45.389202,324.632685,-69.945696,1,0.4932,0.4967,0.0234,42.2437,0.011 
5313,349.891296,-64.573555,317.972107,-49.786192,1,0.3924,0.4004,0.3110,41.6868,0.023 5315,359.805206,-46.768478,327.135979,-67.829903,1,nan,0.7107,0.0354,43.1872,0.011 5317,33.574219,-5.379379,168.838090,-60.637536,1,0.2383,0.2830,0.1436,40.8096,0.017 5319,148.710938,2.836105,235.050801,41.328739,1,nan,0.5440,0.0154,42.4816,0.031 5322,0.929752,-44.597992,328.531426,-70.083244,1,nan,1.0052,0.0257,44.1143,0.011 5335,348.908447,-63.823658,319.169886,-50.176186,1,0.2108,0.2283,0.1361,40.2787,0.018 5338,152.050781,3.284369,237.157374,44.318466,1,nan,0.8439,0.0429,43.6452,0.019 5354,34.980469,-6.279288,172.180075,-60.389399,1,0.2692,0.3663,0.1994,41.4598,0.023 5365,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.9173,0.0221,43.8686,0.018 5370,358.636353,-46.768478,328.890146,-67.388837,1,0.7437,0.6297,0.0526,42.8666,0.008 5372,348.908447,-63.823658,319.169886,-50.176186,1,0.5219,0.5300,0.0576,42.4134,0.018 5386,34.453125,-5.229529,169.987075,-59.956185,1,nan,0.6316,0.0102,42.8746,0.019 5403,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.9827,0.0233,44.0533,0.009 5412,151.347656,4.181528,235.568369,44.259942,1,0.3435,0.3761,0.2378,41.5268,0.016 5417,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.7730,0.0296,43.4109,0.017 5419,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.5571,0.0159,42.5440,0.008 5422,0.190678,-45.783966,327.956322,-68.803772,1,nan,1.1195,0.1674,44.4033,0.005 5433,53.085938,-28.122234,224.100909,-54.509752,1,nan,1.1700,0.1749,44.5221,0.007 5453,150.117188,3.732834,235.120533,42.993809,1,nan,0.5035,0.3843,42.2794,0.020 5473,53.437500,-29.142223,225.908120,-54.336118,1,nan,0.5390,0.0204,42.4575,0.008 5478,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.9349,0.0204,43.9195,0.018 5520,152.050781,2.985506,237.495952,44.143927,1,0.2992,0.2421,0.0411,40.4233,0.019 5521,51.328125,-27.784405,223.130589,-55.999499,1,nan,0.9909,0.0426,44.0758,0.013 5535,53.261719,-27.615883,223.280041,-54.281374,1,nan,0.9054,0.0151,43.8337,0.006 
5543,33.398438,-3.732834,166.492280,-59.466614,1,nan,0.8296,0.1240,43.5996,0.022 5550,151.171875,1.342993,238.602520,42.464379,1,nan,0.5898,0.0156,42.6937,0.026 5562,351.734680,-62.884678,318.284128,-51.651217,1,0.3430,0.4407,0.1538,41.9338,0.019 5564,351.321442,-64.198746,317.458993,-50.429931,1,nan,0.6240,0.0197,42.8426,0.023 5565,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.7586,0.0376,43.3607,0.011 5566,347.013428,-62.508568,321.472056,-50.735330,1,nan,0.9684,0.0328,44.0142,0.018 5569,33.574219,-5.079716,168.448505,-60.407218,1,0.2696,0.2658,0.6279,40.6536,0.016 5570,347.861847,-61.943836,321.519104,-51.424048,1,nan,0.6613,0.0352,42.9962,0.017 5576,0.589520,-47.161343,325.385896,-67.769893,1,nan,1.4255,0.1860,45.0525,0.009 5591,350.230255,-61.943836,320.053946,-52.070537,1,nan,0.7811,0.0143,43.4387,0.017 5623,1.753247,-46.768478,324.030235,-68.498041,1,0.3915,0.4107,1.0343,41.7522,0.014 5635,359.415588,-46.768478,327.729895,-67.686097,1,nan,1.0468,0.1246,44.2231,0.009 5655,1.753247,-46.768478,324.030235,-68.498041,1,nan,1.4730,0.2564,45.1403,0.014 5656,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.9835,0.0168,44.0555,0.011 5661,35.683594,-5.379379,171.992947,-59.253501,1,nan,0.2668,0.2983,40.6628,0.020 5696,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.8087,0.0795,43.5316,0.018 5704,350.230255,-61.943836,320.053946,-52.070537,1,nan,0.5054,0.0636,42.2892,0.017 5723,351.321442,-64.198746,317.458993,-50.429931,1,nan,0.8582,0.1911,43.6902,0.023 5729,53.085938,-27.784405,223.525509,-54.460748,1,nan,1.3930,0.0637,44.9906,0.007 5758,52.910156,-25.944481,220.366350,-54.301439,1,nan,0.9711,0.0522,44.0216,0.010 5761,150.468750,1.641510,237.714575,42.075234,1,nan,0.9866,0.2871,44.0641,0.017 5763,349.891296,-64.573555,317.972107,-49.786192,1,nan,1.0138,0.2387,44.1370,0.023 5794,351.299988,-62.320400,319.038597,-52.026867,1,nan,1.3563,0.0554,44.9189,0.018 5798,51.328125,-27.784405,223.130589,-55.999499,1,nan,0.8854,0.0513,43.7737,0.013 
5801,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.8968,0.0208,43.8080,0.006 5817,34.277344,-5.079716,169.526841,-59.956640,1,0.5298,0.5217,0.0200,42.3721,0.019 5834,352.398651,-62.696659,318.017427,-51.967966,1,nan,0.8078,0.0389,43.5284,0.020 5844,2.097458,-45.783966,324.737840,-69.478613,1,nan,1.1469,0.0339,44.4683,0.011 5852,0.190678,-45.783966,327.956322,-68.803772,1,nan,0.8192,0.0304,43.5659,0.005 5864,51.855469,-28.630989,224.733260,-55.649872,1,nan,1.3871,0.1040,44.9791,0.009 5877,151.347656,4.181528,235.568369,44.259942,1,0.3096,0.3166,0.7338,41.0910,0.016 5881,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.5213,0.0134,42.3702,0.018 5895,348.908447,-63.823658,319.169886,-50.176186,1,nan,0.8744,0.0299,43.7403,0.018 5911,33.750000,-4.630479,168.146242,-59.949072,1,0.2117,0.5363,0.4038,42.4441,0.019 5922,1.694561,-45.191612,326.278557,-69.858253,1,0.1845,0.2240,0.3763,40.2324,0.011 5924,51.855469,-28.630989,224.733260,-55.649872,1,0.7987,0.8068,0.0181,43.5253,0.009 5930,149.589844,3.583322,234.885369,42.474696,1,nan,1.3364,0.0830,44.8793,0.024 5937,2.457983,-45.389202,324.632685,-69.945696,1,nan,1.2012,0.0366,44.5926,0.011 5954,347.617462,-62.508568,321.121462,-50.904708,1,nan,1.1290,0.0618,44.4261,0.019 5956,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.8904,0.0380,43.7889,0.009 5957,150.468750,3.732834,235.392208,43.283244,1,nan,0.7308,0.0756,43.2613,0.020 5978,346.655182,-63.260487,320.952196,-50.040935,1,nan,0.5152,0.0154,42.3393,0.019 5982,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.4710,0.0617,42.1060,0.022 5995,359.058563,-45.191612,330.695783,-68.844915,1,nan,1.5541,0.1950,45.2840,0.011 6000,0.574468,-45.981140,327.041068,-68.778764,1,nan,1.2101,0.0361,44.6126,0.006 6001,53.437500,-29.142223,225.908120,-54.336118,1,nan,0.8961,0.0542,43.8060,0.008 6003,349.891296,-64.573555,317.972107,-49.786192,1,0.6069,0.5395,0.0894,42.4600,0.023 6017,151.347656,4.181528,235.568369,44.259942,1,nan,0.5050,0.5080,42.2869,0.016 
6021,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.9940,0.0435,44.0841,0.008 6022,352.711273,-63.823658,316.922299,-51.059403,1,nan,0.2460,0.2146,40.4626,0.024 6023,348.595886,-63.072620,320.023289,-50.713060,1,nan,0.4699,0.0341,42.0998,0.021 6028,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.7000,0.0060,43.1467,0.009 6030,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.7964,0.0420,43.4906,0.008 6076,0.190678,-45.783966,327.956322,-68.803772,1,nan,1.2728,0.0758,44.7483,0.005 6087,51.679688,-27.447618,222.618229,-55.642263,1,nan,0.8998,0.0507,43.8170,0.010 6091,359.058563,-45.191612,330.695783,-68.844915,1,0.1812,0.1971,0.0181,39.9205,0.011 6092,348.595886,-63.072620,320.023289,-50.713060,1,nan,1.2574,0.2139,44.7155,0.021 6096,351.953644,-62.132156,318.777388,-52.347124,1,nan,1.1012,0.0271,44.3592,0.019 6106,150.117188,2.836105,236.124718,42.483719,1,nan,0.5103,0.0193,42.3145,0.016 6119,1.708861,-45.586655,325.688716,-69.520253,1,0.3975,0.4588,0.5979,42.0377,0.011 6120,346.655182,-63.260487,320.952196,-50.040935,1,nan,1.1674,0.4165,44.5161,0.019 6126,53.085938,-27.111860,222.384291,-54.355086,1,nan,0.7902,0.0393,43.4696,0.007 6135,348.586945,-64.573555,318.693903,-49.477869,1,0.8477,0.8451,0.1229,43.6492,0.018 6151,150.292969,2.686724,236.427488,42.541447,1,nan,0.8598,0.0520,43.6952,0.016 6162,2.071130,-45.191612,325.606223,-69.989264,1,nan,1.3977,0.0957,44.9997,0.011 6164,34.980469,-6.279288,172.180075,-60.389399,1,nan,1.5785,0.3260,45.3258,0.023 6173,150.117188,2.836105,236.124718,42.483719,1,0.3881,0.3821,0.0142,41.5673,0.016 6179,348.586945,-64.573555,318.693903,-49.477869,1,0.5991,0.5030,0.0291,42.2767,0.018 6187,349.891296,-64.573555,317.972107,-49.786192,1,nan,1.1064,0.0831,44.3719,0.023 6191,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.2328,0.0166,40.3267,0.006 6197,151.523438,3.134927,236.900695,43.803170,1,nan,1.0306,0.0174,44.1812,0.019 6205,33.574219,-6.579593,170.455585,-61.548219,1,0.5161,0.5873,0.0270,42.6825,0.021 
6211,0.949367,-45.586655,326.991548,-69.251686,1,0.5940,0.5967,0.0095,42.7247,0.013 6217,35.683594,-5.379379,171.992947,-59.253501,1,nan,0.6309,0.0313,42.8717,0.020 6223,349.046051,-61.943836,320.796530,-51.753706,1,nan,0.2937,0.0805,40.9020,0.017 6287,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.6517,0.0263,42.9572,0.009 6289,52.207031,-26.610098,221.298836,-55.042928,1,0.2186,0.3975,0.2967,41.6682,0.014 6293,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.8837,0.0179,43.7686,0.022 6297,53.964844,-28.630989,225.142950,-53.813613,1,0.5674,0.6449,0.0377,42.9295,0.009 6308,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.9701,0.0203,44.0187,0.021 6313,349.966217,-62.696659,319.542989,-51.376556,1,nan,0.3825,0.0215,41.5700,0.021 6343,33.398438,-4.331149,167.226341,-59.936551,1,nan,1.0126,0.0334,44.1339,0.018 6345,0.190678,-45.783966,327.956322,-68.803772,1,nan,1.0255,0.0178,44.1678,0.005 6347,149.414062,3.433834,234.919132,42.245550,1,nan,0.2243,0.0261,40.2360,0.027 6350,33.574219,-5.379379,168.838090,-60.637536,1,nan,1.2142,0.1253,44.6216,0.017 6352,150.996094,4.181528,235.291975,43.970869,1,nan,1.4079,0.0591,45.0191,0.015 6354,34.804688,-5.829153,171.307861,-60.174401,1,0.3701,0.4966,0.3999,42.2433,0.023 6368,351.299988,-62.320400,319.038597,-52.026867,1,nan,0.9252,0.0390,43.8918,0.018 6369,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.5996,0.0136,42.7374,0.017 6372,359.805206,-46.768478,327.135979,-67.829903,1,0.7964,0.7850,0.0422,43.4520,0.011 6376,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.7511,0.0307,43.3345,0.018 6378,348.529419,-61.755440,321.293980,-51.763351,1,nan,0.6002,0.0209,42.7397,0.016 6390,0.589520,-47.161343,325.385896,-67.769893,1,0.5421,0.4998,0.2593,42.2602,0.009 6391,34.101562,-5.829153,170.247753,-60.638325,1,nan,1.1253,0.1662,44.4173,0.019 6402,51.679688,-27.447618,222.618229,-55.642263,1,0.4580,0.4038,0.0453,41.7087,0.010 6405,34.804688,-5.829153,171.307861,-60.174401,1,nan,1.3943,0.2658,44.9931,0.023 
6436,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.4164,0.3756,41.7878,0.006 6437,33.925781,-5.979157,170.179895,-60.866303,1,0.4279,0.4344,0.0322,41.8968,0.022 6450,0.190678,-45.783966,327.956322,-68.803772,1,nan,0.7339,0.0231,43.2726,0.005 6458,352.711273,-63.823658,316.922299,-51.059403,1,1.1032,1.0971,0.1861,44.3491,0.024 6460,51.679688,-27.447618,222.618229,-55.642263,1,nan,0.3954,0.0255,41.6549,0.010 6470,358.636353,-46.768478,328.890146,-67.388837,1,nan,1.0362,0.0170,44.1958,0.008 6474,51.855469,-27.953188,223.543603,-55.561470,1,nan,0.7091,0.1441,43.1812,0.008 6498,348.586945,-64.573555,318.693903,-49.477869,1,nan,1.1153,0.1670,44.3933,0.018 6514,2.071130,-45.191612,325.606223,-69.989264,1,0.3119,0.3565,0.4554,41.3906,0.011 6515,34.804688,-5.829153,171.307861,-60.174401,1,nan,1.1225,0.0927,44.4106,0.023 6537,152.050781,2.985506,237.495952,44.143927,1,nan,1.2730,0.1522,44.7487,0.019 6546,53.085938,-28.122234,224.100909,-54.509752,1,0.2509,0.3826,0.2720,41.5706,0.007 6548,148.710938,2.836105,235.050801,41.328739,1,nan,1.1286,0.1579,44.4252,0.031 6555,151.699219,3.583322,236.533224,44.205648,1,0.2053,0.2236,0.0177,40.2274,0.016 6560,53.613281,-27.953188,223.929533,-54.024772,1,nan,0.7824,0.0662,43.4432,0.007 6567,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.7820,0.0137,43.4419,0.017 6568,51.679688,-27.447618,222.618229,-55.642263,1,nan,1.0385,0.0242,44.2017,0.010 6575,33.398438,-3.732834,166.492280,-59.466614,1,0.5197,0.5580,0.0155,42.5483,0.022 6598,0.589520,-47.161343,325.385896,-67.769893,1,nan,0.7918,0.0238,43.4750,0.009 6606,151.523438,3.134927,236.900695,43.803170,1,nan,0.5715,0.0474,42.6108,0.019 6620,351.259003,-64.386185,317.344860,-50.255113,1,nan,0.9861,0.0153,44.0628,0.020 6623,33.750000,-4.630479,168.146242,-59.949072,1,nan,0.8838,0.0439,43.7691,0.019 6633,349.891296,-64.573555,317.972107,-49.786192,1,0.5312,0.5073,0.1220,42.2989,0.023 6654,150.468750,3.732834,235.392208,43.283244,1,0.8058,0.8088,0.0111,43.5318,0.020 
6655,0.929752,-44.597992,328.531426,-70.083244,1,0.6631,0.6768,0.0337,43.0574,0.011 6663,52.910156,-26.276812,220.926149,-54.363918,1,nan,0.4449,0.3362,41.9584,0.008 6673,150.820312,3.732834,235.666318,43.572109,1,0.5028,0.5238,0.0312,42.3823,0.016 6682,352.711273,-63.823658,316.922299,-51.059403,1,0.2960,0.5328,0.7393,42.4271,0.024 6692,1.694561,-45.191612,326.278557,-69.858253,1,nan,0.4504,0.0611,41.9897,0.011 6704,348.586945,-64.573555,318.693903,-49.477869,1,0.3131,0.2939,0.0390,40.9044,0.018 6723,349.160583,-64.760857,318.219706,-49.458924,1,0.2401,0.2742,0.1134,40.7312,0.020 6729,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.9192,0.0320,43.8743,0.011 6742,349.615387,-63.636005,318.927246,-50.506542,1,nan,1.7240,0.2208,45.5614,0.018 6756,359.816315,-44.003082,331.451340,-70.123054,1,nan,1.0236,0.0293,44.1629,0.013 6814,152.050781,3.284369,237.157374,44.318466,1,0.6160,0.6226,0.6151,42.8365,0.019 6821,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.3834,0.0765,41.5757,0.011 6827,358.636353,-46.768478,328.890146,-67.388837,1,nan,1.1052,0.1737,44.3688,0.008 6833,150.996094,2.388015,237.313912,42.939977,1,nan,1.4382,0.0724,45.0762,0.021 6837,32.871094,-4.780192,166.959493,-60.615132,1,nan,0.8994,0.0420,43.8158,0.017 6847,33.574219,-5.079716,168.448505,-60.407218,1,nan,0.6653,0.0303,43.0120,0.016 6855,346.500000,-62.320400,321.951129,-50.736054,1,nan,0.9313,0.0387,43.9092,0.020 6856,1.363636,-46.768478,324.669342,-68.371416,1,0.3068,0.2935,0.0090,40.9003,0.008 6876,150.644531,3.583322,235.698235,43.342784,1,0.3246,0.3216,0.4661,41.1300,0.018 6878,32.871094,-4.780192,166.959493,-60.615132,1,0.5919,0.5848,0.1811,42.6713,0.017 6884,52.910156,-27.279613,222.625192,-54.536648,1,0.2612,0.2825,1.1744,40.8053,0.007 6897,359.814819,-44.399834,330.775011,-69.801007,1,nan,0.6130,0.0239,42.7954,0.009 6907,151.171875,1.342993,238.602520,42.464379,1,0.4791,0.5900,0.3591,42.6946,0.026 6911,0.929752,-44.597992,328.531426,-70.083244,1,0.3125,0.3006,0.0066,40.9604,0.011 
6919,346.655182,-63.260487,320.952196,-50.040935,1,nan,1.6301,0.2474,45.4117,0.019 6927,151.171875,2.537361,237.288526,43.169764,1,nan,0.5389,0.2528,42.4568,0.024 6930,52.207031,-26.610098,221.298836,-55.042928,1,0.6478,0.5687,0.2240,42.5980,0.014 6932,52.910156,-27.953188,223.774083,-54.639214,1,nan,1.3042,0.0324,44.8137,0.007 6937,0.965665,-46.375080,325.845907,-68.579427,1,nan,0.8820,0.0370,43.7636,0.007 6949,346.130127,-63.072620,321.423103,-50.042305,1,nan,0.8422,0.0195,43.6399,0.020 6951,0.589520,-47.161343,325.385896,-67.769893,1,0.5372,0.5224,0.0077,42.3758,0.009 6952,150.117188,2.238686,236.784618,42.139082,1,nan,1.7307,0.2954,45.5719,0.016 6964,359.816315,-44.003082,331.451340,-70.123054,1,nan,1.1443,0.2764,44.4622,0.013 6982,358.312500,-44.993881,332.185785,-68.685906,1,0.3227,0.3189,0.0263,41.1089,0.009 6988,150.996094,2.985506,236.647967,43.287350,1,nan,0.8042,0.0261,43.5165,0.020 7013,34.277344,-5.679190,170.314930,-60.410322,1,nan,0.5236,0.0416,42.3817,0.020 7017,349.429535,-62.508568,320.039643,-51.393745,1,nan,0.3210,0.0095,41.1251,0.020 7021,358.665253,-45.783966,330.353593,-68.203652,1,nan,0.9959,0.0260,44.0892,0.009 7032,51.328125,-27.784405,223.130589,-55.999499,1,nan,0.8584,0.0214,43.6910,0.013 7055,34.277344,-5.679190,170.314930,-60.410322,1,nan,0.9347,0.0439,43.9190,0.020 7060,52.207031,-28.291550,224.208534,-55.300157,1,nan,0.6635,0.0179,43.0048,0.007 7065,52.910156,-26.276812,220.926149,-54.363918,1,nan,1.5901,0.1813,45.3453,0.008 7129,51.328125,-27.447618,222.535046,-55.950727,1,nan,1.1651,0.0362,44.5107,0.013 7140,150.820312,3.134927,236.341348,43.230123,1,nan,0.9363,0.0122,43.9237,0.016 7161,1.363636,-46.768478,324.669342,-68.371416,1,0.3784,0.3433,0.1675,41.2947,0.008 7162,52.910156,-27.953188,223.774083,-54.639214,1,0.3593,0.4691,0.9910,42.0956,0.007 7163,53.085938,-27.784405,223.525509,-54.460748,1,nan,1.1234,0.1962,44.4129,0.007 7172,151.171875,1.342993,238.602520,42.464379,1,nan,0.6339,0.0129,42.8842,0.026 
7175,347.617462,-62.508568,321.121462,-50.904708,1,nan,1.0926,0.1310,44.3382,0.019 7224,53.964844,-28.630989,225.142950,-53.813613,1,nan,1.2376,0.0743,44.6730,0.009 7226,52.207031,-26.610098,221.298836,-55.042928,1,0.3847,0.3273,0.8615,41.1744,0.014 7233,35.683594,-5.379379,171.992947,-59.253501,1,nan,1.2494,0.2608,44.6985,0.020 7241,150.820312,3.732834,235.666318,43.572109,1,nan,0.7200,0.0438,43.2218,0.016 7246,151.347656,4.181528,235.568369,44.259942,1,nan,0.6039,0.0147,42.7560,0.016 7265,151.171875,2.238686,237.619933,42.994783,1,nan,0.7073,0.0647,43.1743,0.024 7275,1.708861,-45.586655,325.688716,-69.520253,1,0.4268,0.4094,0.0089,41.7441,0.011 7282,51.855469,-28.630989,224.733260,-55.649872,1,nan,1.4386,0.0998,45.0769,0.009 7292,33.574219,-5.379379,168.838090,-60.637536,1,nan,0.7780,0.0880,43.4282,0.017 7297,0.574468,-45.981140,327.041068,-68.778764,1,0.5290,0.5535,0.4055,42.5268,0.006 7326,351.299988,-62.320400,319.038597,-52.026867,1,0.2394,0.2364,0.0161,40.3644,0.018 7344,33.398438,-4.331149,167.226341,-59.936551,1,0.5794,0.5951,0.0111,42.7176,0.018 7378,347.846710,-64.760857,318.929827,-49.143596,1,nan,1.0318,0.1438,44.1843,0.019 7381,52.031250,-26.443335,220.963669,-55.168557,1,0.3373,0.5151,0.2782,42.3389,0.014 7385,52.910156,-26.276812,220.926149,-54.363918,1,0.7536,0.7669,0.0410,43.3898,0.008 7389,359.446716,-44.201530,331.730015,-69.805709,1,0.6180,0.6621,0.1010,42.9994,0.010 7447,0.190678,-45.783966,327.956322,-68.803772,1,nan,0.6276,0.0506,42.8577,0.005 7451,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.7431,0.0243,43.3058,0.019 7464,346.500000,-62.320400,321.951129,-50.736054,1,0.5846,0.6343,0.0145,42.8857,0.020 7479,359.058563,-45.191612,330.695783,-68.844915,1,0.3899,0.3252,0.7307,41.1585,0.011 7496,0.190678,-45.783966,327.956322,-68.803772,1,nan,0.6583,0.0121,42.9840,0.005 7508,34.980469,-6.279288,172.180075,-60.389399,1,nan,1.1198,0.0531,44.4042,0.023 7515,150.820312,3.134927,236.341348,43.230123,1,nan,1.0715,0.0116,44.2857,0.016 
7535,33.222656,-4.780192,167.515653,-60.396584,1,nan,1.4866,0.1394,45.1649,0.018 7556,53.085938,-27.784405,223.525509,-54.460748,1,0.4003,0.4870,0.4078,42.1925,0.007 7590,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.7702,0.0277,43.4013,0.010 7593,53.085938,-27.784405,223.525509,-54.460748,1,nan,1.1395,0.0880,44.4511,0.007 7596,358.312500,-44.993881,332.185785,-68.685906,1,nan,1.0214,0.2333,44.1572,0.009 7597,152.050781,2.985506,237.495952,44.143927,1,0.7873,0.8649,0.0311,43.7112,0.019 7599,346.130127,-63.072620,321.423103,-50.042305,1,nan,0.6460,0.0576,42.9341,0.020 7616,151.347656,3.583322,236.252362,43.918627,1,nan,0.6020,0.0567,42.7478,0.015 7651,2.097458,-45.783966,324.737840,-69.478613,1,0.4739,0.4618,0.0214,42.0550,0.011 7655,150.117188,2.836105,236.124718,42.483719,1,nan,0.7487,0.0442,43.3259,0.016 7657,352.398651,-62.696659,318.017427,-51.967966,1,nan,1.1347,0.0316,44.4397,0.020 7667,53.613281,-27.953188,223.929533,-54.024772,1,nan,0.6882,0.0180,43.1017,0.007 7693,349.615387,-63.636005,318.927246,-50.506542,1,0.5467,0.5108,0.1636,42.3168,0.018 7705,347.013428,-62.508568,321.472056,-50.735330,1,0.6672,0.6780,0.0682,43.0621,0.018 7723,33.398438,-4.331149,167.226341,-59.936551,1,nan,0.7023,0.0297,43.1555,0.018 7764,349.429535,-62.508568,320.039643,-51.393745,1,nan,0.5974,0.0081,42.7277,0.020 7771,349.046051,-61.943836,320.796530,-51.753706,1,nan,0.5341,0.0966,42.4333,0.017 7775,349.615387,-63.636005,318.927246,-50.506542,1,0.3416,0.3512,0.0280,41.3526,0.018 7788,346.130127,-63.072620,321.423103,-50.042305,1,nan,1.3320,0.0596,44.8704,0.020 7793,33.925781,-5.979157,170.179895,-60.866303,1,nan,1.1112,0.0841,44.3835,0.022 7806,152.050781,2.985506,237.495952,44.143927,1,0.1438,1.9587,0.6082,45.9018,0.019 7809,358.648071,-46.375080,329.462659,-67.716008,1,nan,0.6914,0.0120,43.1142,0.009 7820,148.710938,2.836105,235.050801,41.328739,1,0.3550,0.2846,0.0753,40.8234,0.031 7830,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.6696,0.0570,43.0291,0.011 
7858,1.363636,-46.768478,324.669342,-68.371416,1,0.4673,0.4927,0.0100,42.2231,0.008 7867,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.6117,0.0088,42.7899,0.010 7873,150.468750,1.641510,237.714575,42.075234,1,0.5708,0.6019,0.0233,42.7474,0.017 7920,358.665253,-45.783966,330.353593,-68.203652,1,nan,0.8342,0.0228,43.6144,0.009 7928,151.171875,1.342993,238.602520,42.464379,1,nan,0.8590,0.0235,43.6927,0.026 7931,33.574219,-6.579593,170.455585,-61.548219,1,nan,0.5225,0.4053,42.3762,0.021 7940,52.910156,-25.944481,220.366350,-54.301439,1,0.6969,0.7104,0.0284,43.1859,0.010 7942,0.190678,-45.783966,327.956322,-68.803772,1,0.2814,0.3115,0.5654,41.0497,0.005 7957,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.9375,0.0504,43.9271,0.011 7977,33.398438,-3.732834,166.492280,-59.466614,1,nan,1.0780,0.0578,44.3020,0.022 7980,358.312500,-44.993881,332.185785,-68.685906,1,nan,1.2456,0.1373,44.6902,0.009 7982,346.276581,-64.011238,320.448031,-49.344136,1,0.4520,0.4716,0.0369,42.1091,0.019 7983,0.949367,-45.586655,326.991548,-69.251686,1,nan,1.0778,0.0319,44.3015,0.013 7984,34.453125,-5.229529,169.987075,-59.956185,1,nan,1.4448,0.2401,45.0886,0.019 7990,1.723404,-45.981140,325.117958,-69.180825,1,nan,0.8379,0.0368,43.6263,0.010 8019,33.574219,-6.579593,170.455585,-61.548219,1,0.2036,0.2032,0.0104,39.9944,0.021 8034,52.207031,-26.610098,221.298836,-55.042928,1,nan,1.5348,0.1021,45.2506,0.014 8036,51.328125,-27.447618,222.535046,-55.950727,1,0.3381,0.5336,0.3006,42.4311,0.013 8037,0.189873,-45.586655,328.254458,-68.969298,1,nan,0.8038,0.0994,43.5152,0.007 8054,52.910156,-25.944481,220.366350,-54.301439,1,nan,0.8669,0.0250,43.7173,0.010 8057,359.058563,-45.191612,330.695783,-68.844915,1,nan,0.7677,0.0225,43.3927,0.011 8058,52.031250,-26.443335,220.963669,-55.168557,1,nan,1.0909,0.0215,44.3339,0.014 8063,150.468750,3.732834,235.392208,43.283244,1,nan,1.0023,0.2042,44.1065,0.020 8094,149.414062,2.238686,236.239766,41.565558,1,nan,0.5647,0.0343,42.5793,0.017 
8102,2.071130,-45.191612,325.606223,-69.989264,1,nan,1.1741,0.0373,44.5314,0.011 8129,359.814819,-44.399834,330.775011,-69.801007,1,0.4569,0.4721,0.2974,42.1119,0.009 8135,0.190678,-45.783966,327.956322,-68.803772,1,0.2615,0.2935,0.0129,40.9008,0.005 8153,350.230255,-61.943836,320.053946,-52.070537,1,0.2924,0.3076,0.0101,41.0184,0.017 8165,33.750000,-4.630479,168.146242,-59.949072,1,nan,1.0355,0.1353,44.1940,0.019 8166,150.292969,2.686724,236.427488,42.541447,1,nan,1.2423,0.1331,44.6832,0.016 8179,52.207031,-28.630989,224.800211,-55.343637,1,nan,0.6201,0.0376,42.8258,0.009 8181,0.965665,-46.375080,325.845907,-68.579427,1,0.6159,0.6147,0.0327,42.8026,0.007 8182,348.586945,-64.573555,318.693903,-49.477869,1,nan,1.0126,0.0872,44.1337,0.018 8196,53.085938,-27.111860,222.384291,-54.355086,1,nan,0.6223,0.0130,42.8354,0.007 8197,33.750000,-4.630479,168.146242,-59.949072,1,nan,0.8678,0.0526,43.7202,0.019 8201,53.261719,-27.615883,223.280041,-54.281374,1,nan,1.0299,0.0450,44.1792,0.006 8242,150.468750,1.641510,237.714575,42.075234,1,nan,0.9643,0.0352,44.0026,0.017 8250,51.855469,-27.953188,223.543603,-55.561470,1,nan,1.3103,0.1902,44.8263,0.008 8252,359.446716,-44.201530,331.730015,-69.805709,1,nan,0.8891,0.1482,43.7851,0.010 8263,32.695312,-4.929937,166.868469,-60.841230,1,0.7023,0.7653,0.0398,43.3844,0.018 8271,53.964844,-28.630989,225.142950,-53.813613,1,nan,1.2974,0.0994,44.7998,0.009 8283,349.160583,-64.760857,318.219706,-49.458924,1,nan,1.1567,0.0332,44.4912,0.020 8284,35.859375,-4.630479,171.270769,-58.580806,1,0.4045,0.4502,0.0352,41.9889,0.022 8315,152.050781,3.284369,237.157374,44.318466,1,0.7654,0.7712,0.0372,43.4049,0.019 8322,150.117188,2.836105,236.124718,42.483719,1,0.0718,0.0803,0.1025,37.8083,0.016 8329,52.031250,-26.443335,220.963669,-55.168557,1,nan,1.5343,0.1326,45.2496,0.014 8331,33.574219,-5.079716,168.448505,-60.407218,1,0.6347,0.6283,0.0176,42.8605,0.016 8332,35.859375,-4.630479,171.270769,-58.580806,1,0.5166,0.5551,0.0652,42.5347,0.022 
8345,359.058563,-45.191612,330.695783,-68.844915,1,0.5651,0.5479,0.0133,42.5004,0.011 8361,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.9078,0.0244,43.8409,0.009 8386,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.9630,0.0958,43.9990,0.010 8388,33.750000,-4.630479,168.146242,-59.949072,1,nan,1.0142,0.0893,44.1381,0.019 8399,1.363636,-46.768478,324.669342,-68.371416,1,0.6489,0.6460,0.0193,42.9339,0.008 8401,152.050781,2.985506,237.495952,44.143927,1,0.5695,0.5583,0.0070,42.5495,0.019 8404,52.910156,-25.944481,220.366350,-54.301439,1,nan,0.9683,0.0275,44.0136,0.010 8413,349.966217,-62.696659,319.542989,-51.376556,1,nan,0.9296,0.2134,43.9042,0.021 8420,359.811707,-45.191612,329.485675,-69.150905,1,nan,1.2904,0.0367,44.7852,0.010 8430,51.855469,-26.276812,220.627031,-55.293792,1,nan,1.3057,0.0606,44.8168,0.014 8441,32.695312,-4.929937,166.868469,-60.841230,1,nan,0.6746,0.0423,43.0487,0.018 8442,351.953644,-62.132156,318.777388,-52.347124,1,nan,0.0000,0.0000,nan,0.019 8455,151.347656,3.583322,236.252362,43.918627,1,nan,1.0052,0.0667,44.1141,0.015 8463,150.644531,3.583322,235.698235,43.342784,1,nan,0.9599,0.0304,43.9903,0.018 8469,348.586945,-64.573555,318.693903,-49.477869,1,nan,0.7521,0.0183,43.3379,0.018 8487,53.613281,-26.944359,222.237403,-53.863858,1,nan,0.6150,0.0095,42.8042,0.009 8492,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.9894,0.1246,44.0715,0.006 8512,52.558594,-27.279613,222.538937,-54.845107,1,0.8384,0.8562,0.0216,43.6841,0.008 8513,33.574219,-5.079716,168.448505,-60.407218,1,0.3943,0.4243,0.0320,41.8362,0.016 8521,33.750000,-4.630479,168.146242,-59.949072,1,0.4536,0.4521,0.0212,41.9996,0.019 8534,349.285706,-62.884678,319.786163,-51.046461,1,nan,0.8932,0.0144,43.7973,0.018 8548,151.347656,3.583322,236.252362,43.918627,1,0.4318,0.5090,0.0311,42.3075,0.015 8549,358.636353,-46.768478,328.890146,-67.388837,1,nan,0.7646,0.0436,43.3818,0.008 8553,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.4066,1.0187,41.7265,0.021 
8565,359.058563,-45.191612,330.695783,-68.844915,1,0.6011,0.5779,0.1187,42.6400,0.011 8592,53.613281,-27.953188,223.929533,-54.024772,1,nan,0.8871,0.0176,43.7788,0.007 8619,53.789062,-27.784405,223.685697,-53.845803,1,nan,1.4192,0.1271,45.0406,0.009 8625,351.734680,-62.884678,318.284128,-51.651217,1,0.3067,0.3063,0.0081,41.0077,0.019 8644,52.207031,-28.630989,224.800211,-55.343637,1,0.6866,0.7308,0.0554,43.2613,0.009 8646,1.694561,-45.191612,326.278557,-69.858253,1,nan,1.3300,0.1125,44.8664,0.011 8661,349.046051,-61.943836,320.796530,-51.753706,1,nan,0.9744,0.0308,44.0307,0.017 8664,52.558594,-27.279613,222.538937,-54.845107,1,nan,0.5444,0.0180,42.4836,0.008 8665,151.171875,2.537361,237.288526,43.169764,1,nan,1.2262,0.0861,44.6480,0.024 8672,35.683594,-5.379379,171.992947,-59.253501,1,nan,1.1123,0.0767,44.3861,0.020 8674,53.789062,-27.784405,223.685697,-53.845803,1,0.9353,0.7665,0.0891,43.3883,0.009 8682,349.046051,-61.943836,320.796530,-51.753706,1,nan,1.0543,0.0550,44.2423,0.017 8683,52.910156,-27.953188,223.774083,-54.639214,1,0.6644,0.6624,0.0227,43.0003,0.007 8684,51.855469,-27.953188,223.543603,-55.561470,1,nan,0.8518,0.0200,43.6702,0.008 8691,351.734680,-62.884678,318.284128,-51.651217,1,nan,0.6565,0.0183,42.9769,0.019 8697,351.734680,-62.884678,318.284128,-51.651217,1,nan,1.4889,0.0787,45.1691,0.019 8701,347.812500,-63.448284,320.128971,-50.202348,1,0.2506,0.2395,0.0127,40.3969,0.021 8702,1.753247,-46.768478,324.030235,-68.498041,1,nan,1.2922,0.0134,44.7890,0.014 8705,150.820312,3.732834,235.666318,43.572109,1,nan,0.9669,0.0557,44.0099,0.016 8717,52.207031,-28.291550,224.208534,-55.300157,1,nan,1.4656,0.3245,45.1269,0.007 8724,2.071130,-45.191612,325.606223,-69.989264,1,0.2852,0.2994,0.0072,40.9502,0.011 8730,348.586945,-64.573555,318.693903,-49.477869,1,0.4718,2.9626,1.3143,46.9933,0.018 8734,348.586945,-64.573555,318.693903,-49.477869,1,0.4341,0.4524,0.0446,42.0014,0.018 8738,52.910156,-26.276812,220.926149,-54.363918,1,nan,1.0394,0.1965,44.2041,0.008 
8739,53.789062,-27.784405,223.685697,-53.845803,1,nan,1.1723,0.2076,44.5273,0.009 8767,150.820312,3.134927,236.341348,43.230123,1,nan,0.6681,0.0088,43.0230,0.016 8803,2.097458,-45.783966,324.737840,-69.478613,1,nan,1.0540,0.2342,44.2414,0.011 8805,0.965665,-46.375080,325.845907,-68.579427,1,nan,1.1091,0.1745,44.3785,0.007 8806,352.398651,-62.696659,318.017427,-51.967966,1,nan,1.3680,0.2494,44.9420,0.020 8812,151.347656,4.181528,235.568369,44.259942,1,nan,1.2164,0.1126,44.6264,0.016 8816,149.414062,3.433834,234.919132,42.245550,1,nan,0.8846,0.0380,43.7713,0.027 8821,35.859375,-4.630479,171.270769,-58.580806,1,nan,0.7988,0.0369,43.4987,0.022 8837,351.299988,-62.320400,319.038597,-52.026867,1,nan,0.5320,0.0097,42.4234,0.018 8850,346.562500,-63.448284,320.824720,-49.866957,1,nan,1.0271,0.0826,44.1720,0.021 8857,150.117188,2.238686,236.784618,42.139082,1,nan,1.0907,0.1270,44.3334,0.016 8861,351.321442,-64.198746,317.458993,-50.429931,1,nan,0.3476,1.1210,41.3264,0.023 8871,32.695312,-4.929937,166.868469,-60.841230,1,nan,1.8695,0.2279,45.7776,0.018 8883,359.446716,-44.201530,331.730015,-69.805709,1,nan,1.3015,0.2137,44.8082,0.010 8902,151.171875,2.238686,237.619933,42.994783,1,nan,0.4347,0.0093,41.8984,0.024 8904,347.812500,-63.448284,320.128971,-50.202348,1,nan,1.1116,0.0337,44.3843,0.021 8917,34.101562,-5.829153,170.247753,-60.638325,1,0.5976,0.5969,0.0088,42.7253,0.019 8919,51.679688,-27.447618,222.618229,-55.642263,1,nan,0.9152,0.0404,43.8626,0.010 8921,150.644531,3.583322,235.698235,43.342784,1,0.5296,0.5248,0.0115,42.3874,0.018 8933,349.046051,-61.943836,320.796530,-51.753706,1,nan,0.0000,0.0000,nan,0.017 8947,33.574219,-5.379379,168.838090,-60.637536,1,0.3757,2.9135,0.9540,46.9496,0.017 8962,2.071130,-45.191612,325.606223,-69.989264,1,nan,1.2366,0.1848,44.6708,0.011 8978,348.529419,-61.755440,321.293980,-51.763351,1,nan,0.5523,0.1275,42.5212,0.016 8983,53.085938,-28.122234,224.100909,-54.509752,1,0.4014,0.3910,0.0331,41.6261,0.007 
8987,0.574468,-45.981140,327.041068,-68.778764,1,nan,1.0629,0.0133,44.2641,0.006 9033,0.190678,-45.783966,327.956322,-68.803772,1,0.3132,0.2593,0.0293,40.5923,0.005 9041,347.617462,-62.508568,321.121462,-50.904708,1,nan,1.0424,0.0183,44.2118,0.019 9051,358.665253,-45.783966,330.353593,-68.203652,1,nan,0.9828,0.0093,44.0536,0.009 9053,1.708861,-45.586655,325.688716,-69.520253,1,nan,0.5661,0.0926,42.5860,0.011 9080,33.925781,-5.979157,170.179895,-60.866303,1,nan,0.8653,0.0487,43.7124,0.022 9084,52.910156,-26.276812,220.926149,-54.363918,1,nan,0.8458,0.0301,43.6512,0.008 9099,352.398651,-62.696659,318.017427,-51.967966,1,nan,0.5841,0.0141,42.6684,0.020 9107,150.468750,1.641510,237.714575,42.075234,1,0.4687,0.5719,0.2556,42.6127,0.017 9110,0.929752,-44.597992,328.531426,-70.083244,1,0.2531,0.2978,0.0321,40.9371,0.011 9115,53.261719,-27.615883,223.280041,-54.281374,1,nan,0.5434,0.6581,42.4785,0.006 9124,34.277344,-5.679190,170.314930,-60.410322,1,nan,1.5382,0.2744,45.2564,0.020 9138,352.132874,-63.636005,317.424173,-51.095855,1,0.2703,0.2829,0.0150,40.8088,0.021 9165,349.429535,-62.508568,320.039643,-51.393745,1,nan,1.1058,0.0670,44.3703,0.020 9167,346.130127,-63.072620,321.423103,-50.042305,1,nan,1.1829,0.0376,44.5516,0.020 9170,346.500000,-62.320400,321.951129,-50.736054,1,nan,0.5880,0.0130,42.6856,0.020 9187,35.859375,-4.630479,171.270769,-58.580806,1,nan,0.3500,0.0275,41.3440,0.022 9197,2.071130,-45.191612,325.606223,-69.989264,1,nan,1.3572,0.2236,44.9207,0.011 9209,150.644531,3.583322,235.698235,43.342784,1,nan,0.5872,0.0397,42.6820,0.018 9210,359.415588,-46.768478,327.729895,-67.686097,1,nan,1.2507,0.0994,44.7013,0.009 9216,359.811707,-45.191612,329.485675,-69.150905,1,0.9399,1.5923,0.1936,45.3491,0.010 9220,52.207031,-28.630989,224.800211,-55.343637,1,nan,0.6692,0.0244,43.0276,0.009 9229,52.910156,-27.953188,223.774083,-54.639214,1,0.3075,0.3138,0.0076,41.0683,0.007 9237,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.5799,0.2003,42.6494,0.011 
9277,149.414062,1.940072,236.565366,41.393323,1,nan,0.7257,0.0452,43.2426,0.018 9293,151.171875,2.238686,237.619933,42.994783,1,0.5639,0.5209,0.5082,42.3682,0.024 9302,351.321442,-64.198746,317.458993,-50.429931,1,nan,1.3310,0.0582,44.8683,0.023 9303,34.453125,-5.229529,169.987075,-59.956185,1,nan,1.0260,0.0163,44.1691,0.019 9316,33.574219,-5.379379,168.838090,-60.637536,1,nan,0.6163,0.0159,42.8095,0.017 9322,33.574219,-5.079716,168.448505,-60.407218,1,nan,1.3730,0.1469,44.9518,0.016 9346,351.321442,-64.198746,317.458993,-50.429931,1,nan,1.3099,0.0995,44.8254,0.023 9362,53.437500,-29.142223,225.908120,-54.336118,1,nan,0.2692,0.0170,40.6858,0.008 9365,34.277344,-5.679190,170.314930,-60.410322,1,0.2935,0.2933,0.0217,40.8992,0.020 9374,53.085938,-27.111860,222.384291,-54.355086,1,nan,0.7189,0.0162,43.2179,0.007 9380,346.655182,-63.260487,320.952196,-50.040935,1,nan,0.7849,0.0428,43.4518,0.019 9408,346.500000,-62.320400,321.951129,-50.736054,1,nan,0.9664,0.0147,44.0085,0.020 9423,351.382965,-64.011238,317.574052,-50.604657,1,nan,0.5558,0.0329,42.5379,0.023 9444,349.046051,-61.943836,320.796530,-51.753706,1,nan,1.1578,0.0579,44.4939,0.017 9451,351.382965,-64.011238,317.574052,-50.604657,1,nan,0.9855,0.0096,44.0612,0.023 9461,358.665253,-45.783966,330.353593,-68.203652,1,nan,1.1498,0.0929,44.4752,0.009 9468,51.855469,-26.276812,220.627031,-55.293792,1,nan,0.7969,0.0460,43.4923,0.014 9469,32.871094,-4.780192,166.959493,-60.615132,1,0.8180,0.7700,0.0220,43.4006,0.017 9482,53.085938,-28.122234,224.100909,-54.509752,1,nan,0.7314,0.0368,43.2634,0.007 9497,51.328125,-27.784405,223.130589,-55.999499,1,0.4664,0.4234,0.0109,41.8306,0.013 9507,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.7375,0.0178,43.2857,0.010 9550,346.130127,-63.072620,321.423103,-50.042305,1,0.6995,0.7070,0.0097,43.1732,0.020 9554,51.679688,-27.447618,222.618229,-55.642263,1,0.6646,0.6800,0.0238,43.0701,0.010 9562,349.160583,-64.760857,318.219706,-49.458924,1,nan,0.3891,0.0546,41.6136,0.020 
9566,52.910156,-25.944481,220.366350,-54.301439,1,nan,0.8552,0.0311,43.6808,0.010 9588,51.855469,-28.630989,224.733260,-55.649872,1,nan,0.8782,0.0321,43.7520,0.009 9597,53.964844,-28.630989,225.142950,-53.813613,1,nan,0.8452,0.2344,43.6493,0.009 9603,349.046051,-61.943836,320.796530,-51.753706,1,nan,1.6743,0.3261,45.4832,0.017 9626,150.644531,3.583322,235.698235,43.342784,1,nan,1.0665,0.0241,44.2730,0.018 9652,53.613281,-28.630989,225.073365,-54.119461,1,nan,0.6624,0.0296,43.0006,0.006 9662,53.613281,-26.944359,222.237403,-53.863858,1,nan,1.4334,0.0635,45.0673,0.009 9676,1.753247,-46.768478,324.030235,-68.498041,1,0.6764,0.7221,0.0336,43.2295,0.014 9678,151.523438,3.134927,236.900695,43.803170,1,nan,1.1508,0.7055,44.4777,0.019 9699,2.097458,-45.783966,324.737840,-69.478613,1,nan,0.6400,0.1023,42.9095,0.011 9705,352.711273,-63.823658,316.922299,-51.059403,1,0.5977,0.5685,0.0384,42.5972,0.024 9720,351.734680,-62.884678,318.284128,-51.651217,1,0.6071,0.5353,0.0256,42.4395,0.019 9725,350.230255,-61.943836,320.053946,-52.070537,1,nan,1.2775,0.0723,44.7581,0.017 9726,358.312500,-44.993881,332.185785,-68.685906,1,nan,0.7580,0.0243,43.3588,0.009 9761,52.207031,-28.291550,224.208534,-55.300157,1,nan,1.4229,0.1718,45.0476,0.007 9772,53.613281,-26.944359,222.237403,-53.863858,1,nan,0.5526,0.0238,42.5224,0.009 9776,351.953644,-62.132156,318.777388,-52.347124,1,0.4229,0.4114,0.0259,41.7568,0.019 9782,149.589844,3.583322,234.885369,42.474696,1,nan,1.1776,0.1586,44.5394,0.024 9799,346.276581,-64.011238,320.448031,-49.344136,1,nan,0.9878,0.3118,44.0673,0.019 9806,0.965665,-46.375080,325.845907,-68.579427,1,0.4619,0.5359,0.0762,42.4424,0.007 9809,347.812500,-63.448284,320.128971,-50.202348,1,nan,0.6301,0.0342,42.8682,0.021 9839,53.613281,-27.953188,223.929533,-54.024772,1,nan,1.5628,0.1334,45.2989,0.007 9864,0.929752,-44.597992,328.531426,-70.083244,1,nan,0.7718,0.0734,43.4067,0.011 9865,150.117188,3.732834,235.120533,42.993809,1,0.4651,2.3854,0.8502,46.4240,0.020 
9868,54.667969,-27.615883,223.610785,-53.050840,1,nan,0.7611,0.0247,43.3696,0.009 9884,51.328125,-27.447618,222.535046,-55.950727,1,0.6206,0.6001,0.0161,42.7395,0.013 9893,33.750000,-4.630479,168.146242,-59.949072,1,0.4883,0.5053,0.0956,42.2888,0.019 9916,349.891296,-64.573555,317.972107,-49.786192,1,nan,0.6122,0.0358,42.7920,0.023 9924,349.285706,-62.884678,319.786163,-51.046461,1,0.7986,0.8711,0.0680,43.7303,0.018 9950,51.328125,-27.447618,222.535046,-55.950727,1,nan,0.5462,0.0962,42.4921,0.013 9951,346.276581,-64.011238,320.448031,-49.344136,1,0.8417,0.7427,0.0547,43.3044,0.019 9961,53.789062,-27.784405,223.685697,-53.845803,1,nan,1.0492,0.2680,44.2292,0.009 9967,150.292969,2.686724,236.427488,42.541447,1,nan,1.4395,0.2623,45.0786,0.016 9989,346.130127,-63.072620,321.423103,-50.042305,1,nan,0.9517,0.2190,43.9673,0.020 9998,2.457983,-45.389202,324.632685,-69.945696,1,nan,0.5127,0.0376,42.3265,0.011 10021,52.207031,-28.630989,224.800211,-55.343637,1,nan,1.4337,0.0978,45.0679,0.009 10056,52.558594,-27.279613,222.538937,-54.845107,1,0.4511,0.3564,0.0517,41.3900,0.008 10071,0.949367,-45.586655,326.991548,-69.251686,1,0.3219,0.4590,0.0597,42.0387,0.013 10102,359.058563,-45.191612,330.695783,-68.844915,1,nan,0.7450,0.0248,43.3128,0.011 10103,359.805206,-46.768478,327.135979,-67.829903,1,nan,0.8130,0.0375,43.5455,0.011 10110,54.667969,-27.615883,223.610785,-53.050840,1,0.7638,0.8007,0.0367,43.5050,0.009 10119,351.953644,-62.132156,318.777388,-52.347124,1,0.6209,0.6434,0.0311,42.9234,0.019 10133,352.398651,-62.696659,318.017427,-51.967966,1,nan,0.9030,0.0216,43.8266,0.020 10139,350.230255,-61.943836,320.053946,-52.070537,1,nan,1.3210,0.0562,44.8481,0.017 10146,346.500000,-62.320400,321.951129,-50.736054,1,nan,0.5522,0.0066,42.5205,0.020 10150,359.814819,-44.399834,330.775011,-69.801007,1,0.4395,2.7379,0.4044,46.7868,0.009 10155,0.589520,-47.161343,325.385896,-67.769893,1,nan,1.0100,0.2301,44.1270,0.009 
10180,359.811707,-45.191612,329.485675,-69.150905,1,nan,0.5829,0.0112,42.6631,0.010 10213,359.058563,-45.191612,330.695783,-68.844915,1,nan,0.5634,0.0192,42.5732,0.011 10218,52.558594,-27.279613,222.538937,-54.845107,1,0.3131,0.3414,0.0364,41.2809,0.008 10228,151.171875,1.342993,238.602520,42.464379,1,nan,0.7524,0.0449,43.3390,0.026 10244,53.789062,-27.784405,223.685697,-53.845803,1,nan,0.7687,0.0207,43.3962,0.009 ================================================ FILE: examples/data/plasticc_training_set_1k.csv ================================================ object_id,mjd,passband,flux,flux_err,detected 615,59750.4229,2,-544.810303,3.622952,1 615,59750.4306,1,-816.434326,5.553370,1 615,59750.4383,3,-471.385529,3.801213,1 615,59750.4450,4,-388.984985,11.395031,1 615,59752.4070,2,-681.858887,4.041204,1 615,59752.4147,1,-1061.457031,6.472994,1 615,59752.4224,3,-524.954590,3.552751,1 615,59752.4334,4,-393.480225,3.599346,1 615,59752.4435,5,-355.886780,10.421921,1 615,59767.2968,2,-548.013550,3.462291,1 615,59767.3045,1,-815.188599,5.293019,1 615,59767.3122,3,-475.516052,3.340643,1 615,59767.3233,4,-405.663818,3.496113,1 615,59767.3343,5,-421.199066,6.377517,1 615,59770.2179,2,-554.903198,3.927843,1 615,59770.2256,1,-820.042786,5.875329,1 615,59770.2334,3,-477.004730,3.736262,1 615,59770.2445,4,-400.270386,3.834955,1 615,59770.2557,5,-415.286896,7.435979,1 615,59779.3188,2,-630.523682,4.333287,1 615,59779.3265,1,-921.002502,6.306800,1 615,59779.3342,3,-518.533997,3.915225,1 615,59779.3452,4,-422.184509,4.089213,1 615,59779.3562,5,-422.815094,8.124096,1 615,59782.1897,2,-280.039520,2.819228,1 615,59782.1974,1,-449.095612,4.028310,1 615,59782.2051,3,-316.704865,3.491153,1 615,59782.2162,4,-332.885437,4.021619,1 615,59782.2274,5,-365.075775,8.514805,1 615,59797.2861,2,391.399231,3.098059,1 615,59797.2938,1,35.511822,3.163646,1 615,59797.3015,3,330.623901,3.388776,1 615,59797.3126,4,360.397858,3.980607,1 615,59797.3237,5,369.439667,8.207490,1 
615,59800.3168,2,168.739899,3.128495,1 615,59800.3244,1,129.541901,4.358776,1 615,59800.3320,3,30.120724,3.396606,1 615,59800.3429,4,-60.942333,3.704243,1 615,59800.3539,5,-128.920334,7.495701,1 615,59807.1738,2,-256.660980,2.781354,1 615,59807.1815,1,-420.796417,4.037735,1 615,59807.1892,3,-298.936859,3.041390,1 615,59807.2003,4,-311.977783,3.318007,1 615,59807.2114,5,-344.536072,6.367201,1 615,59810.1045,2,-342.819763,2.808321,1 615,59810.1122,1,-527.020325,4.204173,1 615,59810.1200,3,-363.282532,3.237536,1 615,59810.1311,4,-348.628662,3.774855,1 615,59810.1422,5,-391.271271,7.657067,1 615,59813.1044,2,-678.045715,4.032819,1 615,59813.1122,1,-1100.440063,6.709106,1 615,59813.1199,3,-506.687408,3.547398,1 615,59813.1310,4,-304.049713,3.590496,1 615,59813.1422,5,-187.285919,6.984019,1 615,59819.1532,0,6.878784,3.633152,0 615,59820.1047,0,39.364853,3.775619,1 615,59821.1026,0,-10.422381,4.172683,0 615,59822.1105,0,-65.485130,4.362876,1 615,59823.1505,0,-113.349159,4.069051,1 615,59835.0600,2,-54.949490,2.575779,1 615,59835.0678,1,-178.149399,3.809858,1 615,59835.0755,3,-140.818436,3.149077,1 615,59835.0866,4,-200.294128,3.578005,1 615,59835.0978,5,-263.578430,6.954262,1 615,59839.0306,2,-639.035950,3.928531,1 615,59839.0384,1,-953.883728,6.035410,1 615,59839.0461,3,-518.293274,3.595869,1 615,59839.0573,4,-418.723907,3.536483,1 615,59839.0684,5,-418.799927,6.580595,1 615,59842.0207,2,-502.215332,3.348443,1 615,59842.0285,1,-1003.971497,6.256784,1 615,59842.0362,3,-233.167755,2.872840,1 615,59842.0473,4,111.507675,3.229112,1 615,59842.0585,5,206.425323,6.615822,1 615,59851.1114,0,-68.502457,3.338555,1 615,59854.0796,2,459.452667,3.336711,1 615,59854.0873,1,217.894211,3.332742,1 615,59854.0950,3,361.023438,3.237847,1 615,59854.1061,4,374.446442,3.622074,1 615,59854.1172,5,370.346283,6.789766,1 615,59857.0453,2,599.812195,4.121032,1 615,59857.0531,1,646.523193,5.291624,1 615,59857.0608,3,354.961365,3.407785,1 615,59857.0719,4,293.879608,3.581862,1 
615,59857.0830,5,232.535995,6.761845,1 615,59864.0162,2,-637.105347,3.818432,1 615,59864.0239,1,-942.167908,5.916004,1 615,59864.0316,3,-524.586548,3.538931,1 615,59864.0428,4,-414.447723,3.635253,1 615,59864.0539,5,-408.089233,7.119730,1 615,59867.0178,2,-332.763123,2.872951,1 615,59867.0255,1,-910.677734,5.852032,1 615,59867.0332,3,-62.065010,3.073413,1 615,59867.0443,4,202.288223,4.026751,1 615,59867.0554,5,270.584869,10.321785,1 615,59870.0194,2,604.344543,3.901750,1 615,59870.0272,1,659.486694,4.991051,1 615,59870.0349,3,373.986511,3.294667,1 615,59870.0459,4,322.604034,3.716555,1 615,59870.0571,5,263.481476,7.193131,1 615,59873.0212,2,4.656033,2.130510,0 615,59873.0289,1,-98.796974,3.034533,1 615,59873.0366,3,-93.732880,2.499724,1 615,59873.0477,4,-165.793457,2.906058,1 615,59873.0588,5,-233.501724,6.087882,1 615,59874.0599,0,-97.353195,3.133990,1 615,59875.0311,0,-97.523880,2.963075,1 615,59876.0231,0,-108.672577,3.449714,1 615,59877.0238,0,-116.913223,3.097836,1 615,59878.0246,0,-102.768921,3.135772,1 615,59879.0248,0,-52.407089,3.261559,1 615,59880.0258,0,55.567715,3.355268,1 615,59884.0823,2,-274.711029,2.572093,1 615,59884.0900,1,-437.425110,3.831595,1 615,59884.0976,3,-310.010925,2.957125,1 615,59884.1085,4,-317.630920,3.213168,1 615,59884.1195,5,-351.278198,6.231324,1 615,59887.0298,2,-491.146423,3.592675,1 615,59887.0375,1,-743.267212,5.624708,1 615,59887.0451,3,-449.714752,3.728483,1 615,59887.0562,4,-393.971649,3.532816,1 615,59887.0673,5,-406.549103,6.648589,1 615,60118.4163,0,-107.080536,3.102513,1 615,60124.2541,2,-588.397949,4.176047,1 615,60124.2618,1,-878.043396,6.210247,1 615,60124.2695,3,-495.472015,4.011444,1 615,60124.2807,4,-417.145325,4.094360,1 615,60124.2918,5,-413.673431,7.793959,1 615,60140.2290,0,-88.981155,3.468430,1 615,60141.2225,0,-50.179337,4.734193,1 615,60142.2202,0,50.008640,4.636651,1 615,60143.2212,0,110.753555,4.380840,1 615,60144.2186,0,120.867218,4.103332,1 615,60145.2123,0,111.464226,4.367030,1 
615,60153.2274,2,-322.420471,2.833071,1 615,60153.2351,1,-917.875488,5.951387,1 615,60153.2428,3,-52.056461,2.698249,1 615,60153.2539,4,205.180893,3.169676,1 615,60153.2650,5,269.709167,6.682271,1 615,60162.1477,2,31.499735,2.753767,1 615,60162.1554,1,-62.120552,3.818642,1 615,60162.1631,3,-72.958771,3.515574,1 615,60162.1742,4,-151.126511,3.838288,1 615,60162.1853,5,-216.914032,8.832489,1 615,60165.1369,2,-568.408875,3.553168,1 615,60165.1446,1,-836.233154,5.405759,1 615,60165.1524,3,-483.071381,3.387615,1 615,60165.1635,4,-409.470642,3.541994,1 615,60165.1746,5,-412.820221,6.907444,1 615,60168.1260,2,-628.321350,3.804775,1 615,60168.1337,1,-1077.347900,6.591075,1 615,60168.1414,3,-421.859406,3.336656,1 615,60168.1525,4,-93.729095,3.211201,1 615,60168.1637,5,31.207939,6.615005,0 615,60176.1332,0,-49.905262,3.874426,1 615,60177.1370,0,-87.160583,4.078375,1 615,60181.3147,2,-180.729568,2.498579,1 615,60181.3223,1,-339.875153,3.626661,1 615,60181.3299,3,-249.205673,3.280824,1 615,60181.3409,4,-275.762329,4.694962,1 615,60181.3518,5,-330.891327,9.580047,1 615,60184.3625,2,-555.853943,3.979171,1 615,60184.3701,1,-1028.441528,6.719577,1 615,60184.3777,3,-306.200500,3.557627,1 615,60184.3887,4,49.555847,3.627351,1 615,60184.3996,5,154.876785,7.988054,1 615,60194.1575,2,469.654999,3.386857,1 615,60194.1652,1,276.757751,3.594162,1 615,60194.1729,3,374.669556,3.437137,1 615,60194.1839,4,374.948822,4.267094,1 615,60194.1926,5,363.130493,12.845472,1 615,60197.1181,2,607.786804,3.960346,1 615,60197.1258,1,650.984314,4.970811,1 615,60197.1335,3,365.408752,3.197298,1 615,60197.1446,4,305.330750,3.360043,1 615,60197.1557,5,256.966217,6.443069,1 615,60198.1077,0,100.129280,4.266314,1 615,60199.0914,0,86.776741,4.679742,1 615,60200.0650,0,82.078186,4.342434,1 615,60201.0680,0,41.947815,4.467065,1 615,60202.0552,0,9.061676,3.831397,0 615,60206.1107,0,-83.072884,3.130236,1 615,60207.1469,0,108.483109,4.458607,1 615,60208.0229,2,-672.681335,4.138056,1 
615,60208.0307,1,-1094.027588,6.717340,1 615,60208.0384,3,-503.870422,3.665424,1 615,60208.0495,4,-284.747498,3.666287,1 615,60208.0606,5,-176.409851,8.308295,1 615,60211.0124,2,208.281052,3.229784,1 615,60211.0202,1,-370.189575,4.525907,1 615,60211.0279,3,269.200806,3.688238,1 615,60211.0390,4,326.272308,4.424663,1 615,60211.0502,5,358.320099,10.152412,1 615,60221.0153,2,-648.682312,3.906177,1 615,60221.0230,1,-1086.777710,6.620100,1 615,60221.0308,3,-455.588196,3.295532,1 615,60221.0419,4,-145.305023,3.087424,1 615,60221.0530,5,2.939076,6.798505,0 615,60224.0140,2,491.748383,3.509416,1 615,60224.0217,1,346.335083,3.835582,1 615,60224.0294,3,384.185303,3.529593,1 615,60224.0405,4,381.953735,3.885009,1 615,60224.0516,5,378.118225,7.311360,1 615,60227.0151,2,341.057709,2.940147,1 615,60227.0228,1,356.632690,3.856145,1 615,60227.0305,3,153.004929,2.847803,1 615,60227.0416,4,52.912033,3.109148,1 615,60227.0527,5,-19.384567,6.713308,0 615,60228.0187,0,6.768485,4.174600,0 615,60229.0162,0,-35.149330,4.086384,1 615,60234.0265,0,-52.922794,3.681808,1 615,60237.2206,2,-676.669189,4.009161,1 615,60237.2283,1,-1098.651489,6.689435,1 615,60237.2359,3,-511.148254,3.546333,1 615,60237.2468,4,-347.090027,3.624199,1 615,60237.2578,5,-240.316895,8.035271,1 615,60240.0223,2,85.162651,2.829378,1 615,60240.0300,1,14.526012,3.783879,0 615,60240.0377,3,-24.350578,3.397041,1 615,60240.0488,4,-111.062698,3.698180,1 615,60240.0598,5,-180.234787,6.894514,1 615,60249.0338,2,611.984558,3.908728,1 615,60249.0415,1,660.626343,4.961018,1 615,60249.0492,3,386.311920,3.240422,1 615,60249.0602,4,325.401184,3.454910,1 615,60249.0712,5,280.721069,6.623785,1 615,60260.0423,0,108.020546,4.337497,1 615,60261.0361,0,125.182808,3.909554,1 615,60262.0367,0,107.649780,3.796611,1 615,60263.0373,0,61.068066,3.877589,1 615,60264.0465,0,-9.100937,3.518127,0 615,60490.2647,2,-408.570984,3.169784,1 615,60490.2725,1,-624.518799,4.704853,1 615,60490.2802,3,-405.614258,3.513195,1 
615,60490.2913,4,-371.286377,3.985296,1 615,60490.3024,5,-395.406128,8.139952,1 615,60493.2372,2,-680.489441,4.065931,1 615,60493.2450,1,-1031.102905,6.378702,1 615,60493.2527,3,-530.644592,3.672556,1 615,60493.2639,4,-406.733521,3.772714,1 615,60493.2750,5,-358.876160,7.310321,1 615,60499.2467,0,106.447296,4.481476,1 615,60500.2437,0,67.234062,4.020935,1 615,60501.2385,0,24.868933,4.027500,1 615,60502.2355,0,-15.392517,4.142292,0 615,60508.2638,2,365.607056,3.696270,1 615,60508.2715,1,-32.986282,4.440859,0 615,60508.2792,3,319.249847,3.828632,1 615,60508.2903,4,360.507599,4.336362,1 615,60508.3014,5,370.305267,8.601955,1 615,60524.2390,0,89.070496,3.901179,1 615,60525.1736,0,118.935989,4.737393,1 615,60532.3489,2,510.690094,3.489832,1 615,60532.3565,1,566.281433,4.607503,1 615,60532.3641,3,271.663910,3.007311,1 615,60532.3751,4,204.409866,3.353202,1 615,60532.3860,5,128.521851,7.134325,1 615,60535.1253,2,-664.729675,4.282414,1 615,60535.1330,1,-1084.891113,6.952323,1 615,60535.1408,3,-488.010925,3.928481,1 615,60535.1519,4,-222.254257,4.034600,1 615,60535.1630,5,-85.524307,8.625449,1 615,60538.2351,2,113.021248,2.712380,1 615,60538.2428,1,51.060081,3.762334,1 615,60538.2505,3,-4.268328,2.895656,0 615,60538.2615,4,-96.020035,3.141703,1 615,60538.2725,5,-175.912643,6.308159,1 615,60546.3406,2,178.624359,2.905459,1 615,60546.3482,1,142.089966,4.065646,1 615,60546.3558,3,41.418739,3.163731,1 615,60546.3668,4,-52.460590,3.784039,1 615,60546.3777,5,-112.286079,8.527776,1 615,60549.0879,2,-629.010254,3.867215,1 615,60549.0956,1,-1076.652100,6.604701,1 615,60549.1034,3,-435.558533,3.417534,1 615,60549.1145,4,-111.499573,3.424588,1 615,60549.1256,5,30.267401,7.478198,0 615,60554.0964,0,82.168922,4.318140,1 615,60555.0951,0,49.886921,3.917516,1 615,60556.0879,0,9.075453,4.103900,0 615,60557.0831,0,-30.764908,3.555157,1 615,60558.1093,0,-101.419899,3.653430,1 615,60559.1097,0,-110.688477,3.426444,1 615,60560.1065,0,-114.774445,4.013463,1 
615,60567.2821,2,-447.681580,3.368270,1 615,60567.2897,1,-972.201111,6.347886,1 615,60567.2973,3,-176.163651,2.960412,1 615,60567.3083,4,140.860611,3.302721,1 615,60567.3192,5,228.033112,6.797573,1 615,60574.1118,2,-143.843872,2.547544,1 615,60574.1195,1,-812.792908,5.570116,1 615,60574.1272,3,86.606873,2.966459,1 615,60574.1383,4,257.570221,3.575394,1 615,60574.1493,5,302.167328,7.100554,1 615,60577.0186,2,-425.988464,3.239578,1 615,60577.0263,1,-963.216980,6.134610,1 615,60577.0340,3,-148.178238,3.247297,1 615,60577.0451,4,161.872543,4.228243,1 615,60577.0563,5,238.576889,9.461221,1 615,60580.0095,2,586.178345,3.996895,1 615,60580.0173,1,655.284058,5.148244,1 615,60580.0250,3,445.737061,3.952905,1 615,60580.0361,4,361.595764,4.508256,1 615,60580.0472,5,328.836731,9.460338,1 615,60582.0840,0,-51.614189,3.517908,1 615,60583.0169,0,20.364273,4.460314,0 615,60584.0117,0,-24.682575,3.866380,1 615,60585.0117,0,-63.546600,3.497667,1 615,60586.0123,0,-101.819290,3.383004,1 615,60587.0127,0,-110.978699,3.555624,1 615,60588.0131,0,-113.588432,3.241369,1 615,60593.0636,2,226.696259,2.514855,1 615,60593.0713,1,205.029755,3.258004,1 615,60593.0790,3,73.384720,2.584785,1 615,60593.0901,4,-19.212976,2.899512,0 615,60593.1012,5,-83.394951,6.073453,1 615,60596.0304,2,-224.917938,3.388916,1 615,60596.0381,1,-388.231476,4.931039,1 615,60596.0458,3,-274.108429,3.720238,1 615,60596.0569,4,-292.558990,4.188871,1 615,60596.0680,5,-354.074280,8.392479,1 615,60603.0208,2,404.391388,3.043772,1 615,60603.0286,1,70.494507,3.060846,1 615,60603.0363,3,338.994537,3.051842,1 615,60603.0473,4,362.888550,3.381572,1 615,60603.0584,5,378.188141,6.295821,1 615,60606.0225,2,422.610779,3.198191,1 615,60606.0303,1,457.502197,4.173640,1 615,60606.0379,3,205.937546,2.957614,1 615,60606.0490,4,123.048210,3.402847,1 615,60606.0601,5,33.726837,7.368811,0 615,60609.0247,2,-355.611389,2.720825,1 615,60609.0323,1,-537.169312,4.090708,1 615,60609.0400,3,-372.485565,2.985755,1 
615,60609.0510,4,-350.518677,3.225662,1 615,60609.0621,5,-371.873230,6.150734,1 615,60612.0266,0,-110.649872,2.844200,1 615,60613.0269,0,-89.973892,2.937887,1 615,60614.0276,0,-10.015225,3.212408,0 615,60615.0375,0,99.438087,3.662484,1 615,60616.0290,0,120.849113,3.776495,1 615,60617.0295,0,121.411896,3.569777,1 615,60621.1734,2,56.559818,2.259825,1 615,60621.1810,1,-607.040771,4.452463,1 615,60621.1886,3,208.770279,2.959783,1 615,60621.1996,4,297.624725,3.718585,1 615,60621.2105,5,332.919006,8.157172,1 615,60624.1760,2,552.150269,3.917989,1 615,60624.1836,1,607.047668,5.140991,1 615,60624.1913,3,296.946533,3.475000,1 615,60624.2022,4,235.489929,3.926538,1 615,60624.2132,5,157.080200,8.453112,1 713,59825.2600,2,9.110147,1.013889,0 713,59825.2676,1,7.615042,1.160329,1 713,59825.2752,3,6.673631,1.932316,1 713,59825.2862,4,5.214194,3.018003,0 713,59825.2971,5,12.060948,7.163382,0 713,59839.2161,2,4.953065,1.196956,0 713,59839.2236,1,3.131028,1.351706,0 713,59839.2313,3,6.108739,1.789895,0 713,59839.2422,4,8.283792,2.527953,0 713,59839.2532,5,9.686500,5.882469,0 713,59842.1987,2,6.472355,1.110572,0 713,59842.2064,1,5.914848,1.134476,1 713,59842.2140,3,5.311658,1.683777,0 713,59842.2250,4,4.680908,2.349234,0 713,59842.2359,5,6.921503,5.573885,0 713,59851.2006,0,7.267655,2.866838,0 713,59854.2089,2,3.945918,0.922779,0 713,59854.2165,1,2.956484,0.953529,0 713,59854.2242,3,4.768611,1.422675,0 713,59854.2351,4,4.065430,1.943197,0 713,59854.2461,5,1.290383,3.980583,0 713,59857.1879,2,3.937931,2.088610,0 713,59857.1956,1,-2.223347,2.861396,0 713,59857.2032,3,-0.476698,2.280299,0 713,59857.2141,4,8.054095,2.712542,0 713,59857.2251,5,-9.332252,5.174713,0 713,59867.1600,2,1.380378,1.105223,0 713,59867.1676,1,1.569406,1.369900,0 713,59867.1753,3,1.433712,1.730335,0 713,59867.1862,4,5.299760,2.530279,0 713,59867.1971,5,2.355590,5.574841,0 713,59870.1521,2,3.404463,0.889940,0 713,59870.1597,1,2.961649,0.946383,0 713,59870.1673,3,2.594970,1.322884,0 
713,59870.1782,4,2.805032,1.753258,0 713,59870.1892,5,3.101222,4.041600,0 713,59873.1442,2,3.869869,0.763644,0 713,59873.1519,1,4.354049,0.790372,1 713,59873.1595,3,2.949366,1.101570,0 713,59873.1704,4,3.117238,1.623630,0 713,59873.1814,5,4.010789,3.832515,0 713,59874.1612,0,4.171277,2.180456,0 713,59875.1175,0,2.671449,2.536783,0 713,59876.1160,0,-0.354117,2.566333,0 713,59877.1178,0,2.332870,2.217193,0 713,59878.1127,0,3.797837,2.270967,0 713,59879.1104,0,4.900619,2.339577,0 713,59880.1181,0,1.331082,1.865762,0 713,59884.1292,2,3.091794,0.995209,0 713,59884.1368,1,1.712878,1.129837,0 713,59884.1444,3,2.552051,1.229161,0 713,59884.1554,4,2.099711,1.737169,0 713,59884.1663,5,5.788035,3.869600,0 713,59887.0951,2,5.011691,1.668699,0 713,59887.1027,1,3.719429,2.314904,0 713,59887.1103,3,10.036420,1.852814,0 713,59887.1213,4,5.964674,2.277389,0 713,59887.1322,5,8.822542,4.767565,0 713,59896.0839,2,5.064992,0.999215,0 713,59896.0915,1,5.780192,1.083338,0 713,59896.0992,3,3.587355,1.642954,1 713,59896.1101,4,6.185760,2.277758,0 713,59896.1211,5,-4.762829,5.042903,0 713,59899.0854,2,4.823127,1.244829,0 713,59899.0930,1,6.899071,1.246326,0 713,59899.1007,3,3.249064,2.133093,0 713,59899.1116,4,7.382133,3.388385,0 713,59899.1226,5,-4.677240,9.115748,0 713,59902.0445,2,9.166100,1.426165,0 713,59902.0522,1,8.076466,1.395627,0 713,59902.0598,3,11.330316,2.051576,1 713,59902.0707,4,9.245844,2.876306,0 713,59902.0817,5,0.942024,7.248375,0 713,59904.0584,0,3.223553,2.679078,0 713,59905.0468,0,14.509829,3.098125,0 713,59906.0474,0,5.995616,2.589032,0 713,59907.0480,0,5.440472,2.469325,0 713,59908.0487,0,5.961231,3.348282,0 713,59909.0494,0,10.137896,2.151001,0 713,59910.0590,0,8.248549,2.160179,0 713,59913.2446,2,5.475236,0.822163,1 713,59913.2522,1,6.833441,0.969664,0 713,59913.2599,3,6.275328,1.430679,0 713,59913.2708,4,4.298039,2.311868,0 713,59913.2818,5,3.143612,5.875287,0 713,59916.0544,2,10.529041,1.787002,0 713,59916.0621,1,9.129021,2.415574,0 
713,59916.0697,3,5.509865,2.141148,0 713,59916.0806,4,9.827934,2.274502,0 713,59916.0915,5,2.627945,4.551546,0 713,59924.0589,2,5.190053,0.786980,1 713,59924.0665,1,6.531730,0.851491,1 713,59924.0742,3,9.141804,1.210878,1 713,59924.0851,4,9.810373,1.739901,0 713,59924.0961,5,2.349317,4.040898,0 713,59927.0604,2,5.366942,0.863455,0 713,59927.0680,1,4.619713,0.947374,0 713,59927.0756,3,6.296741,1.472587,0 713,59927.0866,4,2.465199,2.073566,0 713,59927.0975,5,-0.702472,6.396966,0 713,59930.0619,2,2.780317,0.759708,1 713,59930.0695,1,4.959312,0.809846,0 713,59930.0771,3,4.033259,1.196190,0 713,59930.0881,4,4.485665,1.901773,0 713,59930.1063,5,6.218721,4.791905,0 713,59933.0632,2,1.830853,0.639458,0 713,59933.0709,1,1.716145,0.707228,0 713,59933.0785,3,4.893567,0.968482,0 713,59933.0944,4,3.197614,1.429430,0 713,59933.1150,5,3.335699,3.523145,0 713,59935.0739,0,0.554208,1.573855,0 713,59936.0735,0,2.584441,1.804314,0 713,59937.0743,0,6.470248,1.848658,0 713,59938.0754,0,0.724684,2.076312,0 713,59939.0808,0,2.375108,2.243821,0 713,59942.0746,2,-0.148046,0.800387,0 713,59942.0889,1,0.648101,0.878962,0 713,59942.0965,3,0.328905,1.080046,0 713,59942.1074,4,-4.550706,1.669870,0 713,59942.1184,5,2.364145,4.108390,0 713,59945.0770,2,-3.002108,1.474453,0 713,59945.0846,1,-1.725136,2.059556,0 713,59945.0922,3,-1.422123,1.846779,0 713,59945.1032,4,-6.208874,2.036851,0 713,59945.1141,5,-2.945050,4.744831,0 713,60192.2930,2,7.250862,1.972519,0 713,60192.3006,1,7.834616,2.653803,0 713,60192.3082,3,6.543319,2.216304,0 713,60192.3192,4,7.960829,2.695718,0 713,60192.3301,5,3.404367,5.894906,0 713,60195.2343,2,4.326025,1.101823,0 713,60195.2419,1,3.302556,1.100982,1 713,60195.2496,3,5.533146,1.655807,0 713,60195.2605,4,5.854890,2.367182,0 713,60195.2715,5,3.825871,5.774144,0 713,60198.2332,0,2.208139,3.192551,0 713,60199.2358,0,8.620851,2.547614,0 713,60200.2314,0,3.770694,2.643626,0 713,60201.2281,0,6.634655,2.975509,0 713,60202.2255,0,9.813441,2.750465,0 
713,60209.2281,2,3.984369,1.161990,0 713,60209.2357,1,1.381281,1.139709,0 713,60209.2433,3,1.715379,1.724909,0 713,60209.2543,4,0.545876,2.622813,0 713,60209.2652,5,6.504875,6.293293,0 713,60212.2147,2,2.873843,1.790648,0 713,60212.2223,1,1.546698,2.668681,0 713,60212.2300,3,5.084908,2.077699,0 713,60212.2409,4,3.087726,2.654123,0 713,60212.2519,5,-2.272981,5.844298,0 713,60223.1948,2,0.405613,1.246678,0 713,60223.2024,1,1.120193,1.277229,0 713,60223.2100,3,1.011539,1.974625,0 713,60223.2210,4,3.507817,2.882992,0 713,60223.2319,5,14.770886,6.656366,0 713,60226.2721,2,1.071414,0.746168,0 713,60226.2797,1,1.648819,0.776689,0 713,60226.2931,3,1.727918,1.133994,0 713,60226.3129,4,-0.916487,1.736045,0 713,60226.3238,5,3.996732,4.304620,0 713,60236.1862,0,4.067199,2.328237,0 713,60238.2696,2,-0.087907,0.758784,0 713,60238.2803,1,-0.829578,0.881391,0 713,60238.2879,3,-0.576265,1.265385,0 713,60238.2988,4,0.329135,1.983817,0 713,60238.3098,5,-4.923808,4.832184,0 713,60241.1342,2,2.333379,1.636026,0 713,60241.1418,1,-0.293893,2.097461,0 713,60241.1495,3,3.571144,1.816384,0 713,60241.1604,4,1.146531,2.471305,0 713,60241.1713,5,-7.436915,5.122927,0 713,60260.0773,0,1.232121,3.040076,0 713,60261.0632,0,2.412768,2.796987,0 713,60262.0637,0,-1.678317,2.631186,0 713,60263.0643,0,-0.390618,2.617705,0 713,60264.0716,0,1.131548,2.522264,0 713,60265.0867,0,-2.189290,2.832789,0 713,60267.0443,2,-2.785007,1.335972,0 713,60267.0519,1,-1.797494,1.524965,0 713,60267.0595,3,-4.881196,1.920069,0 713,60267.0705,4,3.231234,2.734247,0 713,60267.0814,5,-12.699218,5.767424,0 713,60270.1082,2,0.393975,2.344530,0 713,60270.1158,1,2.370688,3.545214,0 713,60270.1234,3,2.489378,2.970400,0 713,60270.1344,4,-7.822262,3.554679,0 713,60270.1453,5,-10.877887,6.829591,0 713,60278.0525,2,-0.607012,0.969379,0 713,60278.0601,1,-0.027766,1.022582,0 713,60278.0677,3,2.024312,1.532588,0 713,60278.0786,4,-2.256550,2.154194,0 713,60278.0896,5,-1.112494,5.843420,0 713,60281.0552,2,-1.695972,1.034572,0 
713,60281.0629,1,-1.833499,1.044974,0 713,60281.0705,3,-0.919016,1.470630,0 713,60281.0814,4,-1.391540,2.003621,0 713,60281.0924,5,-3.945375,4.607381,0 713,60284.0557,2,-2.572076,0.783297,0 713,60284.0633,1,-2.387862,0.801296,0 713,60284.0709,3,-5.832908,1.240340,0 713,60284.0819,4,-1.649157,1.857165,0 713,60284.0928,5,8.627832,4.482957,0 713,60287.0577,2,-0.764727,0.906658,0 713,60287.0653,1,-1.477176,0.966182,0 713,60287.0729,3,-0.536819,1.463827,0 713,60287.0839,4,-0.667864,2.361719,0 713,60287.0948,5,-5.286497,6.810267,0 713,60290.0641,0,-1.816348,2.038470,0 713,60291.0599,0,-5.132619,1.989085,0 713,60292.0607,0,-5.080487,1.908693,0 713,60293.0607,0,-5.075594,1.939040,0 713,60294.0616,0,-0.566193,1.833754,0 713,60295.0621,0,-3.857503,2.114682,0 713,60297.1169,2,-4.830737,0.921245,0 713,60297.1245,1,-4.334117,1.068175,0 713,60297.1321,3,-4.767125,1.234546,0 713,60297.1431,4,-4.473659,1.723665,0 713,60297.1540,5,-10.414721,4.258311,0 713,60300.0641,2,-5.492156,1.607434,0 713,60300.0717,1,-3.599649,2.103016,0 713,60300.0793,3,-3.476922,1.725975,0 713,60300.1031,4,-2.745461,2.269754,0 713,60300.1160,5,7.738044,4.946638,0 713,60554.2916,0,-9.100129,2.393532,0 713,60555.2620,0,-8.218450,2.308315,1 713,60556.2548,0,-5.576579,2.770439,0 713,60557.2501,0,-9.173389,2.218352,0 713,60558.2534,0,-13.083604,2.663738,0 713,60559.2490,0,-9.237353,2.428750,0 713,60560.2424,0,-10.050170,3.275514,0 713,60567.2231,2,-8.265152,1.515329,0 713,60567.2308,1,-8.954789,2.105672,1 713,60567.2384,3,-8.418892,1.630414,0 713,60567.2493,4,-12.286801,1.978125,1 713,60567.2603,5,-11.054881,4.445991,0 713,60578.2746,2,-4.951467,1.229683,0 713,60578.2822,1,-7.403615,1.305094,0 713,60578.2898,3,-5.050255,1.839125,1 713,60578.3008,4,-7.385537,2.667687,0 713,60578.3117,5,-6.356452,6.387929,0 713,60581.1779,2,-5.760825,1.288651,1 713,60581.1855,1,-7.428378,1.275975,0 713,60581.1931,3,-6.902376,1.927237,0 713,60581.2041,4,-9.594004,2.818656,0 713,60581.2150,5,-14.211164,6.624023,0 
713,60582.2087,0,-11.829331,2.358846,0 713,60583.1842,0,-9.363182,3.042286,0 713,60584.1807,0,-9.220502,2.544668,0 713,60585.1757,0,-3.587870,2.280919,0 713,60586.1765,0,-9.129416,2.146863,0 713,60587.1702,0,-5.876253,2.481174,0 713,60588.1666,0,-9.116284,2.157747,0 713,60593.1682,2,-9.569608,0.985850,1 713,60593.1758,1,-8.809836,1.078624,1 713,60593.1834,3,-9.553467,1.347112,1 713,60593.1944,4,-9.193518,1.914358,1 713,60593.2053,5,-8.280509,4.493694,0 713,60596.1820,2,-8.760753,2.037911,0 713,60596.1896,1,-4.396494,2.863201,0 713,60596.1972,3,-11.907238,2.252078,1 713,60596.2081,4,-4.786119,2.784098,0 713,60596.2191,5,-5.489277,6.255779,0 713,60605.1380,2,-9.696579,0.873996,1 713,60605.1456,1,-11.159884,0.883977,1 713,60605.1532,3,-12.394593,1.305202,1 713,60605.1642,4,-9.511388,1.906236,0 713,60605.1751,5,-6.906372,4.587698,0 713,60608.1308,2,-9.163915,0.800012,1 713,60608.1384,1,-11.715749,0.823976,1 713,60608.1460,3,-11.449253,1.202452,0 713,60608.1569,4,-12.221146,1.749559,1 713,60608.1679,5,-2.633516,4.132709,0 713,60611.1227,2,-10.067919,0.717739,1 713,60611.1303,1,-9.289042,0.761477,1 713,60611.1380,3,-10.801243,1.080986,1 713,60611.1489,4,-11.623042,1.560488,1 713,60611.1599,5,-7.861447,3.710802,0 713,60612.1183,0,-11.605895,1.778605,1 713,60613.1019,0,-11.340659,1.930082,1 713,60614.0960,0,-10.934606,2.143276,1 713,60615.0917,0,-14.735178,2.326417,0 713,60616.0927,0,-12.353376,2.357691,1 713,60617.0896,0,-6.599936,2.023456,0 713,60620.1350,0,-6.110061,2.056073,0 713,60621.1263,2,-5.537477,0.829998,1 713,60621.1339,1,-7.972793,0.877838,1 713,60621.1416,3,-7.565215,1.233034,1 713,60621.1525,4,-6.638791,1.780862,0 713,60621.1635,5,-9.333499,4.242186,0 713,60624.0821,2,-4.490414,1.314625,0 713,60624.0897,1,-5.545699,1.695655,0 713,60624.0974,3,-7.286825,1.600662,0 713,60624.1083,4,-6.478677,2.086655,0 713,60624.1193,5,-7.099849,4.810002,0 713,60627.2801,2,-6.219934,2.530638,0 713,60627.2877,1,-5.039655,3.228468,0 713,60627.2954,3,-0.950650,2.782719,0 
713,60627.3063,4,-4.143107,3.379841,0 713,60627.3173,5,0.243241,7.639313,0 713,60632.0400,2,-5.855491,1.492071,0 713,60632.0476,1,-6.847743,1.453141,0 713,60632.0552,3,-9.374930,2.217679,0 713,60632.0662,4,-10.557325,3.237312,0 713,60632.0771,5,-2.921649,7.857955,0 713,60635.0469,2,-6.480945,1.091159,1 713,60635.0545,1,-6.966879,1.087843,1 713,60635.0621,3,-6.185159,1.629475,1 713,60635.0731,4,-5.490345,2.377301,0 713,60635.0840,5,-2.292507,5.646507,0 713,60640.0504,2,-7.312206,0.974580,1 713,60640.0580,1,-7.250492,0.991461,0 713,60640.0656,3,-10.161006,1.479101,1 713,60640.0766,4,-6.631466,2.145102,0 713,60640.0875,5,-10.591419,5.138685,0 713,60643.0609,0,-9.289350,1.992813,1 713,60644.0533,0,-8.482151,2.118450,0 713,60645.0537,0,-2.605739,2.197297,1 713,60646.0548,0,-8.104684,2.135281,0 713,60647.0546,0,-7.506279,2.275638,0 713,60648.0553,0,-10.602926,1.838902,1 713,60649.0561,0,-12.232555,1.708795,0 713,60651.1265,2,-9.331477,0.865811,1 713,60651.1451,1,-10.061421,0.932510,1 713,60651.1527,3,-9.335849,1.315029,1 713,60651.1637,4,-6.167844,1.952829,0 713,60651.1746,5,-10.171921,4.815349,0 713,60654.0597,2,-9.607999,1.647062,1 713,60654.0673,1,-6.258916,2.222855,0 713,60654.0749,3,-9.524345,1.901351,0 713,60654.0859,4,-9.513783,2.389906,0 713,60654.0968,5,-9.744430,5.404162,0 713,60662.1451,2,-5.698765,1.334831,0 713,60662.1527,1,-4.317381,2.017339,0 713,60662.1603,3,-6.093997,1.675434,0 713,60662.1713,4,-5.760686,2.093239,0 713,60662.1822,5,-5.400730,4.721159,0 713,60665.0637,2,-6.826318,0.676851,1 713,60665.0713,1,-6.498077,0.740772,1 713,60665.0789,3,-5.938825,1.048616,1 713,60665.1017,4,-6.440791,1.543503,1 713,60665.1156,5,-13.727009,3.770338,0 713,60668.0647,2,-6.938087,0.920544,1 713,60668.0723,1,-8.995543,0.954973,1 713,60668.0893,3,-10.263328,1.437371,1 713,60668.1055,4,-5.455149,2.164149,0 713,60668.1165,5,-9.138229,5.354884,0 713,60671.0655,0,-10.165054,1.726118,1 713,60672.0693,0,-10.828177,1.470152,1 713,60673.0745,0,-12.148479,2.243120,0 
713,60674.0798,0,-8.669188,2.216094,0 730,59798.3205,2,1.177371,1.364300,0 730,59798.3281,1,2.320849,1.159247,0 730,59798.3357,3,2.939447,1.771328,0 730,59798.3466,4,2.128097,2.610659,0 730,59798.3576,5,-12.809639,5.380097,0 730,59801.3553,2,0.111235,2.460576,0 730,59801.3629,1,-3.393080,3.564052,0 730,59801.3705,3,-1.899219,2.292693,0 730,59801.3815,4,2.284906,2.523534,0 730,59801.3924,5,5.203419,5.395980,0 730,59818.2740,0,-2.342200,1.801066,0 730,59819.2541,0,3.380978,2.469600,0 730,59820.2522,0,-2.230815,1.915426,0 730,59821.2478,0,1.159034,2.461736,0 730,59822.2433,0,5.942166,2.901580,0 730,59823.2659,0,-0.180970,2.714361,0 730,59826.3105,2,0.521923,0.925337,0 730,59826.3181,1,-1.421768,0.929596,0 730,59826.3258,3,0.972355,1.513987,0 730,59826.3367,4,-0.570261,2.162375,0 730,59826.3477,5,-2.301237,5.548611,0 730,59842.2456,2,0.156290,0.853800,0 730,59842.2532,1,-0.567360,0.819375,0 730,59842.2608,3,-0.251899,1.325633,0 730,59842.2718,4,2.019500,2.173066,0 730,59842.2827,5,10.142254,6.086383,0 730,59851.1792,0,-1.472170,2.597541,0 730,59854.1485,2,0.368931,1.230250,0 730,59854.1563,1,0.664051,1.345911,0 730,59854.1640,3,3.201455,1.909905,0 730,59854.1750,4,3.012713,2.778862,0 730,59854.1860,5,3.750187,5.803461,0 730,59857.1408,2,1.076537,2.141015,0 730,59857.1485,1,5.693109,2.937809,0 730,59857.1563,3,-2.640246,2.100464,0 730,59857.1673,4,0.402461,2.684283,0 730,59857.1782,5,-4.509360,6.643411,0 730,59867.1112,2,-0.449365,1.088300,0 730,59867.1189,1,0.282022,0.981426,0 730,59867.1267,3,0.026595,1.544194,0 730,59867.1377,4,0.956947,2.364042,0 730,59867.1487,5,2.724518,5.845339,0 730,59870.1049,2,1.070328,0.976301,0 730,59870.1126,1,0.511964,0.828288,0 730,59870.1204,3,-0.505236,1.377689,0 730,59870.1314,4,0.251195,2.289763,0 730,59870.1424,5,4.119082,5.293428,0 730,59873.0971,2,-0.211154,1.045822,0 730,59873.1049,1,-1.287062,1.048773,0 730,59873.1126,3,-1.557674,1.446841,0 730,59873.1236,4,-0.739414,2.074561,0 730,59873.1346,5,5.151175,4.601235,0 
730,59874.1461,0,0.412505,1.627923,0 730,59875.0995,0,-2.200486,2.037783,0 730,59876.0980,0,-2.931559,2.450620,0 730,59877.0976,0,2.024089,1.789397,0 730,59878.0964,0,-1.250103,2.029308,0 730,59879.0895,0,-0.671039,1.877854,0 730,59880.1017,0,0.189355,1.384724,0 730,59884.1760,2,-0.108323,0.771566,0 730,59884.1836,1,-1.113737,0.892852,0 730,59884.1913,3,-0.427802,1.142666,0 730,59884.2022,4,1.402694,1.614300,0 730,59884.2132,5,-1.060647,4.220271,0 730,59887.2856,2,1.474370,1.835391,0 730,59887.2933,1,-3.203188,2.221069,0 730,59887.3009,3,-5.435799,2.359130,0 730,59887.3118,4,0.192088,3.154000,0 730,59887.3228,5,-1.082339,7.966248,0 730,59896.1307,2,1.145174,0.826742,0 730,59896.1384,1,-0.032153,0.707979,0 730,59896.1460,3,-0.357363,1.398256,0 730,59896.1569,4,-2.788487,2.198583,0 730,59896.1679,5,2.883538,5.962979,0 730,59899.1519,2,-1.456884,1.371527,0 730,59899.1595,1,-0.707794,1.396877,0 730,59899.1672,3,5.298447,2.388603,0 730,59899.1781,4,6.412822,3.720956,0 730,59899.1891,5,1.091714,7.924479,0 730,59902.1384,2,-0.887660,1.191683,0 730,59902.1460,1,0.168580,1.085883,0 730,59902.1537,3,-1.369444,1.950019,0 730,59902.1646,4,0.303218,2.770533,0 730,59902.1755,5,11.777126,7.044582,0 730,59904.1053,0,-0.620050,2.301550,0 730,59905.0555,0,0.922903,2.527480,0 730,59906.0562,0,-1.153271,2.043133,0 730,59907.0567,0,0.449173,1.764913,0 730,59908.0681,0,0.837362,2.710272,0 730,59909.0582,0,-0.985495,1.850359,0 730,59910.0503,0,-0.355463,1.880359,0 730,59914.0526,2,-0.833646,1.445693,0 730,59914.0602,1,0.370377,1.970406,0 730,59914.0678,3,-1.719942,1.692403,0 730,59914.0788,4,1.004354,2.274112,0 730,59914.0897,5,0.138586,5.367689,0 730,59924.1060,2,-0.787230,1.137160,0 730,59924.1136,1,-1.572903,1.584968,0 730,59924.1212,3,0.555294,1.735223,0 730,59924.1322,4,-2.475216,2.533980,0 730,59924.1431,5,-0.816748,5.644574,0 730,59927.1074,2,-0.474538,1.196533,0 730,59927.1151,1,0.973025,1.142775,0 730,59927.1227,3,-2.039601,1.598035,0 730,59927.1336,4,-1.036243,2.305239,0 
730,59927.1446,5,-3.438392,5.903537,0 730,59930.1236,2,-0.114812,0.945627,0 730,59930.1312,1,0.475511,0.835235,0 730,59930.1388,3,0.226621,1.287869,0 730,59930.1498,4,-3.755495,2.037717,0 730,59930.1607,5,2.542647,5.343603,0 730,59933.1249,2,0.603719,0.695106,0 730,59933.1325,1,-0.226574,0.698751,0 730,59933.1401,3,0.106692,1.273440,0 730,59933.1511,4,0.993756,1.919590,0 730,59933.1620,5,5.318815,5.441072,0 730,59934.0638,0,1.190260,1.159169,0 730,59935.0646,0,-0.320948,1.132809,0 730,59936.0642,0,-1.230814,1.533033,0 730,59937.0650,0,-0.751357,1.654440,0 730,59938.0647,0,-3.109122,2.015928,0 730,59939.0650,0,1.571790,2.219707,0 730,60165.3032,2,-0.502432,1.200698,0 730,60165.3109,1,-2.832010,1.356671,0 730,60165.3186,3,1.843434,1.524752,0 730,60165.3295,4,3.196369,2.029726,0 730,60165.3405,5,-8.889149,4.793297,0 730,60168.2892,2,-0.176546,0.837958,0 730,60168.2970,1,0.201754,0.793672,0 730,60168.3047,3,0.369397,1.305260,0 730,60168.3157,4,-2.235131,2.071596,0 730,60168.3267,5,-5.047883,4.827778,0 730,60176.2820,0,1.753881,2.391554,0 730,60177.2726,0,-1.260694,2.790846,0 730,60181.4088,2,1.103341,0.929294,0 730,60181.4164,1,1.391831,1.272189,0 730,60181.4232,3,-4.766650,3.249354,0 730,60183.2660,2,-2.848838,1.924783,0 730,60183.2736,1,-3.133007,2.955767,0 730,60183.2812,3,2.159384,2.557742,0 730,60183.2922,4,-5.836310,3.003132,0 730,60183.3031,5,-8.716421,6.192660,0 730,60195.2812,2,-0.162802,0.937681,0 730,60195.2888,1,0.864197,1.028533,0 730,60195.2964,3,-2.294667,1.631539,0 730,60195.3073,4,-2.693345,2.345676,0 730,60195.3183,5,-0.570636,6.289552,0 730,60198.2690,0,1.026459,2.192766,0 730,60199.2186,0,0.527036,2.176080,0 730,60200.2139,0,-1.516695,2.611164,0 730,60201.2072,0,0.150572,2.513518,0 730,60202.2089,0,-1.982165,2.505382,0 730,60209.1811,2,-0.462435,1.572888,0 730,60209.1888,1,2.153122,1.421764,0 730,60209.1965,3,0.765511,1.838664,0 730,60209.2075,4,-1.591831,2.917069,0 730,60209.2184,5,0.939232,6.712256,0 730,60212.1675,2,-2.669531,1.944909,0 
730,60212.1753,1,3.706729,2.772595,0 730,60212.1830,3,-1.340184,2.303333,0 730,60212.1941,4,-3.525083,2.598093,0 730,60212.2050,5,7.831807,5.678200,0 730,60223.2416,2,1.328195,1.188578,0 730,60223.2493,1,-0.298775,1.249490,0 730,60223.2569,3,2.987647,1.656098,0 730,60223.2678,4,4.241424,2.481011,0 730,60223.2788,5,9.333996,5.811805,0 730,60226.3337,2,-0.679090,0.949060,0 730,60226.3413,1,1.282680,0.952563,0 730,60226.3489,3,0.970081,1.800158,0 730,60226.3599,4,1.366870,3.079182,0 730,60226.3708,5,-4.552550,8.128254,0 730,60238.3197,2,-0.599053,1.058323,0 730,60238.3273,1,0.734427,1.161131,0 730,60238.3349,3,2.553997,2.179708,0 730,60238.3459,4,1.377842,3.840058,0 730,60238.3568,5,-19.159811,11.281384,0 730,60241.0870,2,0.823192,2.008905,0 730,60241.0948,1,3.386674,3.088520,0 730,60241.1025,3,0.043122,2.453789,0 730,60241.1136,4,-1.052531,2.925313,0 730,60241.1245,5,-8.036972,5.953956,0 730,60250.1708,2,-0.962673,2.112349,0 730,60250.1957,1,0.580816,2.829899,0 730,60250.2034,3,6.845217,2.940232,0 730,60250.2143,4,0.204509,3.994097,0 730,60250.2253,5,1.290714,7.954757,0 730,60261.1296,0,1.237353,2.094631,0 730,60262.0550,0,3.469973,2.338792,0 730,60263.0556,0,2.352035,1.998888,0 730,60264.0559,0,-2.396658,2.192123,0 730,60265.0780,0,3.070599,2.439756,0 730,60268.0449,2,-0.465229,1.410433,0 730,60268.0525,1,0.174091,1.861911,0 730,60268.0601,3,0.538344,2.137292,0 730,60268.0711,4,-3.556071,2.741589,0 730,60268.0820,5,-3.639747,6.760314,0 730,60278.0993,2,2.521567,1.652593,0 730,60278.1069,1,-2.468382,2.289480,0 730,60278.1145,3,-1.407348,2.177464,0 730,60278.1255,4,3.475310,3.068326,0 730,60278.1364,5,8.474236,7.658961,0 730,60281.1023,2,-1.139811,0.801878,0 730,60281.1099,1,-1.247972,0.800422,0 730,60281.1175,3,-1.347594,1.374244,0 730,60281.1285,4,-0.890039,1.996277,0 730,60281.1394,5,2.285095,5.189152,0 730,60284.1027,2,-0.679968,0.840813,0 730,60284.1104,1,-0.530991,0.766401,0 730,60284.1180,3,-1.148911,1.164351,0 730,60284.1289,4,0.493227,1.707686,0 
730,60284.1399,5,-4.683412,4.445528,0 730,60287.1047,2,0.288175,1.256500,0 730,60287.1123,1,-2.067724,1.090506,0 730,60287.1200,3,-0.846692,1.802521,0 730,60287.1309,4,-5.051833,2.972183,0 730,60287.1418,5,-13.252449,7.029711,0 730,60290.0761,0,-3.000368,1.929932,0 730,60291.0689,0,1.081815,1.681175,0 730,60292.0699,0,2.211185,1.882060,0 730,60293.0699,0,0.182480,1.575780,0 730,60294.0708,0,1.605139,1.418435,0 730,60532.3019,2,20.994711,1.047298,1 730,60532.3097,1,1.504146,0.960956,0 730,60532.3173,3,31.523088,1.569497,1 730,60532.3282,4,41.159981,2.310168,1 730,60532.3392,5,46.795868,5.458707,1 730,60535.2802,2,20.880348,1.426747,0 730,60535.2879,1,2.271271,1.348233,0 730,60535.2957,3,30.361010,2.107024,1 730,60535.3068,4,40.715591,3.043571,1 730,60535.3177,5,47.310059,7.197146,1 730,60538.2826,2,19.450977,1.865142,1 730,60538.2903,1,3.462672,2.695356,0 730,60538.2980,3,33.572102,1.944897,1 730,60538.3089,4,38.518837,2.334413,1 730,60538.3199,5,40.146099,5.039364,1 730,60554.2651,0,0.190944,2.266587,0 730,60555.2411,0,0.098122,2.049620,0 730,60556.2370,0,-0.253067,2.551228,0 730,60557.2322,0,-2.200897,1.848830,0 730,60558.2332,0,-3.459960,2.511074,0 730,60559.2274,0,0.328893,2.224590,0 730,60560.2268,0,2.453341,3.110694,0 730,60567.3291,2,15.044784,0.951184,1 730,60567.3368,1,-0.142653,1.050350,0 730,60567.3444,3,18.416132,1.262663,1 730,60567.3553,4,28.234451,1.676854,1 730,60567.3663,5,31.623583,4.281011,1 730,60580.1736,2,12.164557,1.463993,1 730,60580.1813,1,2.065962,1.402610,0 730,60580.1889,3,10.053763,2.203885,1 730,60580.1999,4,19.975168,3.213686,1 730,60580.2108,5,24.093925,7.662856,0 730,60582.1681,0,-0.473370,2.422541,0 730,60583.1640,0,-3.070249,3.006098,0 730,60584.1591,0,0.970706,2.362254,0 730,60585.1601,0,-0.533032,1.881978,0 730,60586.1564,0,-0.049936,1.830623,0 730,60587.1540,0,-2.202578,2.138732,0 730,60588.1461,0,1.361049,1.798501,0 730,60593.1209,2,6.307311,1.155241,1 730,60593.1287,1,0.462838,1.313489,0 730,60593.1365,3,8.789671,1.545997,1 
730,60593.1476,4,10.031554,2.179338,0 730,60593.1585,5,10.850924,5.118365,0 730,60596.1351,2,8.231540,2.483539,0 730,60596.1427,1,1.623348,3.539990,0 730,60596.1504,3,9.673650,2.658536,0 730,60596.1613,4,8.778720,3.238315,0 730,60596.1723,5,10.870938,7.199404,0 730,60605.0908,2,5.607800,0.980471,1 730,60605.0986,1,-0.587054,0.918425,0 730,60605.1063,3,6.155015,1.461859,1 730,60605.1174,4,7.274523,2.152366,0 730,60605.1283,5,2.950838,5.145659,0 730,60608.0836,2,5.938226,0.889949,0 730,60608.0913,1,0.898013,0.825604,0 730,60608.0991,3,5.509429,1.330889,0 730,60608.1101,4,9.166319,1.962560,0 730,60608.1211,5,3.346682,4.667600,0 730,60611.0756,2,2.112415,0.773398,1 730,60611.0833,1,0.247475,0.704158,0 730,60611.0911,3,1.898379,1.172223,0 730,60611.1021,4,4.244992,1.745410,0 730,60611.1130,5,6.172510,4.175368,0 730,60612.0813,0,1.228119,1.461220,0 730,60613.0818,0,1.540095,1.541647,0 730,60614.0803,0,1.231758,1.758784,0 730,60615.0761,0,-0.502854,1.998764,0 730,60616.0769,0,-2.247711,2.024976,0 730,60617.0737,0,-1.035569,1.574103,0 730,60620.1444,0,-1.018565,1.722706,0 730,60621.2673,2,2.330264,0.944892,0 730,60621.2749,1,-0.153496,0.950369,0 730,60621.2825,3,3.589653,1.651967,0 730,60621.2934,4,1.950011,2.746167,0 730,60621.3044,5,6.258384,7.527862,0 ================================================ FILE: examples/data/plasticc_training_set_metadata_1k.csv ================================================ object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target 615,349.046051,-61.943836,320.796530,-51.753706,1,0.0000,0.0000,0.0000,nan,0.017,92 713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88 730,33.574219,-6.579593,170.455585,-61.548219,1,0.2320,0.2262,0.0157,40.2561,0.021,42 745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90 1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90 
1227,35.683594,-5.379379,171.992947,-59.253501,1,0.0000,0.0000,0.0000,nan,0.020,65 1598,347.846710,-64.760857,318.929827,-49.143596,1,0.1352,0.1820,0.0304,39.7279,0.019,90 1632,348.595886,-63.072620,320.023289,-50.713060,1,0.6857,0.7014,0.0100,43.1524,0.021,42 1920,149.414062,3.433834,234.919132,42.245550,1,0.3088,0.3229,0.3360,41.1401,0.027,90 1926,149.414062,1.940072,236.565366,41.393323,1,0.0000,0.0000,0.0000,nan,0.018,65 2072,0.965665,-46.375080,325.845907,-68.579427,1,0.1516,0.1900,0.0104,39.8317,0.007,90 2103,346.500000,-62.320400,321.951129,-50.736054,1,0.1695,0.5409,0.2283,42.4667,0.020,42 2300,359.446716,-44.201530,331.730015,-69.805709,1,0.2360,2.7474,0.5335,46.7959,0.010,42 2330,359.805206,-46.768478,327.135979,-67.829903,1,0.4541,0.5736,0.2827,42.6207,0.011,90 2624,346.655182,-63.260487,320.952196,-50.040935,1,0.0000,0.0000,0.0000,nan,0.019,65 2677,53.964844,-28.630989,225.142950,-53.813613,1,0.0000,0.0000,0.0000,nan,0.009,16 2922,352.398651,-62.696659,318.017427,-51.967966,1,0.1539,0.1469,0.0094,39.2171,0.020,67 3041,346.130127,-63.072620,321.423103,-50.042305,1,0.1069,0.1274,0.0198,38.8800,0.020,67 3285,150.820312,1.641510,237.994507,42.358984,1,0.1610,0.1818,0.0079,39.7258,0.020,42 3423,349.615387,-63.636005,318.927246,-50.506542,1,1.9876,1.1213,0.1591,44.4078,0.018,95 3489,150.117188,2.836105,236.124718,42.483719,1,1.1330,1.4377,0.2168,45.0753,0.016,88 3910,0.589520,-47.161343,325.385896,-67.769893,1,0.1969,2.6766,0.5926,46.7274,0.009,62 4088,0.965665,-46.375080,325.845907,-68.579427,1,0.4833,0.4644,0.0321,42.0691,0.007,88 4132,359.811707,-45.191612,329.485675,-69.150905,1,0.0561,0.0556,0.0301,36.9750,0.010,42 4171,2.097458,-45.783966,324.737840,-69.478613,1,0.0000,0.0000,0.0000,nan,0.011,16 4173,152.050781,3.284369,237.157374,44.318466,1,0.5149,0.5512,0.0221,42.5158,0.019,15 4220,358.648071,-46.375080,329.462659,-67.716008,1,0.1197,0.1322,0.3351,38.9679,0.009,42 4389,151.699219,3.583322,236.533224,44.205648,1,0.2333,0.2205,0.9667,40.1939,0.016,90 
4595,349.615387,-63.636005,318.927246,-50.506542,1,0.5919,0.5995,0.0127,42.7370,0.018,90 4819,35.332031,-5.979157,172.286722,-59.931743,1,0.3053,0.2870,0.0076,40.8445,0.022,90 5527,347.861847,-61.943836,321.519104,-51.424048,1,0.1315,0.2487,0.8604,40.4896,0.017,42 6180,33.222656,-4.780192,167.515653,-60.396584,1,0.3201,0.2685,0.5211,40.6793,0.018,90 6266,0.929752,-44.597992,328.531426,-70.083244,1,0.0000,0.0000,0.0000,nan,0.011,65 6762,348.595886,-63.072620,320.023289,-50.713060,1,0.3863,0.3983,0.0132,41.6735,0.021,90 6947,34.277344,-5.679190,170.314930,-60.410322,1,0.5680,0.5667,0.0181,42.5888,0.020,90 7033,52.207031,-28.291550,224.208534,-55.300157,1,0.0826,0.0850,0.0073,37.9414,0.007,42 7164,347.861847,-61.943836,321.519104,-51.424048,1,0.4299,0.4245,0.0288,41.8371,0.017,90 7315,2.071130,-45.191612,325.606223,-69.989264,1,0.1330,0.1337,0.0171,38.9942,0.011,88 7409,352.398651,-62.696659,318.017427,-51.967966,1,3.4451,0.5176,1.2609,42.3516,0.020,88 7566,359.446716,-44.201530,331.730015,-69.805709,1,0.0000,0.0000,0.0000,nan,0.010,16 7698,347.013428,-62.508568,321.472056,-50.735330,1,0.2628,0.1876,0.0216,39.8011,0.018,90 7703,53.085938,-28.122234,224.100909,-54.509752,1,0.0830,0.0820,0.2257,37.8568,0.007,62 7756,149.414062,2.238686,236.239766,41.565558,1,0.0000,0.0000,0.0000,nan,0.017,16 8328,1.694561,-45.191612,326.278557,-69.858253,1,0.3779,0.4808,0.2970,42.1592,0.011,90 8688,32.695312,-4.929937,166.868469,-60.841230,1,0.0000,0.0000,0.0000,nan,0.018,65 8745,349.966217,-62.696659,319.542989,-51.376556,1,0.6276,0.6136,0.0129,42.7983,0.021,90 8784,34.101562,-5.829153,170.247753,-60.638325,1,0.0000,0.0000,0.0000,nan,0.019,16 9006,34.277344,-5.079716,169.526841,-59.956640,1,0.0000,0.0000,0.0000,nan,0.019,65 9172,346.655182,-63.260487,320.952196,-50.040935,1,0.0000,0.0000,0.0000,nan,0.019,65 9184,0.949367,-45.586655,326.991548,-69.251686,1,1.4031,1.2719,0.4971,44.7463,0.013,88 9203,51.855469,-27.953188,223.543603,-55.561470,1,0.2138,0.1111,0.0626,38.5591,0.008,90 
9543,352.132874,-63.636005,317.424173,-51.095855,1,0.0000,0.0000,0.0000,nan,0.021,65 9936,32.871094,-4.780192,166.959493,-60.615132,1,0.1633,0.0719,0.0389,37.5580,0.017,42 9985,150.820312,3.732834,235.666318,43.572109,1,0.0000,0.0000,0.0000,nan,0.016,65 10321,358.312500,-44.993881,332.185785,-68.685906,1,1.0833,1.1162,0.1020,44.3954,0.009,95 10337,54.667969,-27.615883,223.610785,-53.050840,1,0.6830,0.6725,0.0089,43.0404,0.009,90 10349,34.980469,-6.279288,172.180075,-60.389399,1,0.0000,0.0000,0.0000,nan,0.023,65 10478,52.910156,-27.953188,223.774083,-54.639214,1,0.5552,0.2233,0.2002,40.2248,0.007,90 10586,358.636353,-46.768478,328.890146,-67.388837,1,0.6052,0.6017,0.0153,42.7467,0.008,88 10757,52.910156,-26.276812,220.926149,-54.363918,1,0.1699,0.1711,0.0185,39.5801,0.008,52 10796,52.910156,-25.944481,220.366350,-54.301439,1,0.0000,0.0000,0.0000,nan,0.010,65 10798,351.299988,-62.320400,319.038597,-52.026867,1,0.1778,0.1872,0.0121,39.7959,0.018,42 11165,150.996094,2.985506,236.647967,43.287350,1,0.0000,0.0000,0.0000,nan,0.020,16 11359,349.966217,-62.696659,319.542989,-51.376556,1,0.1529,0.1415,0.0072,39.1281,0.021,42 11507,53.085938,-28.122234,224.100909,-54.509752,1,0.3312,0.5095,0.0718,42.3102,0.007,90 11770,346.130127,-63.072620,321.423103,-50.042305,1,0.1415,0.2171,0.4350,40.1560,0.020,62 11773,150.644531,3.583322,235.698235,43.342784,1,0.2207,0.5279,0.1679,42.4027,0.018,52 11931,149.589844,3.583322,234.885369,42.474696,1,0.0000,0.0000,0.0000,nan,0.024,65 11978,358.648071,-46.375080,329.462659,-67.716008,1,0.4920,0.4605,0.0179,42.0472,0.009,90 12695,51.855469,-28.630989,224.733260,-55.649872,1,0.0000,0.0000,0.0000,nan,0.009,92 12872,347.861847,-61.943836,321.519104,-51.424048,1,0.0000,0.0000,0.0000,nan,0.017,65 13079,151.699219,3.583322,236.533224,44.205648,1,0.2019,2.4470,1.0434,46.4913,0.016,90 13138,346.655182,-63.260487,320.952196,-50.040935,1,0.0756,0.5192,0.2158,42.3596,0.019,52 
13194,53.789062,-27.784405,223.685697,-53.845803,1,0.5195,0.5624,0.2843,42.5685,0.009,90 13459,150.117188,2.836105,236.124718,42.483719,1,0.3495,0.3449,0.6556,41.3068,0.016,90 13482,33.750000,-4.630479,168.146242,-59.949072,1,0.2929,0.3115,0.0205,41.0501,0.019,90 13504,1.363636,-46.768478,324.669342,-68.371416,1,0.4469,0.3816,0.0766,41.5643,0.008,90 14080,150.996094,4.181528,235.291975,43.970869,1,0.0000,0.0000,0.0000,nan,0.015,65 14156,53.085938,-27.111860,222.384291,-54.355086,1,0.0000,0.0000,0.0000,nan,0.007,65 14279,54.667969,-27.615883,223.610785,-53.050840,1,0.3434,0.5728,0.4518,42.6167,0.009,52 14398,2.071130,-45.191612,325.606223,-69.989264,1,0.2812,0.2634,1.0581,40.6310,0.011,90 14539,150.644531,3.583322,235.698235,43.342784,1,0.2882,0.2359,0.0434,40.3590,0.018,88 14553,359.805206,-46.768478,327.135979,-67.829903,1,1.1897,1.1667,0.1717,44.5143,0.011,95 14601,32.695312,-4.929937,166.868469,-60.841230,1,0.3837,0.3653,0.2005,41.4527,0.018,90 14674,33.750000,-4.630479,168.146242,-59.949072,1,0.2012,0.0567,0.4176,37.0171,0.019,90 14983,349.615387,-63.636005,318.927246,-50.506542,1,0.3391,0.3238,0.0255,41.1476,0.018,90 15002,349.046051,-61.943836,320.796530,-51.753706,1,0.3409,0.3512,0.0531,41.3530,0.017,90 15251,32.871094,-4.780192,166.959493,-60.615132,1,0.4653,2.3270,0.6097,46.3585,0.017,90 15475,351.382965,-64.011238,317.574052,-50.604657,1,0.0000,0.0000,0.0000,nan,0.023,65 15626,346.130127,-63.072620,321.423103,-50.042305,1,0.0000,0.0000,0.0000,nan,0.020,16 15674,0.965665,-46.375080,325.845907,-68.579427,1,0.2927,0.2727,0.3286,40.7172,0.007,90 15700,359.415588,-46.768478,327.729895,-67.686097,1,0.0000,0.0000,0.0000,nan,0.009,16 15718,51.855469,-27.953188,223.543603,-55.561470,1,0.1193,2.3179,0.7672,46.3482,0.008,52 15845,53.789062,-27.784405,223.685697,-53.845803,1,0.3174,0.3471,0.8216,41.3232,0.009,90 15968,149.414062,2.238686,236.239766,41.565558,1,0.3509,0.4729,0.4544,42.1164,0.017,90 
16339,51.328125,-27.447618,222.535046,-55.950727,1,0.0000,0.0000,0.0000,nan,0.013,16 16349,150.820312,3.134927,236.341348,43.230123,1,0.0000,0.0000,0.0000,nan,0.016,16 16463,151.699219,3.583322,236.533224,44.205648,1,0.2023,0.1805,0.0254,39.7082,0.016,90 16496,359.415588,-46.768478,327.729895,-67.686097,1,0.3391,0.3895,0.2635,41.6162,0.009,52 16802,53.437500,-29.142223,225.908120,-54.336118,1,0.3145,0.3319,0.0234,41.2094,0.008,90 16983,150.117188,3.732834,235.120533,42.993809,1,0.2899,0.2762,0.1879,40.7495,0.020,90 17094,52.207031,-28.291550,224.208534,-55.300157,1,0.0000,0.0000,0.0000,nan,0.007,16 17172,53.437500,-29.142223,225.908120,-54.336118,1,0.0000,0.0000,0.0000,nan,0.008,16 17285,148.710938,2.836105,235.050801,41.328739,1,0.3073,0.3057,0.0484,41.0025,0.031,90 17366,349.285706,-62.884678,319.786163,-51.046461,1,0.2387,0.2024,0.0247,39.9853,0.018,90 17370,0.949367,-45.586655,326.991548,-69.251686,1,0.3138,0.3391,0.4176,41.2636,0.013,62 17515,52.207031,-28.630989,224.800211,-55.343637,1,0.3577,0.3487,0.0073,41.3345,0.009,90 18029,359.415588,-46.768478,327.729895,-67.686097,1,0.3525,0.3609,0.0112,41.4219,0.009,90 18507,352.711273,-63.823658,316.922299,-51.059403,1,0.3755,0.3457,0.0230,41.3125,0.024,88 18556,51.855469,-26.276812,220.627031,-55.293792,1,0.0000,0.0000,0.0000,nan,0.014,6 18645,358.636353,-46.768478,328.890146,-67.388837,1,0.1640,2.3025,1.1022,46.3306,0.008,62 18706,34.277344,-5.679190,170.314930,-60.410322,1,0.1706,0.1766,0.0158,39.6556,0.020,62 18937,348.595886,-63.072620,320.023289,-50.713060,1,0.2142,0.2222,0.0102,40.2123,0.021,90 18952,151.699219,3.583322,236.533224,44.205648,1,0.2800,0.2658,1.1944,40.6541,0.016,90 19154,351.382965,-64.011238,317.574052,-50.604657,1,0.2354,2.4138,0.5022,46.4553,0.023,67 19213,1.753247,-46.768478,324.030235,-68.498041,1,0.1254,0.1484,0.0086,39.2403,0.014,62 19866,359.814819,-44.399834,330.775011,-69.801007,1,0.2608,0.2877,0.0235,40.8505,0.009,90 
20567,351.259003,-64.386185,317.344860,-50.255113,1,0.1549,0.1481,0.2206,39.2350,0.020,62 20934,348.908447,-63.823658,319.169886,-50.176186,1,0.0999,2.5704,1.2137,46.6209,0.018,42 21335,33.574219,-5.379379,168.838090,-60.637536,1,0.1542,0.2082,0.4220,40.0542,0.017,90 22184,358.312500,-44.993881,332.185785,-68.685906,1,0.3508,0.3850,0.6064,41.5869,0.009,90 22574,150.996094,2.985506,236.647967,43.287350,1,0.0000,0.0000,0.0000,nan,0.020,16 22901,151.171875,1.342993,238.602520,42.464379,1,0.2581,0.2502,0.0061,40.5039,0.026,90 23116,53.261719,-27.615883,223.280041,-54.281374,1,0.8237,0.7520,0.0300,43.3376,0.006,15 23127,149.414062,3.433834,234.919132,42.245550,1,0.3221,0.4025,0.7933,41.7004,0.027,52 23299,33.222656,-4.780192,167.515653,-60.396584,1,0.5869,0.5400,0.0151,42.4624,0.018,88 23373,150.117188,3.732834,235.120533,42.993809,1,0.5442,0.5636,0.2043,42.5744,0.020,88 23396,359.811707,-45.191612,329.485675,-69.150905,1,0.5667,0.6192,0.1193,42.8220,0.010,90 23409,348.595886,-63.072620,320.023289,-50.713060,1,0.1407,0.1392,0.0136,39.0882,0.021,52 23539,34.277344,-5.079716,169.526841,-59.956640,1,0.4550,0.2524,0.3112,40.5254,0.019,95 23795,51.855469,-26.276812,220.627031,-55.293792,1,0.0000,0.0000,0.0000,nan,0.014,65 23822,2.457983,-45.389202,324.632685,-69.945696,1,0.2411,0.2420,0.9270,40.4218,0.011,52 23848,33.925781,-5.979157,170.179895,-60.866303,1,0.3316,0.3185,1.0181,41.1057,0.022,90 23857,151.699219,3.583322,236.533224,44.205648,1,0.2988,0.4769,0.0894,42.1379,0.016,90 23931,32.695312,-4.929937,166.868469,-60.841230,1,0.6282,0.6337,0.0073,42.8832,0.018,88 24193,152.050781,2.985506,237.495952,44.143927,1,2.0958,1.3937,0.2518,44.9919,0.019,88 24236,346.655182,-63.260487,320.952196,-50.040935,1,0.0000,0.0000,0.0000,nan,0.019,65 24592,349.966217,-62.696659,319.542989,-51.376556,1,0.2901,0.2846,0.0249,40.8234,0.021,90 24849,53.085938,-27.111860,222.384291,-54.355086,1,0.0000,0.0000,0.0000,nan,0.007,16 
24903,52.031250,-26.443335,220.963669,-55.168557,1,0.0000,0.0000,0.0000,nan,0.014,65 24947,150.117188,2.238686,236.784618,42.139082,1,0.4723,0.4521,0.0193,41.9998,0.016,90 24989,34.804688,-5.829153,171.307861,-60.174401,1,0.4468,0.4763,0.0117,42.1349,0.023,90 25003,359.814819,-44.399834,330.775011,-69.801007,1,0.3137,0.2996,0.0218,40.9523,0.009,90 25039,346.562500,-63.448284,320.824720,-49.866957,1,0.3161,0.2675,1.1577,40.6696,0.021,90 25474,151.523438,3.134927,236.900695,43.803170,1,0.5236,0.5626,0.0155,42.5697,0.019,90 25529,358.312500,-44.993881,332.185785,-68.685906,1,0.2835,0.5789,0.2180,42.6448,0.009,90 25577,348.529419,-61.755440,321.293980,-51.763351,1,0.4028,0.3918,0.0170,41.6314,0.016,90 25783,150.820312,3.134927,236.341348,43.230123,1,0.1040,0.1439,0.0116,39.1669,0.016,42 25920,150.644531,3.583322,235.698235,43.342784,1,0.0000,0.0000,0.0000,nan,0.018,16 25925,35.332031,-5.979157,172.286722,-59.931743,1,1.7327,1.7075,0.1320,45.5358,0.022,88 26161,359.415588,-46.768478,327.729895,-67.686097,1,0.0000,0.0000,0.0000,nan,0.009,92 26338,151.171875,2.537361,237.288526,43.169764,1,0.1892,0.2250,0.0141,40.2436,0.024,62 26352,1.708861,-45.586655,325.688716,-69.520253,1,0.0000,0.0000,0.0000,nan,0.011,65 26401,151.699219,3.583322,236.533224,44.205648,1,0.0000,0.0000,0.0000,nan,0.016,16 26531,351.259003,-64.386185,317.344860,-50.255113,1,2.5314,2.4324,0.2792,46.4755,0.020,88 26660,347.846710,-64.760857,318.929827,-49.143596,1,0.0000,0.0000,0.0000,nan,0.019,65 26783,150.820312,1.641510,237.994507,42.358984,1,0.0000,0.0000,0.0000,nan,0.020,92 27124,351.299988,-62.320400,319.038597,-52.026867,1,0.0000,0.0000,0.0000,nan,0.018,16 27339,51.855469,-26.276812,220.627031,-55.293792,1,0.1432,0.1625,0.0226,39.4561,0.014,90 27941,149.414062,1.940072,236.565366,41.393323,1,0.3632,0.3746,0.0319,41.5166,0.018,90 28220,1.694561,-45.191612,326.278557,-69.858253,1,0.2985,0.3605,0.3149,41.4193,0.011,90 
28301,0.189873,-45.586655,328.254458,-68.969298,1,0.3606,0.2852,1.3620,40.8288,0.007,90 28391,351.953644,-62.132156,318.777388,-52.347124,1,0.0000,0.0000,0.0000,nan,0.019,92 28636,51.855469,-28.630989,224.733260,-55.649872,1,0.1743,0.4412,0.3366,41.9364,0.009,67 28843,151.171875,2.537361,237.288526,43.169764,1,0.3664,0.3611,0.0225,41.4234,0.024,90 28915,53.789062,-27.784405,223.685697,-53.845803,1,0.0000,0.0000,0.0000,nan,0.009,16 29088,52.558594,-27.279613,222.538937,-54.845107,1,0.3037,0.3244,0.0203,41.1521,0.008,90 29252,51.855469,-28.630989,224.733260,-55.649872,1,0.1439,0.1421,0.0233,39.1376,0.009,42 29416,1.694561,-45.191612,326.278557,-69.858253,1,0.2168,0.1921,0.0349,39.8588,0.011,90 29420,2.097458,-45.783966,324.737840,-69.478613,1,0.5849,0.5559,0.0102,42.5385,0.011,90 29576,346.655182,-63.260487,320.952196,-50.040935,1,0.2362,2.5224,1.0484,46.5713,0.019,90 29668,151.699219,3.583322,236.533224,44.205648,1,0.1461,0.1584,0.0175,39.3960,0.016,42 29670,1.694561,-45.191612,326.278557,-69.858253,1,0.1135,0.1208,0.0198,38.7544,0.011,62 30066,351.259003,-64.386185,317.344860,-50.255113,1,0.0000,0.0000,0.0000,nan,0.020,65 30172,33.574219,-5.379379,168.838090,-60.637536,1,0.5444,0.5455,0.0094,42.4889,0.017,90 30191,150.117188,2.238686,236.784618,42.139082,1,1.5405,1.3073,0.1521,44.8201,0.016,88 30505,151.171875,2.238686,237.619933,42.994783,1,0.0000,0.0000,0.0000,nan,0.024,16 30545,2.071130,-45.191612,325.606223,-69.989264,1,0.2160,0.2221,0.0338,40.2113,0.011,90 30576,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,65 30673,349.966217,-62.696659,319.542989,-51.376556,1,0.0000,0.0000,0.0000,nan,0.021,16 30895,349.429535,-62.508568,320.039643,-51.393745,1,0.0000,0.0000,0.0000,nan,0.020,16 31033,349.891296,-64.573555,317.972107,-49.786192,1,0.0000,0.0000,0.0000,nan,0.023,65 31100,1.694561,-45.191612,326.278557,-69.858253,1,0.3614,0.3471,1.2589,41.3231,0.011,90 31310,149.238281,3.882372,234.283829,42.351155,1,0.0000,0.0000,0.0000,nan,0.033,92 
31569,346.655182,-63.260487,320.952196,-50.040935,1,1.0133,1.0031,0.0118,44.1084,0.019,95 31605,150.292969,2.686724,236.427488,42.541447,1,0.0000,0.0000,0.0000,nan,0.016,65 31824,352.398651,-62.696659,318.017427,-51.967966,1,0.1838,0.0844,0.3311,37.9246,0.020,42 32238,34.101562,-5.829153,170.247753,-60.638325,1,0.0000,0.0000,0.0000,nan,0.019,16 32309,34.804688,-5.829153,171.307861,-60.174401,1,0.2251,0.2258,0.9011,40.2521,0.023,42 32375,53.964844,-28.630989,225.142950,-53.813613,1,0.0000,0.0000,0.0000,nan,0.009,65 32695,358.636353,-46.768478,328.890146,-67.388837,1,0.7689,0.7806,0.0113,43.4371,0.008,90 33088,351.259003,-64.386185,317.344860,-50.255113,1,0.3437,0.3495,0.0200,41.3400,0.020,90 33179,51.855469,-27.953188,223.543603,-55.561470,1,0.4407,0.4765,0.4079,42.1357,0.008,90 33191,151.171875,2.238686,237.619933,42.994783,1,0.4030,0.4039,0.0174,41.7094,0.024,42 33409,33.222656,-4.780192,167.515653,-60.396584,1,0.0000,0.0000,0.0000,nan,0.018,65 33419,150.820312,3.732834,235.666318,43.572109,1,0.7462,0.7461,0.0356,43.3165,0.016,90 33422,33.574219,-6.579593,170.455585,-61.548219,1,1.1111,1.1054,0.0101,44.3693,0.021,88 34012,35.683594,-5.379379,171.992947,-59.253501,1,0.0853,0.0793,0.0210,37.7805,0.020,52 34166,0.189873,-45.586655,328.254458,-68.969298,1,0.0873,0.0909,0.0246,38.0959,0.007,42 34243,34.101562,-5.829153,170.247753,-60.638325,1,0.1416,0.5642,0.3631,42.5771,0.019,88 34299,346.276581,-64.011238,320.448031,-49.344136,1,0.1901,0.2016,0.0086,39.9759,0.019,62 34437,152.050781,2.985506,237.495952,44.143927,1,0.2657,0.2629,1.0604,40.6263,0.019,67 35197,51.679688,-27.447618,222.618229,-55.642263,1,0.2509,0.2366,0.1269,40.3670,0.010,42 35315,150.468750,3.732834,235.392208,43.283244,1,1.8476,1.5239,0.2256,45.2314,0.020,95 35555,359.805206,-46.768478,327.135979,-67.829903,1,0.0000,0.0000,0.0000,nan,0.011,65 35743,34.277344,-5.679190,170.314930,-60.410322,1,0.0781,0.0752,0.0197,37.6598,0.020,42 
35772,150.117188,2.836105,236.124718,42.483719,1,0.2385,0.2588,0.0217,40.5879,0.016,90 35855,0.929752,-44.597992,328.531426,-70.083244,1,0.3815,0.4401,0.5094,41.9300,0.011,90 36085,352.398651,-62.696659,318.017427,-51.967966,1,0.1689,0.1759,0.5357,39.6465,0.020,42 36153,150.468750,1.641510,237.714575,42.075234,1,0.2547,0.2589,0.0256,40.5887,0.017,52 36337,52.558594,-27.279613,222.538937,-54.845107,1,0.0000,0.0000,0.0000,nan,0.008,65 36362,53.085938,-27.784405,223.525509,-54.460748,1,0.0000,0.0000,0.0000,nan,0.007,65 36671,149.589844,3.583322,234.885369,42.474696,1,0.0000,0.0000,0.0000,nan,0.024,65 36783,349.966217,-62.696659,319.542989,-51.376556,1,0.1287,0.1431,0.0129,39.1539,0.021,90 37149,359.816315,-44.003082,331.451340,-70.123054,1,0.9435,0.9017,0.0524,43.8228,0.013,90 37168,53.613281,-27.953188,223.929533,-54.024772,1,0.0000,0.0000,0.0000,nan,0.007,65 37661,32.871094,-4.780192,166.959493,-60.615132,1,0.1226,0.0973,0.0168,38.2528,0.017,52 37776,346.655182,-63.260487,320.952196,-50.040935,1,0.0000,0.0000,0.0000,nan,0.019,16 37865,151.171875,2.238686,237.619933,42.994783,1,0.2263,0.2221,0.0172,40.2112,0.024,90 37872,150.820312,3.134927,236.341348,43.230123,1,0.2517,0.2448,0.0217,40.4506,0.016,67 38174,1.694561,-45.191612,326.278557,-69.858253,1,1.6152,1.7388,0.1564,45.5843,0.011,88 38205,33.750000,-4.630479,168.146242,-59.949072,1,0.2945,0.2311,1.2272,40.3089,0.019,42 38244,346.655182,-63.260487,320.952196,-50.040935,1,0.0000,0.0000,0.0000,nan,0.019,65 38690,33.222656,-4.780192,167.515653,-60.396584,1,0.1801,0.1274,0.0307,38.8795,0.018,90 38730,53.261719,-27.615883,223.280041,-54.281374,1,0.0000,0.0000,0.0000,nan,0.006,16 38754,33.574219,-6.579593,170.455585,-61.548219,1,0.2646,0.2656,0.0093,40.6515,0.021,90 38899,1.666667,-44.399834,327.519190,-70.529554,1,0.4828,0.4754,0.0332,42.1297,0.009,90 39223,150.996094,2.388015,237.313912,42.939977,1,0.0000,0.0000,0.0000,nan,0.021,65 
39305,346.562500,-63.448284,320.824720,-49.866957,1,0.4045,0.2986,0.1602,40.9435,0.021,90 39398,51.679688,-27.447618,222.618229,-55.642263,1,0.3347,0.5555,0.5802,42.5365,0.010,90 39597,53.085938,-28.122234,224.100909,-54.509752,1,0.1280,0.1327,0.0064,38.9763,0.007,62 39626,149.414062,2.238686,236.239766,41.565558,1,0.5197,0.4293,0.7344,41.8662,0.017,90 39846,351.382965,-64.011238,317.574052,-50.604657,1,0.1886,0.2780,0.6915,40.7654,0.023,62 40290,35.859375,-4.630479,171.270769,-58.580806,1,0.3153,0.5118,0.8469,42.3221,0.022,42 41515,358.648071,-46.375080,329.462659,-67.716008,1,0.5720,0.5797,0.0188,42.6484,0.009,90 41738,150.117188,3.732834,235.120533,42.993809,1,0.1206,0.1277,0.0222,38.8865,0.020,42 42118,0.574468,-45.981140,327.041068,-68.778764,1,0.1801,0.1977,0.0131,39.9287,0.006,62 42224,51.328125,-27.784405,223.130589,-55.999499,1,0.1119,0.0888,0.1482,38.0396,0.013,42 42288,359.415588,-46.768478,327.729895,-67.686097,1,0.3487,0.3864,0.0241,41.5962,0.009,90 42333,346.562500,-63.448284,320.824720,-49.866957,1,0.1921,0.2046,0.0101,40.0116,0.021,67 42469,2.071130,-45.191612,325.606223,-69.989264,1,1.5989,1.4913,0.1216,45.1735,0.011,95 42689,346.562500,-63.448284,320.824720,-49.866957,1,0.0000,0.0000,0.0000,nan,0.021,65 42776,152.050781,3.284369,237.157374,44.318466,1,0.0000,0.0000,0.0000,nan,0.019,16 42852,351.321442,-64.198746,317.458993,-50.429931,1,0.6771,0.6680,0.0223,43.0226,0.023,88 43028,51.679688,-27.447618,222.618229,-55.642263,1,0.1366,0.1364,0.0092,39.0408,0.010,42 43151,34.980469,-6.279288,172.180075,-60.389399,1,0.1096,0.1352,0.0222,39.0199,0.023,52 43211,34.980469,-6.279288,172.180075,-60.389399,1,0.3321,0.3275,0.0164,41.1762,0.023,90 43337,51.328125,-27.447618,222.535046,-55.950727,1,0.1775,0.2488,0.0180,40.4902,0.013,90 43413,348.595886,-63.072620,320.023289,-50.713060,1,0.0000,0.0000,0.0000,nan,0.021,16 43509,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,92 
43812,150.820312,3.134927,236.341348,43.230123,1,0.2783,0.2821,0.0219,40.8022,0.016,90 43962,347.861847,-61.943836,321.519104,-51.424048,1,0.6595,0.6813,0.0340,43.0750,0.017,90 44102,152.050781,3.284369,237.157374,44.318466,1,0.2450,0.2640,0.2125,40.6366,0.019,42 44217,51.855469,-27.953188,223.543603,-55.561470,1,0.4288,0.1772,0.6584,39.6645,0.008,90 44309,34.980469,-6.279288,172.180075,-60.389399,1,0.3131,0.3059,1.2366,41.0045,0.023,90 44480,53.964844,-28.630989,225.142950,-53.813613,1,0.0000,0.0000,0.0000,nan,0.009,16 44836,151.171875,1.342993,238.602520,42.464379,1,0.0000,0.0000,0.0000,nan,0.026,65 45060,346.276581,-64.011238,320.448031,-49.344136,1,0.3613,0.3300,0.1387,41.1948,0.019,62 45115,2.097458,-45.783966,324.737840,-69.478613,1,0.0000,0.0000,0.0000,nan,0.011,65 45127,35.859375,-4.630479,171.270769,-58.580806,1,0.1378,0.1359,0.0125,39.0331,0.022,90 45203,150.820312,3.134927,236.341348,43.230123,1,0.0000,0.0000,0.0000,nan,0.016,16 45319,348.595886,-63.072620,320.023289,-50.713060,1,0.1270,0.0737,0.0272,37.6138,0.021,42 45349,32.695312,-4.929937,166.868469,-60.841230,1,0.2821,2.6404,0.9837,46.6916,0.018,67 45549,52.207031,-28.291550,224.208534,-55.300157,1,0.6733,0.7639,0.0513,43.3795,0.007,42 46210,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,65 46567,149.414062,3.433834,234.919132,42.245550,1,0.0000,0.0000,0.0000,nan,0.027,16 46804,352.398651,-62.696659,318.017427,-51.967966,1,0.2933,2.3248,0.6338,46.3561,0.020,62 46958,150.820312,3.732834,235.666318,43.572109,1,0.0000,0.0000,0.0000,nan,0.016,65 47148,53.085938,-27.784405,223.525509,-54.460748,1,0.3153,0.3234,0.0167,41.1439,0.007,90 47725,351.259003,-64.386185,317.344860,-50.255113,1,0.3615,0.2950,0.7526,40.9133,0.020,90 48187,52.207031,-26.610098,221.298836,-55.042928,1,0.5466,0.5379,0.1063,42.4521,0.014,90 48260,52.207031,-28.291550,224.208534,-55.300157,1,0.1677,0.1746,0.4651,39.6283,0.007,42 
48426,351.734680,-62.884678,318.284128,-51.651217,1,0.0000,0.0000,0.0000,nan,0.019,16 48473,51.855469,-27.953188,223.543603,-55.561470,1,0.0809,0.0825,0.0165,37.8722,0.008,42 48575,32.695312,-4.929937,166.868469,-60.841230,1,0.7796,0.7698,0.0437,43.4000,0.018,90 48687,347.812500,-63.448284,320.128971,-50.202348,1,0.7916,0.8371,0.0551,43.6237,0.021,88 48725,53.613281,-28.630989,225.073365,-54.119461,1,0.3658,0.3599,0.1804,41.4145,0.006,67 48749,348.529419,-61.755440,321.293980,-51.763351,1,1.6645,1.5782,0.0657,45.3252,0.016,88 48817,348.586945,-64.573555,318.693903,-49.477869,1,0.0962,0.1446,0.6309,39.1788,0.018,90 48981,358.648071,-46.375080,329.462659,-67.716008,1,0.0000,0.0000,0.0000,nan,0.009,16 49219,33.398438,-3.732834,166.492280,-59.466614,1,0.0000,0.0000,0.0000,nan,0.022,65 49389,349.285706,-62.884678,319.786163,-51.046461,1,0.0000,0.0000,0.0000,nan,0.018,92 49529,53.613281,-27.953188,223.929533,-54.024772,1,0.1974,0.2117,0.0123,40.0947,0.007,62 49783,51.328125,-27.784405,223.130589,-55.999499,1,0.0000,0.0000,0.0000,nan,0.013,65 49937,151.347656,3.583322,236.252362,43.918627,1,0.0947,0.1091,0.0218,38.5175,0.015,52 50277,349.046051,-61.943836,320.796530,-51.753706,1,0.3009,0.2478,1.3214,40.4807,0.017,67 50395,349.891296,-64.573555,317.972107,-49.786192,1,0.1552,0.1574,0.3022,39.3801,0.023,90 51178,52.558594,-27.279613,222.538937,-54.845107,1,0.9668,0.9534,0.0188,43.9721,0.008,88 51279,1.708861,-45.586655,325.688716,-69.520253,1,0.1760,0.1824,0.3979,39.7343,0.011,90 51318,34.277344,-5.679190,170.314930,-60.410322,1,0.2987,0.3038,0.0273,40.9868,0.020,62 51490,0.574468,-45.981140,327.041068,-68.778764,1,0.3707,0.4717,0.3810,42.1099,0.006,90 51987,352.711273,-63.823658,316.922299,-51.059403,1,0.0000,0.0000,0.0000,nan,0.024,92 52150,52.207031,-26.610098,221.298836,-55.042928,1,0.2746,0.3069,0.0656,41.0122,0.014,90 52320,52.910156,-25.944481,220.366350,-54.301439,1,0.1656,2.0324,0.6458,46.0000,0.010,88 
52370,352.711273,-63.823658,316.922299,-51.059403,1,0.0000,0.0000,0.0000,nan,0.024,16 52425,52.910156,-26.276812,220.926149,-54.363918,1,0.1587,0.1182,0.6266,38.7042,0.008,42 52740,53.613281,-27.953188,223.929533,-54.024772,1,0.3653,0.4721,0.8955,42.1118,0.007,90 52854,149.414062,1.940072,236.565366,41.393323,1,0.4383,0.4343,0.0335,41.8957,0.018,90 53025,358.636353,-46.768478,328.890146,-67.388837,1,0.0000,0.0000,0.0000,nan,0.008,65 53249,349.285706,-62.884678,319.786163,-51.046461,1,0.3941,0.4211,0.4203,41.8166,0.018,90 53354,34.980469,-6.279288,172.180075,-60.389399,1,0.2231,0.2134,0.0125,40.1138,0.023,62 53525,349.615387,-63.636005,318.927246,-50.506542,1,0.0000,0.0000,0.0000,nan,0.018,16 53574,0.574468,-45.981140,327.041068,-68.778764,1,0.4176,0.4427,0.0420,41.9454,0.006,90 53782,53.261719,-27.615883,223.280041,-54.281374,1,0.3798,0.3714,0.0189,41.4948,0.006,90 53938,53.437500,-29.142223,225.908120,-54.336118,1,0.0000,0.0000,0.0000,nan,0.008,16 54416,347.812500,-63.448284,320.128971,-50.202348,1,0.3708,0.3734,0.0162,41.5087,0.021,42 54883,347.617462,-62.508568,321.121462,-50.904708,1,0.7844,0.8578,0.0392,43.6891,0.019,88 54915,148.886719,2.686724,235.347248,41.389003,1,0.0000,0.0000,0.0000,nan,0.028,65 55002,52.207031,-28.291550,224.208534,-55.300157,1,0.4345,0.4175,0.0286,41.7946,0.007,90 55018,350.230255,-61.943836,320.053946,-52.070537,1,0.0000,0.0000,0.0000,nan,0.017,65 55033,151.171875,1.342993,238.602520,42.464379,1,0.4920,0.5345,0.0204,42.4355,0.026,42 55060,51.855469,-28.630989,224.733260,-55.649872,1,0.0234,0.0824,0.0202,37.8680,0.009,42 55141,2.097458,-45.783966,324.737840,-69.478613,1,0.0000,0.0000,0.0000,nan,0.011,65 55155,34.101562,-5.829153,170.247753,-60.638325,1,0.0000,0.0000,0.0000,nan,0.019,65 55354,350.230255,-61.943836,320.053946,-52.070537,1,0.1958,2.3870,0.2904,46.4258,0.017,90 55419,150.820312,3.732834,235.666318,43.572109,1,0.0967,0.1092,0.0137,38.5190,0.016,62 
55946,347.617462,-62.508568,321.121462,-50.904708,1,0.0000,0.0000,0.0000,nan,0.019,65 56053,150.292969,2.686724,236.427488,42.541447,1,0.8507,0.8283,0.0406,43.5954,0.016,90 56245,1.708861,-45.586655,325.688716,-69.520253,1,0.0000,0.0000,0.0000,nan,0.011,16 56334,358.312500,-44.993881,332.185785,-68.685906,1,0.1955,0.1948,0.0100,39.8925,0.009,62 56349,1.694561,-45.191612,326.278557,-69.858253,1,0.1646,0.2000,0.1961,39.9565,0.011,52 56461,347.812500,-63.448284,320.128971,-50.202348,1,0.1760,2.4036,1.0097,46.4440,0.021,90 56769,33.574219,-5.079716,168.448505,-60.407218,1,0.0000,0.0000,0.0000,nan,0.016,65 56821,52.207031,-26.610098,221.298836,-55.042928,1,0.0775,0.0884,0.0181,38.0309,0.014,62 56893,150.117188,2.836105,236.124718,42.483719,1,0.0000,0.0000,0.0000,nan,0.016,65 56987,33.574219,-6.579593,170.455585,-61.548219,1,0.2910,0.3152,0.0132,41.0794,0.021,62 57205,359.816315,-44.003082,331.451340,-70.123054,1,0.5891,0.6057,0.0306,42.7639,0.013,90 57237,52.207031,-26.610098,221.298836,-55.042928,1,0.2544,0.3807,0.6581,41.5580,0.014,42 57263,351.299988,-62.320400,319.038597,-52.026867,1,0.0000,0.0000,0.0000,nan,0.018,65 57561,152.050781,3.284369,237.157374,44.318466,1,0.0000,0.0000,0.0000,nan,0.019,65 57666,1.666667,-44.399834,327.519190,-70.529554,1,0.4399,0.4648,0.0183,42.0716,0.009,90 57784,348.529419,-61.755440,321.293980,-51.763351,1,0.0000,0.0000,0.0000,nan,0.016,65 58174,348.529419,-61.755440,321.293980,-51.763351,1,1.1032,1.1018,0.1226,44.3607,0.016,42 58265,0.190678,-45.783966,327.956322,-68.803772,1,0.0650,0.1027,1.1126,38.3778,0.005,67 58323,349.615387,-63.636005,318.927246,-50.506542,1,1.0155,1.0187,0.0471,44.1500,0.018,88 59068,34.277344,-5.079716,169.526841,-59.956640,1,0.5716,0.5429,0.1404,42.4762,0.019,42 59128,348.529419,-61.755440,321.293980,-51.763351,1,0.6794,0.4288,0.2152,41.8629,0.016,90 59163,150.644531,3.583322,235.698235,43.342784,1,0.0000,0.0000,0.0000,nan,0.018,65 
59427,151.171875,1.342993,238.602520,42.464379,1,0.0000,0.0000,0.0000,nan,0.026,65 59463,52.031250,-26.443335,220.963669,-55.168557,1,0.1439,0.1438,0.0169,39.1654,0.014,42 59580,1.753247,-46.768478,324.030235,-68.498041,1,0.4178,0.5020,0.7485,42.2716,0.014,42 59644,34.980469,-6.279288,172.180075,-60.389399,1,0.4238,0.3347,0.1863,41.2308,0.023,90 59732,347.812500,-63.448284,320.128971,-50.202348,1,0.2489,2.7125,0.9243,46.7623,0.021,90 60023,359.814819,-44.399834,330.775011,-69.801007,1,0.6857,0.6858,0.0077,43.0925,0.009,90 60098,53.261719,-27.615883,223.280041,-54.281374,1,0.2529,0.2478,0.0697,40.4804,0.006,90 60340,148.710938,2.836105,235.050801,41.328739,1,0.3521,0.3472,0.0096,41.3237,0.031,67 60350,52.558594,-27.279613,222.538937,-54.845107,1,0.5295,0.5982,0.1284,42.7310,0.008,62 60376,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,16 60407,346.130127,-63.072620,321.423103,-50.042305,1,0.0000,0.0000,0.0000,nan,0.020,65 60554,347.013428,-62.508568,321.472056,-50.735330,1,0.2356,0.2869,0.0805,40.8438,0.018,67 60742,346.562500,-63.448284,320.824720,-49.866957,1,0.0000,0.0000,0.0000,nan,0.021,65 60976,358.648071,-46.375080,329.462659,-67.716008,1,0.0615,0.0821,0.0078,37.8606,0.009,52 61101,151.171875,2.537361,237.288526,43.169764,1,0.0000,0.0000,0.0000,nan,0.024,16 61165,53.789062,-27.784405,223.685697,-53.845803,1,0.1089,0.1122,0.0140,38.5832,0.009,62 61407,53.613281,-27.953188,223.929533,-54.024772,1,0.0000,0.0000,0.0000,nan,0.007,92 61763,52.207031,-28.291550,224.208534,-55.300157,1,0.5749,0.5237,0.0263,42.3819,0.007,90 62078,0.965665,-46.375080,325.845907,-68.579427,1,0.0000,0.0000,0.0000,nan,0.007,65 62187,33.398438,-3.732834,166.492280,-59.466614,1,0.1391,0.1301,1.1896,38.9300,0.022,64 62230,53.085938,-27.784405,223.525509,-54.460748,1,0.0000,0.0000,0.0000,nan,0.007,16 62253,51.328125,-27.447618,222.535046,-55.950727,1,0.7671,0.7610,0.0367,43.3693,0.013,90 
62254,53.085938,-27.111860,222.384291,-54.355086,1,0.2738,0.2895,0.0180,40.8666,0.007,90 62384,351.321442,-64.198746,317.458993,-50.429931,1,0.0000,0.0000,0.0000,nan,0.023,65 62541,347.013428,-62.508568,321.472056,-50.735330,1,0.5490,0.3280,0.8012,41.1797,0.018,90 62908,150.820312,3.134927,236.341348,43.230123,1,0.2372,0.2111,0.0189,40.0878,0.016,62 63561,359.814819,-44.399834,330.775011,-69.801007,1,0.7386,0.7247,0.0129,43.2390,0.009,90 63718,0.965665,-46.375080,325.845907,-68.579427,1,0.2891,0.4200,0.0613,41.8100,0.007,90 63860,351.382965,-64.011238,317.574052,-50.604657,1,0.1617,0.1988,0.0128,39.9412,0.023,42 64248,2.097458,-45.783966,324.737840,-69.478613,1,0.1653,0.1710,0.0196,39.5787,0.011,52 64485,348.586945,-64.573555,318.693903,-49.477869,1,0.0000,0.0000,0.0000,nan,0.018,16 64854,1.723404,-45.981140,325.117958,-69.180825,1,0.1163,0.0705,0.0098,37.5117,0.010,62 64888,52.031250,-26.443335,220.963669,-55.168557,1,0.3802,0.4086,0.7097,41.7390,0.014,90 64896,347.846710,-64.760857,318.929827,-49.143596,1,0.0991,0.1224,1.1874,38.7854,0.019,42 64911,150.820312,1.641510,237.994507,42.358984,1,0.2375,2.2621,1.0009,46.2837,0.020,90 65745,53.085938,-28.122234,224.100909,-54.509752,1,0.0680,0.0552,1.1581,36.9555,0.007,90 65749,33.398438,-4.331149,167.226341,-59.936551,1,0.0000,0.0000,0.0000,nan,0.018,65 65877,349.891296,-64.573555,317.972107,-49.786192,1,0.0000,0.0000,0.0000,nan,0.023,65 66126,348.908447,-63.823658,319.169886,-50.176186,1,0.0601,0.0718,0.0163,37.5531,0.018,42 66325,53.261719,-27.615883,223.280041,-54.281374,1,0.0000,0.0000,0.0000,nan,0.006,65 66536,150.996094,2.388015,237.313912,42.939977,1,0.3144,0.3150,0.2975,41.0779,0.021,90 66548,347.617462,-62.508568,321.121462,-50.904708,1,0.3713,0.4282,1.3857,41.8598,0.019,90 66852,2.071130,-45.191612,325.606223,-69.989264,1,0.5802,0.5679,0.0103,42.5945,0.011,42 66904,349.285706,-62.884678,319.786163,-51.046461,1,0.0000,0.0000,0.0000,nan,0.018,16 
66967,150.996094,2.388015,237.313912,42.939977,1,0.4580,0.4551,0.0163,42.0168,0.021,88 66999,33.222656,-4.780192,167.515653,-60.396584,1,0.3635,0.4147,0.0332,41.7770,0.018,90 67245,150.292969,2.686724,236.427488,42.541447,1,0.0482,0.0598,0.0147,37.1379,0.016,42 67480,1.753247,-46.768478,324.030235,-68.498041,1,0.0000,0.0000,0.0000,nan,0.014,65 67514,34.101562,-5.829153,170.247753,-60.638325,1,0.2756,0.5256,0.1081,42.3913,0.019,90 67686,359.415588,-46.768478,327.729895,-67.686097,1,0.1785,0.2040,0.0109,40.0042,0.009,90 67730,52.910156,-25.944481,220.366350,-54.301439,1,0.3927,0.3926,0.0321,41.6368,0.010,90 67898,347.812500,-63.448284,320.128971,-50.202348,1,0.3824,0.3802,0.0081,41.5546,0.021,42 67981,151.523438,3.134927,236.900695,43.803170,1,0.0000,0.0000,0.0000,nan,0.019,65 68003,150.820312,3.732834,235.666318,43.572109,1,0.4201,0.4099,0.0162,41.7474,0.016,90 68276,33.398438,-3.732834,166.492280,-59.466614,1,0.0251,0.0342,0.0167,35.8818,0.022,42 68298,349.429535,-62.508568,320.039643,-51.393745,1,0.6264,0.6544,0.1479,42.9683,0.020,90 68667,348.595886,-63.072620,320.023289,-50.713060,1,0.0000,0.0000,0.0000,nan,0.021,16 68835,348.908447,-63.823658,319.169886,-50.176186,1,0.0177,0.0504,0.0318,36.7530,0.018,42 68886,2.457983,-45.389202,324.632685,-69.945696,1,0.3991,0.4463,0.3629,41.9666,0.011,52 69271,148.710938,2.836105,235.050801,41.328739,1,0.4269,0.4491,0.8217,41.9826,0.031,90 69490,51.855469,-26.276812,220.627031,-55.293792,1,0.2919,0.4825,0.1693,42.1687,0.014,42 69767,1.694561,-45.191612,326.278557,-69.858253,1,0.1603,2.4521,1.2066,46.4968,0.011,62 70046,351.321442,-64.198746,317.458993,-50.429931,1,0.3441,0.3129,0.0280,41.0610,0.023,90 70135,0.949367,-45.586655,326.991548,-69.251686,1,0.4500,0.4728,0.0167,42.1160,0.013,42 70171,53.789062,-27.784405,223.685697,-53.845803,1,0.4708,0.5059,0.0358,42.2916,0.009,90 70272,34.277344,-5.079716,169.526841,-59.956640,1,0.3088,0.3314,0.0357,41.2058,0.019,88 
70276,151.171875,1.342993,238.602520,42.464379,1,0.8102,0.7948,0.0774,43.4853,0.026,88 70430,150.820312,1.641510,237.994507,42.358984,1,0.0451,0.0858,0.0255,37.9624,0.020,42 70571,149.414062,1.940072,236.565366,41.393323,1,0.3902,0.4137,1.1078,41.7706,0.018,90 70816,349.429535,-62.508568,320.039643,-51.393745,1,0.0000,0.0000,0.0000,nan,0.020,65 70898,151.171875,2.238686,237.619933,42.994783,1,0.5346,0.5478,0.0157,42.4996,0.024,90 70977,51.328125,-27.784405,223.130589,-55.999499,1,0.3428,0.3404,0.0099,41.2737,0.013,90 71068,34.453125,-5.229529,169.987075,-59.956185,1,0.2726,0.2679,1.0038,40.6730,0.019,88 71080,53.964844,-28.630989,225.142950,-53.813613,1,0.0000,0.0000,0.0000,nan,0.009,92 71084,52.558594,-27.279613,222.538937,-54.845107,1,0.1522,0.1472,0.0208,39.2216,0.008,42 71126,53.964844,-28.630989,225.142950,-53.813613,1,0.3021,0.5146,0.5741,42.3363,0.009,62 71438,52.558594,-27.279613,222.538937,-54.845107,1,0.0000,0.0000,0.0000,nan,0.008,65 71676,53.437500,-29.142223,225.908120,-54.336118,1,0.4396,0.4103,0.0209,41.7496,0.008,88 71890,348.595886,-63.072620,320.023289,-50.713060,1,0.7036,0.7024,0.0078,43.1558,0.021,88 71954,1.666667,-44.399834,327.519190,-70.529554,1,0.2483,0.2571,0.6448,40.5711,0.009,90 72053,53.613281,-27.953188,223.929533,-54.024772,1,0.2832,0.2616,0.0190,40.6141,0.007,88 72256,358.636353,-46.768478,328.890146,-67.388837,1,0.0000,0.0000,0.0000,nan,0.008,65 72337,34.277344,-5.079716,169.526841,-59.956640,1,0.2449,0.2107,0.1165,40.0824,0.019,90 72385,150.117188,2.836105,236.124718,42.483719,1,0.3029,0.2983,0.3068,40.9409,0.016,67 72426,51.679688,-27.447618,222.618229,-55.642263,1,0.5166,0.3846,0.3149,41.5839,0.010,90 72428,33.574219,-5.379379,168.838090,-60.637536,1,0.2214,2.4663,0.4396,46.5120,0.017,90 72489,53.613281,-26.944359,222.237403,-53.863858,1,0.8358,0.8312,0.0197,43.6047,0.009,90 72525,34.101562,-5.829153,170.247753,-60.638325,1,0.2407,0.2580,0.0145,40.5802,0.019,90 
72735,151.699219,3.583322,236.533224,44.205648,1,0.2648,0.2313,0.0265,40.3104,0.016,67 73031,34.277344,-5.079716,169.526841,-59.956640,1,0.3151,0.5833,0.2442,42.6648,0.019,52 73236,33.398438,-3.732834,166.492280,-59.466614,1,0.1398,0.5280,0.4528,42.4036,0.022,90 73339,351.299988,-62.320400,319.038597,-52.026867,1,0.5183,0.2010,0.3771,39.9686,0.018,90 73433,349.966217,-62.696659,319.542989,-51.376556,1,0.0000,0.0000,0.0000,nan,0.021,65 73509,34.453125,-5.229529,169.987075,-59.956185,1,0.2953,0.2879,0.0259,40.8522,0.019,42 73610,34.277344,-5.079716,169.526841,-59.956640,1,0.8134,1.5079,0.3508,45.2031,0.019,95 74093,351.259003,-64.386185,317.344860,-50.255113,1,0.3940,0.4643,1.0625,42.0686,0.020,90 75116,349.891296,-64.573555,317.972107,-49.786192,1,0.0000,0.0000,0.0000,nan,0.023,16 75223,351.382965,-64.011238,317.574052,-50.604657,1,0.5652,0.5648,0.0157,42.5800,0.023,90 75562,346.562500,-63.448284,320.824720,-49.866957,1,0.0000,0.0000,0.0000,nan,0.021,65 75598,1.723404,-45.981140,325.117958,-69.180825,1,0.1652,0.1477,0.0151,39.2287,0.010,42 75646,348.529419,-61.755440,321.293980,-51.763351,1,0.2103,0.2089,0.0083,40.0620,0.016,67 75754,151.347656,3.583322,236.252362,43.918627,1,0.0000,0.0000,0.0000,nan,0.015,16 75792,148.886719,2.686724,235.347248,41.389003,1,0.3699,0.3602,0.0451,41.4171,0.028,90 75886,358.636353,-46.768478,328.890146,-67.388837,1,0.3985,0.3735,0.0375,41.5091,0.008,90 75987,54.667969,-27.615883,223.610785,-53.050840,1,0.9014,0.8506,0.0414,43.6664,0.009,88 76242,152.050781,2.985506,237.495952,44.143927,1,0.4916,0.5572,0.3767,42.5443,0.019,90 76304,348.595886,-63.072620,320.023289,-50.713060,1,2.4303,2.6811,1.0262,46.7317,0.021,95 76305,349.891296,-64.573555,317.972107,-49.786192,1,0.0000,0.0000,0.0000,nan,0.023,16 76639,346.500000,-62.320400,321.951129,-50.736054,1,0.5322,0.5077,0.0148,42.3012,0.020,90 77010,34.453125,-5.229529,169.987075,-59.956185,1,0.0000,0.0000,0.0000,nan,0.019,65 
77041,346.276581,-64.011238,320.448031,-49.344136,1,0.3793,0.4070,0.6742,41.7290,0.019,90 77157,51.328125,-27.784405,223.130589,-55.999499,1,0.0000,0.0000,0.0000,nan,0.013,6 77192,151.347656,4.181528,235.568369,44.259942,1,0.3227,0.2764,0.0329,40.7513,0.016,62 77222,151.171875,1.342993,238.602520,42.464379,1,0.4342,0.3228,0.3114,41.1398,0.026,90 77292,34.277344,-5.079716,169.526841,-59.956640,1,0.1250,0.1890,0.2309,39.8194,0.019,62 77306,148.710938,2.836105,235.050801,41.328739,1,0.7167,0.6876,0.0165,43.0996,0.031,90 77340,346.500000,-62.320400,321.951129,-50.736054,1,0.8207,0.8217,0.0401,43.5741,0.020,88 77391,346.130127,-63.072620,321.423103,-50.042305,1,1.3214,1.4667,0.1459,45.1288,0.020,95 77518,53.437500,-29.142223,225.908120,-54.336118,1,0.3531,0.3522,0.0125,41.3602,0.008,67 77623,0.190678,-45.783966,327.956322,-68.803772,1,0.2765,0.2994,0.0135,40.9507,0.005,42 77825,349.046051,-61.943836,320.796530,-51.753706,1,0.1071,0.1065,0.0160,38.4603,0.017,42 77906,359.811707,-45.191612,329.485675,-69.150905,1,0.1265,0.0738,0.0297,37.6154,0.010,90 77952,358.665253,-45.783966,330.353593,-68.203652,1,0.1258,0.0858,0.0192,37.9614,0.009,90 78095,151.699219,3.583322,236.533224,44.205648,1,0.0000,0.0000,0.0000,nan,0.016,65 78233,148.710938,2.836105,235.050801,41.328739,1,0.2391,0.2060,0.0706,40.0283,0.031,90 78677,53.437500,-29.142223,225.908120,-54.336118,1,0.0000,0.0000,0.0000,nan,0.008,65 78702,349.891296,-64.573555,317.972107,-49.786192,1,0.0000,0.0000,0.0000,nan,0.023,92 78705,350.230255,-61.943836,320.053946,-52.070537,1,0.0000,0.0000,0.0000,nan,0.017,92 78727,51.679688,-27.447618,222.618229,-55.642263,1,0.1592,0.1479,0.0203,39.2318,0.010,90 78974,152.050781,3.284369,237.157374,44.318466,1,0.6592,0.6572,0.0085,42.9797,0.019,90 79002,2.097458,-45.783966,324.737840,-69.478613,1,0.4446,0.2938,0.8441,40.9035,0.011,90 79155,1.666667,-44.399834,327.519190,-70.529554,1,0.4097,0.4154,0.0160,41.7815,0.009,42 
79235,52.031250,-26.443335,220.963669,-55.168557,1,0.1639,0.1669,0.0564,39.5206,0.014,42 79428,33.398438,-3.732834,166.492280,-59.466614,1,0.0000,0.0000,0.0000,nan,0.022,65 79515,53.964844,-28.630989,225.142950,-53.813613,1,0.2207,0.2188,1.1526,40.1744,0.009,90 79743,32.695312,-4.929937,166.868469,-60.841230,1,0.6266,0.6062,0.0094,42.7661,0.018,90 79819,150.820312,1.641510,237.994507,42.358984,1,0.1742,0.1725,0.0134,39.5994,0.020,42 79921,352.132874,-63.636005,317.424173,-51.095855,1,0.1638,0.1709,0.0219,39.5766,0.021,42 80155,53.085938,-28.122234,224.100909,-54.509752,1,0.3628,0.3633,0.0418,41.4386,0.007,90 80205,33.925781,-5.979157,170.179895,-60.866303,1,0.8134,0.7508,0.1176,43.3333,0.022,95 80780,152.050781,3.284369,237.157374,44.318466,1,0.1684,0.1902,0.0581,39.8341,0.019,42 80832,150.117188,2.238686,236.784618,42.139082,1,0.3950,0.3665,0.0146,41.4614,0.016,90 80852,151.347656,3.583322,236.252362,43.918627,1,0.2569,0.2659,0.0082,40.6545,0.015,62 80903,52.910156,-26.276812,220.926149,-54.363918,1,0.4140,0.4279,0.0171,41.8576,0.008,90 81000,150.996094,2.985506,236.647967,43.287350,1,0.2989,0.3106,0.0100,41.0422,0.020,42 81252,33.574219,-4.780192,168.064587,-60.175886,1,0.1494,0.1737,0.6985,39.6157,0.019,62 81464,149.414062,3.433834,234.919132,42.245550,1,0.2854,0.2818,0.3207,40.7987,0.027,42 81665,53.789062,-27.784405,223.685697,-53.845803,1,0.0000,0.0000,0.0000,nan,0.009,65 82302,346.276581,-64.011238,320.448031,-49.344136,1,0.0000,0.0000,0.0000,nan,0.019,65 82401,51.855469,-28.630989,224.733260,-55.649872,1,0.0000,0.0000,0.0000,nan,0.009,16 82409,348.595886,-63.072620,320.023289,-50.713060,1,0.1147,2.6274,1.1973,46.6786,0.021,42 82702,35.683594,-5.379379,171.992947,-59.253501,1,0.3194,0.2986,0.0204,40.9437,0.020,90 82740,349.615387,-63.636005,318.927246,-50.506542,1,1.0263,1.0228,0.2611,44.1609,0.018,88 83348,349.429535,-62.508568,320.039643,-51.393745,1,0.0000,0.0000,0.0000,nan,0.020,6 
83410,51.855469,-26.276812,220.627031,-55.293792,1,0.5138,0.5071,0.0093,42.2980,0.014,90 83462,150.820312,3.134927,236.341348,43.230123,1,0.3424,0.3239,0.0179,41.1483,0.016,90 83634,349.285706,-62.884678,319.786163,-51.046461,1,2.1107,1.3813,0.4158,44.9680,0.018,95 83821,359.415588,-46.768478,327.729895,-67.686097,1,0.0000,0.0000,0.0000,nan,0.009,65 83872,149.589844,3.583322,234.885369,42.474696,1,0.2160,0.2054,0.2819,40.0211,0.024,90 83954,346.276581,-64.011238,320.448031,-49.344136,1,0.4390,0.4535,0.0428,42.0079,0.019,90 83961,1.753247,-46.768478,324.030235,-68.498041,1,0.2177,0.2211,0.0151,40.2004,0.014,90 84306,151.171875,2.537361,237.288526,43.169764,1,0.0000,0.0000,0.0000,nan,0.024,16 84716,151.523438,3.134927,236.900695,43.803170,1,0.4303,0.2925,1.0152,40.8923,0.019,90 84758,349.615387,-63.636005,318.927246,-50.506542,1,0.0000,0.0000,0.0000,nan,0.018,65 85125,53.085938,-28.122234,224.100909,-54.509752,1,0.0000,0.0000,0.0000,nan,0.007,65 85470,33.398438,-4.331149,167.226341,-59.936551,1,0.0000,0.0000,0.0000,nan,0.018,65 85490,348.908447,-63.823658,319.169886,-50.176186,1,0.2537,0.5506,0.2880,42.5130,0.018,90 85789,53.437500,-29.142223,225.908120,-54.336118,1,0.2990,0.2893,0.0367,40.8644,0.008,62 86456,33.574219,-5.379379,168.838090,-60.637536,1,0.2127,0.2191,0.0141,40.1785,0.017,42 86487,0.574468,-45.981140,327.041068,-68.778764,1,0.3850,0.3847,0.0333,41.5849,0.006,90 86759,348.529419,-61.755440,321.293980,-51.763351,1,0.0000,0.0000,0.0000,nan,0.016,65 86834,149.589844,3.583322,234.885369,42.474696,1,0.3160,0.4622,0.4742,42.0568,0.024,90 87180,150.820312,1.641510,237.994507,42.358984,1,0.1782,0.1820,0.0103,39.7288,0.020,62 87467,150.820312,3.134927,236.341348,43.230123,1,0.0000,0.0000,0.0000,nan,0.016,65 87498,152.050781,2.985506,237.495952,44.143927,1,0.0000,0.0000,0.0000,nan,0.019,16 87608,358.665253,-45.783966,330.353593,-68.203652,1,0.1894,0.2101,0.0179,40.0763,0.009,90 
87685,347.861847,-61.943836,321.519104,-51.424048,1,0.3743,0.3612,0.0219,41.4237,0.017,90 87703,51.855469,-26.276812,220.627031,-55.293792,1,0.3246,0.5019,0.3014,42.2711,0.014,90 88073,347.013428,-62.508568,321.472056,-50.735330,1,0.0000,0.0000,0.0000,nan,0.018,92 88180,149.414062,1.940072,236.565366,41.393323,1,0.2996,2.6936,1.3051,46.7439,0.018,67 88195,53.964844,-28.630989,225.142950,-53.813613,1,0.3403,0.3448,0.0304,41.3063,0.009,90 88511,53.613281,-28.630989,225.073365,-54.119461,1,0.0000,0.0000,0.0000,nan,0.006,65 88587,352.711273,-63.823658,316.922299,-51.059403,1,0.7839,0.7789,0.0120,43.4314,0.024,90 88600,351.321442,-64.198746,317.458993,-50.429931,1,0.3303,0.3150,0.3295,41.0778,0.023,90 88627,32.871094,-4.780192,166.959493,-60.615132,1,0.3934,0.4406,0.0433,41.9329,0.017,67 88980,351.382965,-64.011238,317.574052,-50.604657,1,0.0000,0.0000,0.0000,nan,0.023,16 89157,348.908447,-63.823658,319.169886,-50.176186,1,0.0000,0.0000,0.0000,nan,0.018,92 89298,359.811707,-45.191612,329.485675,-69.150905,1,0.0000,0.0000,0.0000,nan,0.010,16 89387,346.130127,-63.072620,321.423103,-50.042305,1,0.6210,0.4739,0.0648,42.1217,0.020,90 89455,0.189873,-45.586655,328.254458,-68.969298,1,0.2603,0.2485,0.0142,40.4873,0.007,90 89709,52.910156,-25.944481,220.366350,-54.301439,1,0.0000,0.0000,0.0000,nan,0.010,92 89999,149.238281,3.882372,234.283829,42.351155,1,0.2640,0.3194,0.0576,41.1125,0.033,90 90399,51.328125,-27.784405,223.130589,-55.999499,1,0.0000,0.0000,0.0000,nan,0.013,65 90534,152.050781,3.284369,237.157374,44.318466,1,0.1759,0.1913,0.0131,39.8481,0.019,62 90645,51.855469,-28.630989,224.733260,-55.649872,1,0.4496,0.4486,0.0173,41.9797,0.009,90 90814,348.595886,-63.072620,320.023289,-50.713060,1,0.2252,0.2208,0.0172,40.1971,0.021,62 90892,152.050781,3.284369,237.157374,44.318466,1,0.0322,0.0365,0.0161,36.0320,0.019,52 91219,150.820312,3.134927,236.341348,43.230123,1,0.2921,0.5086,0.1231,42.3058,0.016,90 
91291,352.711273,-63.823658,316.922299,-51.059403,1,0.1826,0.1746,0.0132,39.6288,0.024,90 91335,151.699219,3.583322,236.533224,44.205648,1,1.0655,1.4889,0.2658,45.1692,0.016,88 91337,53.613281,-27.953188,223.929533,-54.024772,1,0.0000,0.0000,0.0000,nan,0.007,65 91460,53.613281,-28.630989,225.073365,-54.119461,1,0.2782,0.5294,0.2771,42.4103,0.006,90 91610,346.130127,-63.072620,321.423103,-50.042305,1,0.2326,0.2302,0.0070,40.2992,0.020,42 91644,349.891296,-64.573555,317.972107,-49.786192,1,0.1893,0.1839,0.1322,39.7535,0.023,90 91917,152.050781,2.985506,237.495952,44.143927,1,0.2448,0.2740,0.0117,40.7289,0.019,90 91988,150.468750,1.641510,237.714575,42.075234,1,0.0000,0.0000,0.0000,nan,0.017,65 92334,350.230255,-61.943836,320.053946,-52.070537,1,0.0000,0.0000,0.0000,nan,0.017,65 92354,51.328125,-27.784405,223.130589,-55.999499,1,0.5449,0.5807,0.0374,42.6530,0.013,88 92566,1.753247,-46.768478,324.030235,-68.498041,1,0.0000,0.0000,0.0000,nan,0.014,16 92577,351.734680,-62.884678,318.284128,-51.651217,1,0.4426,0.3854,0.6831,41.5893,0.019,90 92904,347.812500,-63.448284,320.128971,-50.202348,1,0.2536,0.2814,0.0325,40.7958,0.021,62 92929,348.908447,-63.823658,319.169886,-50.176186,1,0.0000,0.0000,0.0000,nan,0.018,65 93333,151.171875,1.342993,238.602520,42.464379,1,0.4029,0.3217,0.4659,41.1310,0.026,90 93362,51.855469,-28.630989,224.733260,-55.649872,1,0.0000,0.0000,0.0000,nan,0.009,92 93509,51.855469,-26.276812,220.627031,-55.293792,1,0.0587,0.0644,0.0151,37.3066,0.014,42 93663,53.964844,-28.630989,225.142950,-53.813613,1,0.4196,0.4099,0.0367,41.7471,0.009,90 94004,52.910156,-27.953188,223.774083,-54.639214,1,0.2354,0.2257,0.0162,40.2510,0.007,62 94107,34.980469,-6.279288,172.180075,-60.389399,1,0.0000,0.0000,0.0000,nan,0.023,16 94229,52.207031,-28.291550,224.208534,-55.300157,1,0.3853,0.4477,0.0268,41.9745,0.007,90 94613,2.457983,-45.389202,324.632685,-69.945696,1,0.0739,0.0616,0.0188,37.2044,0.011,62 
94704,349.966217,-62.696659,319.542989,-51.376556,1,0.2722,0.2658,0.0080,40.6537,0.021,90 95127,351.299988,-62.320400,319.038597,-52.026867,1,1.8136,1.6691,0.2276,45.4751,0.018,95 95147,149.238281,3.882372,234.283829,42.351155,1,0.0000,0.0000,0.0000,nan,0.033,65 95369,53.613281,-28.630989,225.073365,-54.119461,1,0.1313,0.5834,0.7684,42.6652,0.006,62 95455,351.259003,-64.386185,317.344860,-50.255113,1,0.4942,0.5590,0.1899,42.5528,0.020,90 95483,150.820312,3.732834,235.666318,43.572109,1,0.8512,0.8164,0.0436,43.5568,0.016,88 95508,149.414062,2.238686,236.239766,41.565558,1,0.6107,0.5480,0.0306,42.5009,0.017,88 95566,33.574219,-5.079716,168.448505,-60.407218,1,0.3880,0.4580,0.0992,42.0331,0.016,90 95580,34.101562,-5.829153,170.247753,-60.638325,1,0.4200,0.4311,0.0132,41.8770,0.019,90 95690,351.734680,-62.884678,318.284128,-51.651217,1,0.3442,0.3507,0.0052,41.3494,0.019,42 95741,35.332031,-5.979157,172.286722,-59.931743,1,0.5064,0.5368,0.0209,42.4469,0.022,52 95864,53.085938,-27.111860,222.384291,-54.355086,1,0.0000,0.0000,0.0000,nan,0.007,92 96284,152.050781,3.284369,237.157374,44.318466,1,0.1593,2.4014,0.4125,46.4417,0.019,42 97053,150.117188,3.732834,235.120533,42.993809,1,0.0000,0.0000,0.0000,nan,0.020,65 97406,347.846710,-64.760857,318.929827,-49.143596,1,0.1163,0.0897,0.8592,38.0649,0.019,15 97687,346.655182,-63.260487,320.952196,-50.040935,1,0.4603,0.4626,0.0271,42.0594,0.019,90 97850,351.259003,-64.386185,317.344860,-50.255113,1,0.3090,0.3005,0.0119,40.9599,0.020,90 97920,150.820312,3.134927,236.341348,43.230123,1,0.3588,0.3524,0.0238,41.3611,0.016,67 97957,1.723404,-45.981140,325.117958,-69.180825,1,0.2947,0.2906,0.0093,40.8754,0.010,90 98533,349.615387,-63.636005,318.927246,-50.506542,1,0.0000,0.0000,0.0000,nan,0.018,65 98570,1.708861,-45.586655,325.688716,-69.520253,1,0.6197,0.5296,0.0328,42.4113,0.011,42 98749,33.750000,-4.630479,168.146242,-59.949072,1,0.1473,0.0973,0.6573,38.2516,0.019,67 
99013,350.230255,-61.943836,320.053946,-52.070537,1,0.5255,0.5303,0.0220,42.4150,0.017,90 99050,52.207031,-26.610098,221.298836,-55.042928,1,0.2972,0.3036,0.1492,40.9851,0.014,42 99261,53.613281,-27.953188,223.929533,-54.024772,1,0.1411,0.0857,0.0282,37.9579,0.007,90 99280,359.811707,-45.191612,329.485675,-69.150905,1,0.2037,0.2150,0.0079,40.1324,0.010,62 99293,347.846710,-64.760857,318.929827,-49.143596,1,0.3106,0.3644,0.0251,41.4467,0.019,90 99294,348.529419,-61.755440,321.293980,-51.763351,1,0.5552,0.5204,0.0185,42.3656,0.016,90 99452,352.711273,-63.823658,316.922299,-51.059403,1,0.8420,0.8479,0.0465,43.6581,0.024,88 99642,347.846710,-64.760857,318.929827,-49.143596,1,0.2320,0.2369,1.1397,40.3699,0.019,90 99862,52.207031,-28.291550,224.208534,-55.300157,1,0.5810,0.5733,0.0141,42.6194,0.007,42 99932,51.855469,-28.630989,224.733260,-55.649872,1,0.0000,0.0000,0.0000,nan,0.009,65 100057,346.130127,-63.072620,321.423103,-50.042305,1,0.8320,0.8006,0.0197,43.5045,0.020,90 100097,348.529419,-61.755440,321.293980,-51.763351,1,0.1566,0.1745,0.0186,39.6271,0.016,42 100133,346.655182,-63.260487,320.952196,-50.040935,1,0.2309,0.2215,0.0190,40.2046,0.019,42 100331,52.207031,-26.610098,221.298836,-55.042928,1,0.1684,0.1999,0.0173,39.9556,0.014,42 101050,32.695312,-4.929937,166.868469,-60.841230,1,0.2222,0.2529,0.0906,40.5303,0.018,90 101298,34.277344,-5.079716,169.526841,-59.956640,1,0.1848,0.1983,0.0069,39.9355,0.019,90 101374,51.855469,-26.276812,220.627031,-55.293792,1,0.0000,0.0000,0.0000,nan,0.014,16 101489,359.816315,-44.003082,331.451340,-70.123054,1,0.0721,0.0695,0.0025,37.4803,0.013,42 101508,358.648071,-46.375080,329.462659,-67.716008,1,0.0656,0.0249,0.0130,35.1822,0.009,90 101890,346.276581,-64.011238,320.448031,-49.344136,1,0.0000,0.0000,0.0000,nan,0.019,65 102036,53.789062,-27.784405,223.685697,-53.845803,1,0.2331,0.2269,0.0197,40.2635,0.009,42 102330,148.886719,2.686724,235.347248,41.389003,1,0.0000,0.0000,0.0000,nan,0.028,16 
102343,51.328125,-27.447618,222.535046,-55.950727,1,0.1081,0.1422,0.0164,39.1394,0.013,67 102363,349.160583,-64.760857,318.219706,-49.458924,1,0.8046,0.7835,0.0167,43.4470,0.020,90 102745,349.615387,-63.636005,318.927246,-50.506542,1,0.2460,0.2333,1.0359,40.3315,0.018,90 102823,347.861847,-61.943836,321.519104,-51.424048,1,0.0000,0.0000,0.0000,nan,0.017,65 102864,0.574468,-45.981140,327.041068,-68.778764,1,0.2123,0.2254,0.0097,40.2473,0.006,42 103023,359.811707,-45.191612,329.485675,-69.150905,1,0.3221,0.2994,0.0125,40.9500,0.010,90 103026,348.529419,-61.755440,321.293980,-51.763351,1,0.1949,0.2006,0.0174,39.9637,0.016,42 103100,34.101562,-5.829153,170.247753,-60.638325,1,0.5057,0.4350,0.0240,41.9001,0.019,90 103145,349.160583,-64.760857,318.219706,-49.458924,1,0.6603,0.6380,0.0433,42.9010,0.020,90 103162,350.230255,-61.943836,320.053946,-52.070537,1,0.1101,0.1207,0.0096,38.7539,0.017,90 103171,1.753247,-46.768478,324.030235,-68.498041,1,0.0000,0.0000,0.0000,nan,0.014,92 103350,359.446716,-44.201530,331.730015,-69.805709,1,0.6777,0.6935,0.0507,43.1220,0.010,90 103354,150.117188,2.836105,236.124718,42.483719,1,2.7124,2.7655,0.0801,46.8131,0.016,88 103572,52.207031,-28.291550,224.208534,-55.300157,1,0.0000,0.0000,0.0000,nan,0.007,65 103927,150.820312,1.641510,237.994507,42.358984,1,0.2130,0.2282,0.0218,40.2779,0.020,90 103948,33.574219,-4.780192,168.064587,-60.175886,1,0.2257,0.2319,0.1182,40.3169,0.019,52 103967,0.190678,-45.783966,327.956322,-68.803772,1,0.5094,0.4789,0.0608,42.1493,0.005,42 104212,53.789062,-27.784405,223.685697,-53.845803,1,0.3895,0.3850,1.0136,41.5869,0.009,90 104397,349.966217,-62.696659,319.542989,-51.376556,1,0.3220,0.3038,0.0187,40.9870,0.021,90 104476,51.855469,-28.630989,224.733260,-55.649872,1,1.4820,1.4819,0.1602,45.1565,0.009,95 104498,149.238281,3.882372,234.283829,42.351155,1,0.1312,0.1100,0.0094,38.5371,0.033,67 104523,152.050781,3.284369,237.157374,44.318466,1,0.0000,0.0000,0.0000,nan,0.019,16 
104526,349.615387,-63.636005,318.927246,-50.506542,1,0.2353,0.2389,0.0148,40.3902,0.018,42 104701,352.132874,-63.636005,317.424173,-51.095855,1,0.0000,0.0000,0.0000,nan,0.021,65 105744,348.908447,-63.823658,319.169886,-50.176186,1,0.3087,0.3318,1.0382,41.2090,0.018,90 106177,358.312500,-44.993881,332.185785,-68.685906,1,0.6593,0.5956,0.1033,42.7194,0.009,90 106429,152.050781,2.985506,237.495952,44.143927,1,0.0000,0.0000,0.0000,nan,0.019,16 106434,33.574219,-4.780192,168.064587,-60.175886,1,0.2861,0.2795,0.0098,40.7789,0.019,62 106594,0.589520,-47.161343,325.385896,-67.769893,1,0.2403,0.2370,1.0591,40.3710,0.009,88 106730,347.013428,-62.508568,321.472056,-50.735330,1,0.1576,2.2437,0.3333,46.2621,0.018,52 106743,0.574468,-45.981140,327.041068,-68.778764,1,0.2154,0.2539,0.0150,40.5405,0.006,90 106818,348.595886,-63.072620,320.023289,-50.713060,1,0.0000,0.0000,0.0000,nan,0.021,92 106937,53.085938,-28.122234,224.100909,-54.509752,1,0.3656,0.3882,0.8841,41.6079,0.007,52 107193,359.058563,-45.191612,330.695783,-68.844915,1,0.2109,0.1761,0.0082,39.6492,0.011,90 107439,150.468750,3.732834,235.392208,43.283244,1,0.9187,1.5247,0.3061,45.2328,0.020,88 107451,347.861847,-61.943836,321.519104,-51.424048,1,0.0000,0.0000,0.0000,nan,0.017,65 107568,33.750000,-4.630479,168.146242,-59.949072,1,0.1042,0.0802,0.0143,37.8074,0.019,67 107615,150.117188,2.836105,236.124718,42.483719,1,0.1323,2.3872,0.9055,46.4260,0.016,42 107712,53.261719,-27.615883,223.280041,-54.281374,1,0.1282,0.1450,0.0286,39.1863,0.006,90 107901,359.058563,-45.191612,330.695783,-68.844915,1,0.0000,0.0000,0.0000,nan,0.011,65 108021,150.996094,4.181528,235.291975,43.970869,1,0.2815,0.3756,0.8142,41.5232,0.015,62 108141,53.789062,-27.784405,223.685697,-53.845803,1,0.2179,0.2239,0.0151,40.2317,0.009,90 108229,351.321442,-64.198746,317.458993,-50.429931,1,0.1858,0.2022,1.1539,39.9826,0.023,90 108358,349.891296,-64.573555,317.972107,-49.786192,1,0.0769,0.2433,1.0869,40.4350,0.023,52 
108487,359.816315,-44.003082,331.451340,-70.123054,1,0.1787,0.2401,0.6645,40.4026,0.013,42 108554,33.222656,-4.780192,167.515653,-60.396584,1,0.4478,0.4542,0.0258,42.0120,0.018,90 108693,0.574468,-45.981140,327.041068,-68.778764,1,0.4868,0.4183,0.0364,41.7991,0.006,90 108739,53.085938,-28.122234,224.100909,-54.509752,1,0.4453,0.4388,0.0272,41.9226,0.007,90 108888,358.648071,-46.375080,329.462659,-67.716008,1,0.6247,0.5951,0.0599,42.7172,0.009,90 109036,1.753247,-46.768478,324.030235,-68.498041,1,0.1924,0.1862,0.2684,39.7838,0.014,90 109057,348.595886,-63.072620,320.023289,-50.713060,1,0.4499,0.3307,1.0499,41.2005,0.021,90 109294,359.814819,-44.399834,330.775011,-69.801007,1,0.3014,0.3643,0.8070,41.4461,0.009,90 109516,1.753247,-46.768478,324.030235,-68.498041,1,0.3828,0.3941,0.0095,41.6466,0.014,95 109654,347.013428,-62.508568,321.472056,-50.735330,1,0.1395,0.1231,0.0131,38.8003,0.018,90 109860,0.929752,-44.597992,328.531426,-70.083244,1,0.2708,0.2639,0.0184,40.6357,0.011,42 109903,150.996094,2.388015,237.313912,42.939977,1,0.3892,0.3579,0.0137,41.4006,0.021,62 109937,149.414062,3.433834,234.919132,42.245550,1,0.6620,0.6423,0.0188,42.9190,0.027,88 110241,34.453125,-5.229529,169.987075,-59.956185,1,0.1072,0.0924,0.4007,38.1327,0.019,42 110257,148.886719,2.686724,235.347248,41.389003,1,0.9318,0.8495,0.0379,43.6631,0.028,88 110270,54.667969,-27.615883,223.610785,-53.050840,1,0.8259,0.8541,0.0446,43.6776,0.009,90 110304,53.261719,-27.615883,223.280041,-54.281374,1,0.2475,0.5457,0.2676,42.4899,0.006,62 110387,151.347656,4.181528,235.568369,44.259942,1,0.4318,0.4218,0.6713,41.8205,0.016,90 110551,51.328125,-27.447618,222.535046,-55.950727,1,0.4861,0.3027,0.3931,40.9783,0.013,88 110768,351.734680,-62.884678,318.284128,-51.651217,1,2.9378,2.8626,1.1139,46.9035,0.019,88 110958,349.615387,-63.636005,318.927246,-50.506542,1,0.4630,0.4742,0.5031,42.1236,0.018,90 111281,0.589520,-47.161343,325.385896,-67.769893,1,1.4967,1.6014,0.0895,45.3643,0.009,42 
111283,150.468750,1.641510,237.714575,42.075234,1,0.6565,0.6363,0.0252,42.8941,0.017,90 111448,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,65 111650,351.734680,-62.884678,318.284128,-51.651217,1,0.0000,0.0000,0.0000,nan,0.019,65 111795,34.453125,-5.229529,169.987075,-59.956185,1,0.4070,0.3077,1.0584,41.0187,0.019,90 111799,52.558594,-27.279613,222.538937,-54.845107,1,0.2813,0.5741,0.4749,42.6227,0.008,42 112151,349.966217,-62.696659,319.542989,-51.376556,1,0.5341,0.5419,0.0204,42.4716,0.021,90 112462,33.574219,-4.780192,168.064587,-60.175886,1,0.4045,0.4356,0.0299,41.9040,0.019,90 112629,346.276581,-64.011238,320.448031,-49.344136,1,0.0000,0.0000,0.0000,nan,0.019,16 112717,35.683594,-5.379379,171.992947,-59.253501,1,0.2867,0.3138,0.8216,41.0680,0.020,67 112764,347.812500,-63.448284,320.128971,-50.202348,1,0.3208,0.2995,0.0255,40.9517,0.021,90 112782,32.871094,-4.780192,166.959493,-60.615132,1,0.4211,0.4038,0.0224,41.7088,0.017,90 112886,351.953644,-62.132156,318.777388,-52.347124,1,0.0000,0.0000,0.0000,nan,0.019,65 113028,151.699219,3.583322,236.533224,44.205648,1,0.0000,0.0000,0.0000,nan,0.016,65 113206,52.207031,-28.291550,224.208534,-55.300157,1,0.2446,0.2208,0.0078,40.1969,0.007,42 113335,358.636353,-46.768478,328.890146,-67.388837,1,0.0000,0.0000,0.0000,nan,0.008,65 113625,149.589844,3.583322,234.885369,42.474696,1,0.2615,0.2381,0.0160,40.3824,0.024,90 113669,351.299988,-62.320400,319.038597,-52.026867,1,0.2914,0.2905,0.0061,40.8748,0.018,15 113982,359.805206,-46.768478,327.135979,-67.829903,1,0.7438,0.9216,0.1482,43.8813,0.011,90 114191,33.574219,-6.579593,170.455585,-61.548219,1,0.0000,0.0000,0.0000,nan,0.021,65 114341,151.699219,3.583322,236.533224,44.205648,1,0.1709,2.3232,0.5051,46.3542,0.016,42 114626,2.071130,-45.191612,325.606223,-69.989264,1,0.5702,0.5408,0.0178,42.4659,0.011,90 114670,150.996094,2.388015,237.313912,42.939977,1,0.2930,0.2607,0.0670,40.6057,0.021,90 
114715,0.965665,-46.375080,325.845907,-68.579427,1,0.2141,0.2020,0.0362,39.9807,0.007,67 114808,52.207031,-26.610098,221.298836,-55.042928,1,0.3037,0.2944,0.0104,40.9082,0.014,90 115053,33.925781,-5.979157,170.179895,-60.866303,1,0.1649,0.1424,0.2329,39.1427,0.022,42 115079,1.694561,-45.191612,326.278557,-69.858253,1,0.2207,0.2325,0.0072,40.3232,0.011,90 115157,2.457983,-45.389202,324.632685,-69.945696,1,0.6382,0.5628,0.0442,42.5706,0.011,90 115336,351.734680,-62.884678,318.284128,-51.651217,1,1.7312,1.7123,0.0766,45.5432,0.019,95 115638,358.665253,-45.783966,330.353593,-68.203652,1,0.6365,0.6239,0.0079,42.8420,0.009,90 115670,151.347656,3.583322,236.252362,43.918627,1,0.0000,0.0000,0.0000,nan,0.015,16 115792,352.711273,-63.823658,316.922299,-51.059403,1,0.2389,0.2859,0.0505,40.8350,0.024,90 115859,148.886719,2.686724,235.347248,41.389003,1,1.4248,1.2871,0.1757,44.7782,0.028,88 115937,51.679688,-27.447618,222.618229,-55.642263,1,0.0000,0.0000,0.0000,nan,0.010,65 116132,150.292969,2.686724,236.427488,42.541447,1,0.0878,0.0965,0.0207,38.2336,0.016,62 116212,150.644531,3.583322,235.698235,43.342784,1,0.2344,0.2757,1.1756,40.7443,0.018,90 116570,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,16 116720,34.980469,-6.279288,172.180075,-60.389399,1,0.4081,0.3212,0.0367,41.1269,0.023,90 116818,359.805206,-46.768478,327.135979,-67.829903,1,0.6528,0.7091,0.0211,43.1811,0.011,88 117016,349.966217,-62.696659,319.542989,-51.376556,1,0.0000,0.0000,0.0000,nan,0.021,92 117104,346.130127,-63.072620,321.423103,-50.042305,1,0.0000,0.0000,0.0000,nan,0.020,92 117184,352.132874,-63.636005,317.424173,-51.095855,1,0.0000,0.0000,0.0000,nan,0.021,16 117393,52.910156,-25.944481,220.366350,-54.301439,1,0.8259,0.8295,0.0204,43.5993,0.010,90 117461,53.261719,-27.615883,223.280041,-54.281374,1,0.0000,0.0000,0.0000,nan,0.006,92 117513,151.347656,3.583322,236.252362,43.918627,1,0.8515,0.8845,0.0148,43.7712,0.015,90 
117774,346.276581,-64.011238,320.448031,-49.344136,1,0.5589,0.5608,0.1532,42.5611,0.019,90 118211,151.699219,3.583322,236.533224,44.205648,1,0.2420,0.2139,1.1778,40.1199,0.016,90 118422,149.589844,3.583322,234.885369,42.474696,1,0.0000,0.0000,0.0000,nan,0.024,65 118455,347.861847,-61.943836,321.519104,-51.424048,1,0.1346,0.1396,0.3139,39.0951,0.017,90 118770,53.085938,-28.122234,224.100909,-54.509752,1,0.2642,0.6375,0.0267,42.8991,0.007,90 118868,350.230255,-61.943836,320.053946,-52.070537,1,0.0000,0.0000,0.0000,nan,0.017,65 118979,2.457983,-45.389202,324.632685,-69.945696,1,0.2374,0.5280,0.1420,42.4036,0.011,67 119215,52.031250,-26.443335,220.963669,-55.168557,1,0.2490,0.2486,0.0069,40.4887,0.014,90 119383,0.949367,-45.586655,326.991548,-69.251686,1,0.3501,0.2602,0.6723,40.6012,0.013,42 119494,150.117188,2.836105,236.124718,42.483719,1,0.2113,0.2256,0.0219,40.2502,0.016,90 119647,151.523438,3.134927,236.900695,43.803170,1,0.1844,0.1571,0.0210,39.3762,0.019,62 119811,348.586945,-64.573555,318.693903,-49.477869,1,0.7287,0.7966,0.0308,43.4912,0.018,90 119882,358.312500,-44.993881,332.185785,-68.685906,1,0.0000,0.0000,0.0000,nan,0.009,65 120356,32.695312,-4.929937,166.868469,-60.841230,1,0.0582,0.0543,0.0185,36.9203,0.018,42 120927,152.050781,3.284369,237.157374,44.318466,1,0.4807,0.4897,0.0334,42.2071,0.019,90 121107,35.332031,-5.979157,172.286722,-59.931743,1,0.5440,0.2209,0.2405,40.1982,0.022,90 121182,53.085938,-28.122234,224.100909,-54.509752,1,0.0000,0.0000,0.0000,nan,0.007,65 121224,359.811707,-45.191612,329.485675,-69.150905,1,0.1244,2.3005,0.4304,46.3283,0.010,42 121266,151.347656,3.583322,236.252362,43.918627,1,0.4062,0.5002,0.0481,42.2623,0.015,42 121301,52.207031,-28.630989,224.800211,-55.343637,1,0.1459,0.1449,1.1250,39.1845,0.009,62 121440,53.613281,-28.630989,225.073365,-54.119461,1,0.2230,0.2219,0.1530,40.2089,0.006,42 121447,0.189873,-45.586655,328.254458,-68.969298,1,0.2571,0.2465,0.0106,40.4674,0.007,90 
121704,359.805206,-46.768478,327.135979,-67.829903,1,1.5357,1.5245,0.1053,45.2324,0.011,88 121705,352.132874,-63.636005,317.424173,-51.095855,1,0.1125,0.1219,0.0187,38.7767,0.021,42 121783,35.332031,-5.979157,172.286722,-59.931743,1,0.1324,0.0956,0.0132,38.2109,0.022,90 121803,348.586945,-64.573555,318.693903,-49.477869,1,0.2563,0.3703,1.1769,41.4870,0.018,90 121883,150.117188,2.836105,236.124718,42.483719,1,0.0000,0.0000,0.0000,nan,0.016,65 122235,53.085938,-28.122234,224.100909,-54.509752,1,0.0000,0.0000,0.0000,nan,0.007,65 122275,54.667969,-27.615883,223.610785,-53.050840,1,0.3172,0.5059,0.1684,42.2916,0.009,90 122716,33.398438,-3.732834,166.492280,-59.466614,1,0.0000,0.0000,0.0000,nan,0.022,65 122965,53.085938,-28.122234,224.100909,-54.509752,1,0.7135,0.7416,0.0359,43.3006,0.007,90 123035,150.468750,3.732834,235.392208,43.283244,1,0.2740,0.3135,0.5471,41.0660,0.020,52 123151,52.207031,-28.291550,224.208534,-55.300157,1,0.2548,0.2657,0.0184,40.6528,0.007,62 123211,150.996094,2.985506,236.647967,43.287350,1,1.6177,1.6654,0.0205,45.4691,0.020,88 123244,1.363636,-46.768478,324.669342,-68.371416,1,0.4582,0.3682,0.3963,41.4728,0.008,42 123437,348.595886,-63.072620,320.023289,-50.713060,1,0.1987,0.2137,0.0164,40.1178,0.021,52 123493,34.804688,-5.829153,171.307861,-60.174401,1,1.0395,0.9736,0.2348,44.0284,0.023,95 123743,152.050781,2.985506,237.495952,44.143927,1,0.2016,0.2129,0.0188,40.1084,0.019,90 123926,151.171875,1.342993,238.602520,42.464379,1,0.0000,0.0000,0.0000,nan,0.026,16 123927,150.468750,3.732834,235.392208,43.283244,1,0.0000,0.0000,0.0000,nan,0.020,16 124006,2.457983,-45.389202,324.632685,-69.945696,1,0.3574,0.3496,0.7517,41.3410,0.011,90 124183,33.574219,-6.579593,170.455585,-61.548219,1,0.0000,0.0000,0.0000,nan,0.021,65 124188,348.908447,-63.823658,319.169886,-50.176186,1,0.0000,0.0000,0.0000,nan,0.018,65 124361,51.855469,-27.953188,223.543603,-55.561470,1,0.3256,0.3420,0.8779,41.2856,0.008,88 
124394,52.207031,-26.610098,221.298836,-55.042928,1,0.9198,1.4761,0.2825,45.1459,0.014,90 124679,346.276581,-64.011238,320.448031,-49.344136,1,0.3602,0.3409,0.0345,41.2771,0.019,52 124762,347.861847,-61.943836,321.519104,-51.424048,1,0.1442,2.8043,0.6276,46.8496,0.017,42 125095,352.711273,-63.823658,316.922299,-51.059403,1,0.0000,0.0000,0.0000,nan,0.024,65 125242,349.891296,-64.573555,317.972107,-49.786192,1,0.2721,0.5662,0.2621,42.5866,0.023,90 125258,349.615387,-63.636005,318.927246,-50.506542,1,0.0000,0.0000,0.0000,nan,0.018,65 125426,349.429535,-62.508568,320.039643,-51.393745,1,1.1327,1.1093,0.0315,44.3789,0.020,88 125470,34.277344,-5.079716,169.526841,-59.956640,1,0.5731,0.5848,0.0092,42.6713,0.019,90 125518,359.805206,-46.768478,327.135979,-67.829903,1,0.0000,0.0000,0.0000,nan,0.011,65 125743,347.013428,-62.508568,321.472056,-50.735330,1,0.9950,1.0046,0.0133,44.1126,0.018,42 125762,1.753247,-46.768478,324.030235,-68.498041,1,0.0000,0.0000,0.0000,nan,0.014,65 126061,349.966217,-62.696659,319.542989,-51.376556,1,0.0000,0.0000,0.0000,nan,0.021,65 126084,149.238281,3.882372,234.283829,42.351155,1,0.1250,0.1223,0.4578,38.7846,0.033,90 126970,150.996094,2.985506,236.647967,43.287350,1,0.0000,0.0000,0.0000,nan,0.020,16 127056,34.453125,-5.229529,169.987075,-59.956185,1,0.1629,0.1385,0.0373,39.0772,0.019,42 127488,348.908447,-63.823658,319.169886,-50.176186,1,0.0000,0.0000,0.0000,nan,0.018,65 127773,53.085938,-27.111860,222.384291,-54.355086,1,0.0000,0.0000,0.0000,nan,0.007,65 127942,348.595886,-63.072620,320.023289,-50.713060,1,0.3899,0.5026,0.2876,42.2748,0.021,90 127996,346.562500,-63.448284,320.824720,-49.866957,1,0.6993,0.6759,0.0121,43.0539,0.021,95 128339,0.965665,-46.375080,325.845907,-68.579427,1,0.2192,2.1719,1.1420,46.1760,0.007,67 128405,352.398651,-62.696659,318.017427,-51.967966,1,0.0000,0.0000,0.0000,nan,0.020,65 128488,150.468750,1.641510,237.714575,42.075234,1,0.1333,0.1194,0.0257,38.7277,0.017,42 
128518,34.101562,-5.829153,170.247753,-60.638325,1,0.5644,0.5449,0.0420,42.4859,0.019,90 128564,150.117188,2.238686,236.784618,42.139082,1,0.4541,2.8846,1.1987,46.9235,0.016,90 128737,52.558594,-27.279613,222.538937,-54.845107,1,0.4204,0.4270,0.0598,41.8523,0.008,90 128746,152.050781,2.985506,237.495952,44.143927,1,0.3408,0.3671,0.5585,41.4654,0.019,52 128967,150.644531,3.583322,235.698235,43.342784,1,0.3145,0.3149,0.0153,41.0770,0.018,90 129179,346.130127,-63.072620,321.423103,-50.042305,1,0.1422,0.1600,0.0245,39.4198,0.020,52 129490,34.453125,-5.229529,169.987075,-59.956185,1,0.3803,0.4821,0.2384,42.1664,0.019,42 129503,150.292969,2.686724,236.427488,42.541447,1,0.1939,0.1882,0.0140,39.8092,0.016,52 129637,33.925781,-5.979157,170.179895,-60.866303,1,0.0000,0.0000,0.0000,nan,0.022,92 129648,348.529419,-61.755440,321.293980,-51.763351,1,0.3613,2.9518,0.4926,46.9838,0.016,62 129861,351.321442,-64.198746,317.458993,-50.429931,1,0.5180,0.5215,0.0294,42.3710,0.023,90 130220,346.276581,-64.011238,320.448031,-49.344136,1,0.5002,0.4635,0.0365,42.0643,0.019,42 130404,150.820312,3.732834,235.666318,43.572109,1,0.2531,0.2629,0.0072,40.6262,0.016,42 130502,35.683594,-5.379379,171.992947,-59.253501,1,0.2141,0.1909,0.0935,39.8439,0.020,90 130625,1.708861,-45.586655,325.688716,-69.520253,1,0.0000,0.0000,0.0000,nan,0.011,16 130750,359.816315,-44.003082,331.451340,-70.123054,1,0.2536,0.2917,0.7396,40.8850,0.013,90 131075,52.910156,-25.944481,220.366350,-54.301439,1,0.1709,0.1776,0.0174,39.6689,0.010,42 131181,34.101562,-5.829153,170.247753,-60.638325,1,0.0000,0.0000,0.0000,nan,0.019,16 131305,351.382965,-64.011238,317.574052,-50.604657,1,0.2846,0.3113,0.0407,41.0484,0.023,90 131368,150.996094,2.388015,237.313912,42.939977,1,0.0000,0.0000,0.0000,nan,0.021,65 131488,0.965665,-46.375080,325.845907,-68.579427,1,0.2691,0.2621,0.0183,40.6187,0.007,90 131492,150.820312,3.732834,235.666318,43.572109,1,0.0000,0.0000,0.0000,nan,0.016,16 
131629,33.574219,-5.079716,168.448505,-60.407218,1,1.1119,1.0817,0.0329,44.3113,0.016,95 131814,151.699219,3.583322,236.533224,44.205648,1,0.2823,0.5387,0.2038,42.4557,0.016,90 131815,32.695312,-4.929937,166.868469,-60.841230,1,0.3521,0.3520,0.0254,41.3584,0.018,90 132021,359.058563,-45.191612,330.695783,-68.844915,1,0.2525,0.6394,0.1855,42.9069,0.011,90 132278,359.415588,-46.768478,327.729895,-67.686097,1,0.0000,0.0000,0.0000,nan,0.009,92 133074,347.617462,-62.508568,321.121462,-50.904708,1,0.2610,0.2672,0.8507,40.6666,0.019,90 133191,346.276581,-64.011238,320.448031,-49.344136,1,0.4648,0.4423,0.0247,41.9430,0.019,90 133234,151.523438,3.134927,236.900695,43.803170,1,0.2288,0.2866,0.0891,40.8416,0.019,15 133354,358.665253,-45.783966,330.353593,-68.203652,1,0.4990,0.5076,0.0086,42.3007,0.009,90 133513,34.980469,-6.279288,172.180075,-60.389399,1,0.4337,0.4221,0.0715,41.8224,0.023,90 133773,149.414062,3.433834,234.919132,42.245550,1,0.0000,0.0000,0.0000,nan,0.027,53 134380,150.996094,2.985506,236.647967,43.287350,1,0.3943,0.3670,0.0425,41.4645,0.020,90 134824,351.734680,-62.884678,318.284128,-51.651217,1,0.1196,0.1143,0.0103,38.6270,0.019,42 135054,350.230255,-61.943836,320.053946,-52.070537,1,0.3893,0.4001,0.3373,41.6849,0.017,90 135067,148.886719,2.686724,235.347248,41.389003,1,0.5285,0.5059,0.6484,42.2917,0.028,90 135097,151.171875,1.342993,238.602520,42.464379,1,0.2066,0.3032,0.0238,40.9819,0.026,42 135357,51.855469,-28.630989,224.733260,-55.649872,1,0.0000,0.0000,0.0000,nan,0.009,65 135588,151.523438,3.134927,236.900695,43.803170,1,0.0000,0.0000,0.0000,nan,0.019,16 135790,150.117188,2.238686,236.784618,42.139082,1,0.4136,0.3284,0.7703,41.1829,0.016,90 135813,0.589520,-47.161343,325.385896,-67.769893,1,0.2577,0.2445,0.0139,40.4473,0.009,52 136110,53.261719,-27.615883,223.280041,-54.281374,1,0.3785,0.4183,0.7565,41.7995,0.006,90 136352,51.328125,-27.784405,223.130589,-55.999499,1,0.0000,0.0000,0.0000,nan,0.013,65 
136407,33.574219,-4.780192,168.064587,-60.175886,1,0.1110,0.0954,0.0100,38.2070,0.019,42 136704,150.820312,3.732834,235.666318,43.572109,1,0.2725,1.2302,1.1813,44.6568,0.016,62 136931,52.558594,-27.279613,222.538937,-54.845107,1,0.3304,0.3783,0.1057,41.5421,0.008,52 136949,351.321442,-64.198746,317.458993,-50.429931,1,0.0000,0.0000,0.0000,nan,0.023,65 137510,346.562500,-63.448284,320.824720,-49.866957,1,0.0000,0.0000,0.0000,nan,0.021,65 137645,51.855469,-26.276812,220.627031,-55.293792,1,0.1590,0.1754,0.0130,39.6390,0.014,42 138010,52.207031,-28.630989,224.800211,-55.343637,1,0.2411,0.2709,0.0169,40.7009,0.009,42 138068,150.292969,2.686724,236.427488,42.541447,1,0.2270,2.7496,0.8355,46.7979,0.016,42 138263,51.328125,-27.784405,223.130589,-55.999499,1,0.4085,0.4242,0.0398,41.8353,0.013,90 138415,349.429535,-62.508568,320.039643,-51.393745,1,0.2761,0.2500,0.0203,40.5026,0.020,42 138553,347.812500,-63.448284,320.128971,-50.202348,1,0.3010,0.2608,0.2763,40.6065,0.021,90 138947,349.160583,-64.760857,318.219706,-49.458924,1,0.5965,0.5575,0.2970,42.5456,0.020,42 139016,151.523438,3.134927,236.900695,43.803170,1,0.0000,0.0000,0.0000,nan,0.019,16 139329,151.699219,3.583322,236.533224,44.205648,1,0.5029,0.5454,0.2843,42.4885,0.016,90 139362,149.238281,3.882372,234.283829,42.351155,1,0.0809,0.0779,0.0097,37.7408,0.033,64 139405,1.666667,-44.399834,327.519190,-70.529554,1,0.3701,0.4458,0.8403,41.9636,0.009,90 139637,359.816315,-44.003082,331.451340,-70.123054,1,0.4142,0.4028,0.3993,41.7023,0.013,90 140096,2.457983,-45.389202,324.632685,-69.945696,1,0.2211,0.2342,0.0540,40.3413,0.011,90 140472,52.558594,-27.279613,222.538937,-54.845107,1,0.0000,0.0000,0.0000,nan,0.008,65 140948,1.666667,-44.399834,327.519190,-70.529554,1,0.2048,0.1926,0.0117,39.8645,0.009,62 141212,53.613281,-28.630989,225.073365,-54.119461,1,0.1256,2.3091,0.5229,46.3382,0.006,90 141302,150.292969,2.686724,236.427488,42.541447,1,0.4316,0.4620,0.3119,42.0560,0.016,42 
141334,149.238281,3.882372,234.283829,42.351155,1,0.0000,0.0000,0.0000,nan,0.033,65 141686,53.085938,-28.122234,224.100909,-54.509752,1,0.3431,0.3179,0.2942,41.1008,0.007,67 141937,151.523438,3.134927,236.900695,43.803170,1,0.4487,0.4658,0.0138,42.0770,0.019,90 142099,52.207031,-26.610098,221.298836,-55.042928,1,0.0000,0.0000,0.0000,nan,0.014,65 142254,52.910156,-25.944481,220.366350,-54.301439,1,0.4858,0.5210,0.3055,42.3685,0.010,90 142368,347.013428,-62.508568,321.472056,-50.735330,1,0.2586,0.0878,0.7050,38.0148,0.018,90 142866,150.996094,2.388015,237.313912,42.939977,1,0.6327,0.6476,0.0215,42.9408,0.021,42 142867,349.966217,-62.696659,319.542989,-51.376556,1,0.1548,0.1591,0.0118,39.4056,0.021,42 142885,52.207031,-28.291550,224.208534,-55.300157,1,0.4057,0.3852,0.0218,41.5882,0.007,90 143066,0.189873,-45.586655,328.254458,-68.969298,1,0.1937,0.5270,0.1240,42.3983,0.007,42 143275,2.071130,-45.191612,325.606223,-69.989264,1,0.1114,0.1030,0.0117,38.3830,0.011,42 143651,34.101562,-5.829153,170.247753,-60.638325,1,0.3590,0.3440,0.0396,41.3003,0.019,90 143865,33.925781,-5.979157,170.179895,-60.866303,1,0.0000,0.0000,0.0000,nan,0.022,16 144204,53.085938,-27.111860,222.384291,-54.355086,1,0.4825,0.5378,0.3853,42.4513,0.007,90 144244,33.222656,-4.780192,167.515653,-60.396584,1,0.2120,0.6014,0.1268,42.7453,0.018,90 145107,34.980469,-6.279288,172.180075,-60.389399,1,0.0000,0.0000,0.0000,nan,0.023,16 145160,0.189873,-45.586655,328.254458,-68.969298,1,0.0000,0.0000,0.0000,nan,0.007,65 145257,52.207031,-28.291550,224.208534,-55.300157,1,0.2570,0.2708,0.2312,40.7001,0.007,90 145675,51.855469,-27.953188,223.543603,-55.561470,1,0.8655,0.8841,0.0293,43.7698,0.008,90 145859,34.980469,-6.279288,172.180075,-60.389399,1,0.2213,0.2411,0.0230,40.4126,0.023,90 145926,152.050781,3.284369,237.157374,44.318466,1,0.2061,0.2269,0.0092,40.2640,0.019,42 145990,33.222656,-4.780192,167.515653,-60.396584,1,0.4648,0.4366,0.0319,41.9096,0.018,90 
146187,33.398438,-3.732834,166.492280,-59.466614,1,0.0000,0.0000,0.0000,nan,0.022,92 146410,359.805206,-46.768478,327.135979,-67.829903,1,0.2555,0.4043,0.6020,41.7117,0.011,90 146429,150.117188,2.238686,236.784618,42.139082,1,0.1260,0.1220,0.0168,38.7782,0.016,62 147214,51.855469,-27.953188,223.543603,-55.561470,1,0.1677,0.1843,0.0127,39.7584,0.008,90 147571,0.190678,-45.783966,327.956322,-68.803772,1,0.1885,0.2575,0.1606,40.5748,0.005,90 147642,52.910156,-26.276812,220.926149,-54.363918,1,0.2453,0.2263,0.0090,40.2576,0.008,62 147752,151.523438,3.134927,236.900695,43.803170,1,0.2042,0.1823,0.0158,39.7324,0.019,90 147816,34.277344,-5.079716,169.526841,-59.956640,1,0.0000,0.0000,0.0000,nan,0.019,92 148204,151.171875,2.238686,237.619933,42.994783,1,0.0000,0.0000,0.0000,nan,0.024,65 148466,151.171875,2.238686,237.619933,42.994783,1,0.5915,0.5684,0.0143,42.5965,0.024,90 148535,34.277344,-5.679190,170.314930,-60.410322,1,0.1966,0.1835,0.0107,39.7482,0.020,42 148543,349.046051,-61.943836,320.796530,-51.753706,1,0.4717,0.5098,0.0652,42.3120,0.017,90 148976,53.964844,-28.630989,225.142950,-53.813613,1,0.6993,0.6715,0.0353,43.0365,0.009,90 148996,51.855469,-26.276812,220.627031,-55.293792,1,0.0258,0.0954,0.0390,38.2066,0.014,15 149129,0.589520,-47.161343,325.385896,-67.769893,1,0.1440,2.5349,0.7043,46.5843,0.009,90 149130,151.171875,2.537361,237.288526,43.169764,1,0.1925,0.1847,0.0130,39.7633,0.024,90 149478,2.071130,-45.191612,325.606223,-69.989264,1,0.1744,0.5308,0.1019,42.4174,0.011,42 149673,33.398438,-4.331149,167.226341,-59.936551,1,0.3411,0.3499,0.0077,41.3436,0.018,90 150266,349.615387,-63.636005,318.927246,-50.506542,1,0.0000,0.0000,0.0000,nan,0.018,16 150344,349.966217,-62.696659,319.542989,-51.376556,1,0.0000,0.0000,0.0000,nan,0.021,65 150561,53.261719,-27.615883,223.280041,-54.281374,1,0.2265,0.2307,0.0085,40.3046,0.006,42 150765,51.855469,-27.953188,223.543603,-55.561470,1,0.0000,0.0000,0.0000,nan,0.008,65 
150818,53.613281,-26.944359,222.237403,-53.863858,1,0.1401,0.1231,0.0150,38.8002,0.009,90 150880,351.382965,-64.011238,317.574052,-50.604657,1,0.1168,0.0932,0.0179,38.1533,0.023,42 151356,51.679688,-27.447618,222.618229,-55.642263,1,0.0000,0.0000,0.0000,nan,0.010,16 151427,0.574468,-45.981140,327.041068,-68.778764,1,0.0370,0.0313,0.0117,35.6883,0.006,42 151458,53.261719,-27.615883,223.280041,-54.281374,1,0.0000,0.0000,0.0000,nan,0.006,65 151462,33.574219,-4.780192,168.064587,-60.175886,1,0.2901,0.3886,0.4161,41.6102,0.019,42 151498,359.446716,-44.201530,331.730015,-69.805709,1,0.4319,0.4487,0.5711,41.9804,0.010,90 151694,54.667969,-27.615883,223.610785,-53.050840,1,0.0000,0.0000,0.0000,nan,0.009,92 151704,150.292969,2.686724,236.427488,42.541447,1,0.1558,0.1295,0.6463,38.9183,0.016,52 151973,150.996094,4.181528,235.291975,43.970869,1,0.0000,0.0000,0.0000,nan,0.015,92 152079,33.574219,-5.379379,168.838090,-60.637536,1,0.3503,0.3429,0.0254,41.2924,0.017,90 152083,150.468750,3.732834,235.392208,43.283244,1,0.5001,0.5041,0.0165,42.2827,0.020,90 152300,33.574219,-6.579593,170.455585,-61.548219,1,0.7306,0.6981,0.0380,43.1398,0.021,42 152425,150.820312,3.732834,235.666318,43.572109,1,0.0000,0.0000,0.0000,nan,0.016,65 152453,52.910156,-27.953188,223.774083,-54.639214,1,0.2741,0.5064,0.2047,42.2944,0.007,90 152567,150.996094,4.181528,235.291975,43.970869,1,0.0000,0.0000,0.0000,nan,0.015,16 152618,33.574219,-5.079716,168.448505,-60.407218,1,0.0000,0.0000,0.0000,nan,0.016,16 152640,33.574219,-4.780192,168.064587,-60.175886,1,0.0000,0.0000,0.0000,nan,0.019,65 152682,51.679688,-27.447618,222.618229,-55.642263,1,0.0000,0.0000,0.0000,nan,0.010,16 152756,150.996094,4.181528,235.291975,43.970869,1,0.1326,0.1837,0.4894,39.7513,0.015,62 152787,351.299988,-62.320400,319.038597,-52.026867,1,0.0000,0.0000,0.0000,nan,0.018,65 152812,32.695312,-4.929937,166.868469,-60.841230,1,2.2378,2.2813,0.0673,46.3061,0.018,88 
153089,35.332031,-5.979157,172.286722,-59.931743,1,0.0000,0.0000,0.0000,nan,0.022,16 153539,54.667969,-27.615883,223.610785,-53.050840,1,0.3371,0.3157,0.0155,41.0832,0.009,88 153880,51.679688,-27.447618,222.618229,-55.642263,1,0.3501,0.3152,0.0731,41.0793,0.010,90 154053,34.101562,-5.829153,170.247753,-60.638325,1,0.1526,0.1479,0.0167,39.2327,0.019,42 154402,151.171875,2.238686,237.619933,42.994783,1,0.0294,0.0619,0.0172,37.2157,0.024,90 154631,358.648071,-46.375080,329.462659,-67.716008,1,0.9576,0.8148,0.0688,43.5515,0.009,90 154648,359.811707,-45.191612,329.485675,-69.150905,1,1.4174,1.4408,0.0548,45.0811,0.010,88 154762,35.859375,-4.630479,171.270769,-58.580806,1,0.3187,0.2788,1.0862,40.7724,0.022,90 154986,150.996094,2.985506,236.647967,43.287350,1,0.0000,0.0000,0.0000,nan,0.020,16 155110,148.886719,2.686724,235.347248,41.389003,1,0.0000,0.0000,0.0000,nan,0.028,65 155380,150.996094,2.388015,237.313912,42.939977,1,0.2147,0.2288,0.0127,40.2844,0.021,88 155468,150.996094,4.181528,235.291975,43.970869,1,0.0000,0.0000,0.0000,nan,0.015,65 155541,151.347656,4.181528,235.568369,44.259942,1,0.5115,0.4788,0.0312,42.1482,0.016,90 155613,148.710938,2.836105,235.050801,41.328739,1,0.3585,0.3694,0.0090,41.4811,0.031,90 155778,53.085938,-27.111860,222.384291,-54.355086,1,0.2234,0.5912,0.2372,42.6999,0.007,42 156386,151.523438,3.134927,236.900695,43.803170,1,1.4308,1.2425,0.1528,44.6834,0.019,88 156537,352.132874,-63.636005,317.424173,-51.095855,1,2.0260,2.3090,0.0769,46.3380,0.021,88 156739,351.321442,-64.198746,317.458993,-50.429931,1,0.3405,0.4842,0.8467,42.1775,0.023,90 157120,51.328125,-27.447618,222.535046,-55.950727,1,0.2126,0.2539,0.5364,40.5402,0.013,42 157299,34.453125,-5.229529,169.987075,-59.956185,1,0.5504,0.6351,0.0335,42.8890,0.019,42 157477,359.805206,-46.768478,327.135979,-67.829903,1,0.9586,0.7920,0.0736,43.4756,0.011,95 157746,149.414062,1.940072,236.565366,41.393323,1,0.2167,0.5603,0.2926,42.5588,0.018,52 
158042,1.666667,-44.399834,327.519190,-70.529554,1,0.1965,0.2093,0.0089,40.0667,0.009,90 158241,347.013428,-62.508568,321.472056,-50.735330,1,0.0424,0.0344,0.0149,35.8982,0.018,62 158507,351.299988,-62.320400,319.038597,-52.026867,1,0.6109,0.7017,0.0936,43.1535,0.018,90 158515,52.910156,-26.276812,220.926149,-54.363918,1,0.3265,0.3196,0.0097,41.1146,0.008,42 158573,33.574219,-4.780192,168.064587,-60.175886,1,0.2736,0.5337,0.1790,42.4315,0.019,90 158697,35.332031,-5.979157,172.286722,-59.931743,1,0.3830,0.4049,0.0394,41.7159,0.022,90 158731,347.846710,-64.760857,318.929827,-49.143596,1,0.0000,0.0000,0.0000,nan,0.019,65 158813,150.820312,3.134927,236.341348,43.230123,1,0.1966,0.1962,0.0134,39.9093,0.016,62 158904,2.097458,-45.783966,324.737840,-69.478613,1,1.8591,1.6874,0.1207,45.5041,0.011,88 159277,359.805206,-46.768478,327.135979,-67.829903,1,0.6452,0.5890,0.0356,42.6903,0.011,90 159316,151.699219,3.583322,236.533224,44.205648,1,0.0000,0.0000,0.0000,nan,0.016,65 159491,151.171875,1.342993,238.602520,42.464379,1,0.0000,0.0000,0.0000,nan,0.026,16 159665,33.574219,-4.780192,168.064587,-60.175886,1,0.3253,0.3336,0.0122,41.2221,0.019,90 159925,150.820312,3.134927,236.341348,43.230123,1,1.6095,1.6763,0.0691,45.4866,0.016,88 160048,51.855469,-28.630989,224.733260,-55.649872,1,0.3638,0.4409,0.3427,41.9350,0.009,90 160426,351.321442,-64.198746,317.458993,-50.429931,1,0.3511,0.3353,0.0637,41.2356,0.023,90 160527,2.097458,-45.783966,324.737840,-69.478613,1,0.3162,0.2266,0.6723,40.2609,0.011,90 160737,52.207031,-28.291550,224.208534,-55.300157,1,0.3593,0.5374,0.2042,42.4496,0.007,88 160921,349.160583,-64.760857,318.219706,-49.458924,1,0.0000,0.0000,0.0000,nan,0.020,65 161135,52.207031,-28.291550,224.208534,-55.300157,1,0.4293,0.4326,0.0209,41.8860,0.007,90 161411,150.468750,3.732834,235.392208,43.283244,1,0.0000,0.0000,0.0000,nan,0.020,65 161432,149.414062,2.238686,236.239766,41.565558,1,0.0000,0.0000,0.0000,nan,0.017,16 
161521,346.130127,-63.072620,321.423103,-50.042305,1,0.1147,0.1640,0.9394,39.4785,0.020,90 161591,150.996094,4.181528,235.291975,43.970869,1,0.0761,0.1416,0.0294,39.1295,0.015,62 161877,150.468750,1.641510,237.714575,42.075234,1,0.0000,0.0000,0.0000,nan,0.017,65 161988,51.679688,-27.447618,222.618229,-55.642263,1,0.5641,0.5875,0.0180,42.6834,0.010,90 162093,51.855469,-28.630989,224.733260,-55.649872,1,0.1580,0.1822,0.0192,39.7311,0.009,62 162139,150.820312,3.134927,236.341348,43.230123,1,0.1036,0.0926,0.0114,38.1367,0.016,42 162152,348.529419,-61.755440,321.293980,-51.763351,1,0.0000,0.0000,0.0000,nan,0.016,16 162531,347.861847,-61.943836,321.519104,-51.424048,1,0.5290,0.4535,0.0438,42.0076,0.017,90 162538,150.996094,4.181528,235.291975,43.970869,1,0.2795,0.5154,0.2561,42.3402,0.015,90 162994,53.085938,-28.122234,224.100909,-54.509752,1,0.5827,0.5520,0.0825,42.5199,0.007,90 163208,33.398438,-4.331149,167.226341,-59.936551,1,0.1546,0.1227,0.0022,38.7928,0.018,42 163680,0.574468,-45.981140,327.041068,-68.778764,1,0.3407,0.3516,0.0201,41.3557,0.006,90 163894,52.207031,-28.630989,224.800211,-55.343637,1,0.3027,0.3162,0.0110,41.0874,0.009,88 164582,32.871094,-4.780192,166.959493,-60.615132,1,0.6341,0.6230,0.0125,42.8383,0.017,90 164805,51.679688,-27.447618,222.618229,-55.642263,1,0.0000,0.0000,0.0000,nan,0.010,92 165406,351.321442,-64.198746,317.458993,-50.429931,1,0.0000,0.0000,0.0000,nan,0.023,65 165494,349.615387,-63.636005,318.927246,-50.506542,1,0.2616,0.2665,0.0118,40.6604,0.018,90 165507,151.171875,1.342993,238.602520,42.464379,1,0.2323,0.2251,0.0187,40.2445,0.026,67 165821,149.414062,2.238686,236.239766,41.565558,1,0.2458,1.0072,0.4491,44.1195,0.017,42 165985,51.679688,-27.447618,222.618229,-55.642263,1,0.4551,0.4933,0.2305,42.2259,0.010,88 166103,0.589520,-47.161343,325.385896,-67.769893,1,0.2871,0.2079,0.4627,40.0507,0.009,90 166165,150.117188,2.836105,236.124718,42.483719,1,0.0000,0.0000,0.0000,nan,0.016,16 
166186,51.679688,-27.447618,222.618229,-55.642263,1,0.5156,0.5329,0.0107,42.4278,0.010,42 166195,149.238281,3.882372,234.283829,42.351155,1,0.0000,0.0000,0.0000,nan,0.033,16 166330,149.414062,1.940072,236.565366,41.393323,1,0.0000,0.0000,0.0000,nan,0.018,65 166697,53.613281,-28.630989,225.073365,-54.119461,1,1.1664,1.1160,0.0368,44.3951,0.006,95 166727,34.453125,-5.229529,169.987075,-59.956185,1,0.2659,0.2946,0.0220,40.9098,0.019,62 166956,359.811707,-45.191612,329.485675,-69.150905,1,0.0000,0.0000,0.0000,nan,0.010,65 167123,149.414062,2.238686,236.239766,41.565558,1,0.0000,0.0000,0.0000,nan,0.017,65 167220,359.446716,-44.201530,331.730015,-69.805709,1,0.7136,0.7235,0.0416,43.2348,0.010,90 167260,2.071130,-45.191612,325.606223,-69.989264,1,0.3449,0.3554,0.0193,41.3826,0.011,62 167310,349.615387,-63.636005,318.927246,-50.506542,1,0.3079,0.3246,0.0181,41.1538,0.018,42 167417,349.046051,-61.943836,320.796530,-51.753706,1,0.5774,0.5558,0.0410,42.5377,0.017,90 167436,350.230255,-61.943836,320.053946,-52.070537,1,0.1918,0.2263,0.0138,40.2576,0.017,90 167488,348.586945,-64.573555,318.693903,-49.477869,1,0.3928,0.3939,0.0171,41.6449,0.018,90 167910,348.908447,-63.823658,319.169886,-50.176186,1,0.4761,0.4646,0.0434,42.0701,0.018,90 168146,51.855469,-26.276812,220.627031,-55.293792,1,0.0000,0.0000,0.0000,nan,0.014,65 168465,149.414062,1.940072,236.565366,41.393323,1,2.1492,2.4337,0.1988,46.4769,0.018,95 168659,53.613281,-27.953188,223.929533,-54.024772,1,0.2663,0.3135,0.2502,41.0656,0.007,67 168952,358.312500,-44.993881,332.185785,-68.685906,1,0.0000,0.0000,0.0000,nan,0.009,65 168957,53.085938,-27.784405,223.525509,-54.460748,1,0.0000,0.0000,0.0000,nan,0.007,65 168967,347.812500,-63.448284,320.128971,-50.202348,1,0.1845,0.2142,0.1608,40.1234,0.021,90 168989,151.171875,1.342993,238.602520,42.464379,1,0.0000,0.0000,0.0000,nan,0.026,16 169133,347.013428,-62.508568,321.472056,-50.735330,1,0.4656,0.4799,0.2303,42.1546,0.018,90 
169203,347.861847,-61.943836,321.519104,-51.424048,1,0.1833,0.1791,0.1849,39.6900,0.017,90 169282,149.414062,3.433834,234.919132,42.245550,1,0.3181,0.3458,0.3165,41.3133,0.027,90 ================================================ FILE: examples/docker/modin-ray/Dockerfile ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # Build image from this dockerfile like this: # docker build -t modin-ray:latest . 
FROM ubuntu:20.04

# Proxy settings
ENV http_proxy=${http_proxy}
ENV https_proxy=${https_proxy}
ENV no_proxy=${no_proxy}

RUN apt-get update --yes \
    && apt-get install wget --yes \
    && rm -rf /var/lib/apt/lists/*

# Non-root user settings (key=value form; the space-separated ENV form is legacy)
ENV USER=modin
ENV UID=1000
ENV HOME=/home/$USER

RUN adduser --disabled-password \
    --gecos "Non-root user" \
    --uid $UID \
    --home $HOME \
    $USER

# Conda settings
ENV CONDA_DIR=${HOME}/miniconda
ENV CONDA_ENV_NAME=modin-ray
ENV PATH="${CONDA_DIR}/bin:${PATH}"

RUN wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
    && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
    && "${CONDA_DIR}/bin/conda" init bash \
    && rm -f /tmp/miniconda3.sh

RUN conda update -n base -c defaults conda -y \
    && conda create -n ${CONDA_ENV_NAME} --yes -c conda-forge --strict-channel-priority \
    modin-ray \
    ray-dashboard \
    scikit-learn \
    scikit-learn-intelex \
    xgboost \
    && conda clean --all --yes

# Activate ${CONDA_ENV_NAME} for interactive shells
RUN echo "source ${CONDA_DIR}/bin/activate ${CONDA_ENV_NAME}" >> "${HOME}/.bashrc"

# Activate ${CONDA_ENV_NAME} for non-interactive shells
# The following line comments out line that prevents ~/.bashrc execution in
# non-interactive mode.
RUN sed -e 's,\(^[[:space:]]\+[*]) return;;$\),# \1,' -i "${HOME}/.bashrc"
ENV BASH_ENV="${HOME}/.bashrc"

# Set up benchmark scripts
COPY nyc-taxi.py "${HOME}"
COPY census.py "${HOME}"
COPY plasticc.py "${HOME}"
RUN mkdir /dataset
WORKDIR ${HOME}

# Clean up proxy settings to publish on Docker Hub
ENV http_proxy=
ENV https_proxy=
ENV no_proxy=

# Set entrypoint with arguments expansion
ENTRYPOINT ["/bin/bash", "-c", "exec $0 $*"]

================================================
FILE: examples/docker/modin-ray/build-docker-image.sh
================================================
#!/bin/bash -e

# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# Build from the directory that holds this script so the Dockerfile and the
# benchmark scripts it COPYs are found regardless of the caller's cwd.
cd "$(dirname "$0")"

docker build -t modin-ray .

# Print usage hints; every benchmark script takes its dataset path(s) as
# positional arguments (one for nyc-taxi.py and census.py, four for plasticc.py).
echo -e '\nNYC TAXI BENCHMARK
User is responsible for preparing the dataset.
It can be generated by following the instructions on the link:
https://github.com/toddwschneider/nyc-taxi-data#instructions
To run the benchmark execute:
\tdocker run --rm -v /path/to/dataset:/dataset modin-ray python nyc-taxi.py <data file name starting with /dataset>

CENSUS BENCHMARK
User is responsible for preparing the dataset.
It can be downloaded from the following link:
https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz
To run the benchmark execute:
\tdocker run --rm -v /path/to/dataset:/dataset modin-ray python census.py <data file name starting with /dataset>

PLASTICC BENCHMARK
User is responsible for preparing the datasets.
The datasets must include four files: training set, test set, training set metadata and test set metadata.
To run the benchmark execute:
\tdocker run --rm -v /path/to/dataset:/dataset modin-ray python plasticc.py <training set file> <test set file> <training set metadata file> <test set metadata file>
\n'

================================================
FILE: examples/docker/modin-ray/census.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import sys import time import sklearnex from sklearn import config_context import modin.pandas as pd sklearnex.patch_sklearn() import numpy as np import sklearn.linear_model as lm from sklearn.model_selection import train_test_split def read(filename): columns_names = [ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ", "PERNUM", "PERWT", "SEX", "AGE", "EDUC", "EDUCD", "INCTOT", "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2", "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2", "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", "EDUC_SP", "EDUC_MOM2", "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2", ] columns_types = [ "int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64", "int64", "int64", "int64", "int64", "int64", "int64", "int64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", ] dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} df = pd.read_csv( filename, names=columns_names, dtype=dtypes, 
skiprows=1, ) return df def etl(df): keep_cols = [ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "PERNUM", "SEX", "AGE", "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM", "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD", ] df = df[keep_cols] df = df[df["INCTOT"] != 9999999] df = df[df["EDUC"] != -1] df = df[df["EDUCD"] != -1] df["INCTOT"] = df["INCTOT"] * df["CPI99"] for column in keep_cols: df[column] = df[column].fillna(-1) df[column] = df[column].astype("float64") y = df["EDUC"] X = df.drop(columns=["EDUC", "CPI99"]) return (df, X, y) def mse(y_test, y_pred): return ((y_test - y_pred) ** 2).mean() def cod(y_test, y_pred): y_bar = y_test.mean() total = ((y_test - y_bar) ** 2).sum() residuals = ((y_test - y_pred) ** 2).sum() return 1 - (residuals / total) def ml(X, y, random_state, n_runs, test_size): clf = lm.Ridge() X = np.ascontiguousarray(X, dtype=np.float64) y = np.ascontiguousarray(y, dtype=np.float64) mse_values, cod_values = [], [] ml_scores = {} print("ML runs: ", n_runs) for i in range(n_runs): (X_train, X_test, y_train, y_test) = train_test_split( X, y, test_size=test_size, random_state=random_state ) random_state += 777 with config_context(assume_finite=True): model = clf.fit(X_train, y_train) y_pred = model.predict(X_test) mse_values.append(mse(y_test, y_pred)) cod_values.append(cod(y_test, y_pred)) ml_scores["mse_mean"] = sum(mse_values) / len(mse_values) ml_scores["cod_mean"] = sum(cod_values) / len(cod_values) ml_scores["mse_dev"] = pow( sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values]) / (len(mse_values) - 1), 0.5, ) ml_scores["cod_dev"] = pow( sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values]) / (len(cod_values) - 1), 0.5, ) return ml_scores def measure(name, func, *args, **kw): t0 = time.time() res = func(*args, **kw) t1 = time.time() print(f"{name}: {t1 - t0} sec") return res def main(): if 
len(sys.argv) != 2: print( f"USAGE: docker run --rm -v /path/to/dataset:/dataset python census.py " ) return # ML specific N_RUNS = 50 TEST_SIZE = 0.1 RANDOM_STATE = 777 df = measure("Reading", read, sys.argv[1]) _, X, y = measure("ETL", etl, df) measure( "ML", ml, X, y, random_state=RANDOM_STATE, n_runs=N_RUNS, test_size=TEST_SIZE ) if __name__ == "__main__": main() ================================================ FILE: examples/docker/modin-ray/nyc-taxi.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import sys import time import modin.pandas as pd def read(filename): columns_names = [ "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type", "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall", "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid", "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010", "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma", "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname", "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode", "dropoff_ntaname", "dropoff_puma", ] parse_dates = ["pickup_datetime", "dropoff_datetime"] return pd.read_csv( filename, names=columns_names, header=None, parse_dates=parse_dates ) def q1(df): return df.groupby("cab_type")["cab_type"].count() def q2(df): return df.groupby("passenger_count", as_index=False).mean()[ ["passenger_count", "total_amount"] ] def q3(df): transformed = pd.DataFrame( { "pickup_datetime": df["pickup_datetime"].dt.year, "passenger_count": df["passenger_count"], } ) return transformed.groupby( ["pickup_datetime", "passenger_count"], as_index=False ).size() def q4(df): transformed = pd.DataFrame( { "passenger_count": df["passenger_count"], "pickup_datetime": df["pickup_datetime"].dt.year, "trip_distance": df["trip_distance"].astype("int64"), } ) return ( transformed.groupby( ["passenger_count", "pickup_datetime", "trip_distance"], as_index=False ) .size() .sort_values(by=["pickup_datetime", "size"], ascending=[True, False]) ) def measure(name, func, *args, **kw): t0 = time.time() res = func(*args, **kw) t1 = time.time() 
print(f"{name}: {t1 - t0} sec") return res def main(): if len(sys.argv) != 2: print( f"USAGE: docker run --rm -v /path/to/dataset:/dataset python nyc-taxi.py " ) return df = measure("Reading", read, sys.argv[1]) measure("Q1", q1, df) measure("Q2", q2, df) measure("Q3", q3, df) measure("Q4", q4, df) if __name__ == "__main__": main() ================================================ FILE: examples/docker/modin-ray/plasticc.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 

import sys
import time
from functools import partial

import numpy as np
import sklearnex
import xgboost as xgb

import modin.pandas as pd

sklearnex.patch_sklearn()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

################ helper functions ###############################


def create_dtypes():
    """
    Build the column dtype mappings for the light-curve and metadata CSV files.

    Returns
    -------
    tuple of dict
        ``(dtypes, meta_dtypes)``; both map column name to dtype string and
        keep the column order (``read`` uses the key order of ``dtypes`` as
        column names).
    """
    dtypes = {
        "object_id": "int32",
        "mjd": "float32",
        "passband": "int32",
        "flux": "float32",
        "flux_err": "float32",
        "detected": "int32",
    }

    # load metadata
    columns_names = [
        "object_id",
        "ra",
        "decl",
        "gal_l",
        "gal_b",
        "ddf",
        "hostgal_specz",
        "hostgal_photoz",
        "hostgal_photoz_err",
        "distmod",
        "mwebv",
        "target",
    ]
    # One dtype per metadata column, in the same order as ``columns_names``.
    meta_types = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"]
    meta_dtypes = dict(zip(columns_names, meta_types))
    return dtypes, meta_dtypes


def ravel_column_names(cols):
    """Flatten a two-level column index into ``"<level0>_<level1>"`` strings."""
    d0 = cols.get_level_values(0)
    d1 = cols.get_level_values(1)
    return [f"{i}_{j}" for i, j in zip(d0, d1)]


def measure(name, func, *args, **kw):
    """Call ``func(*args, **kw)``, print the elapsed time labelled `name`, and return the result."""
    # perf_counter is monotonic, so the interval is not affected by wall-clock adjustments.
    t0 = time.perf_counter()
    res = func(*args, **kw)
    t1 = time.perf_counter()
    print(f"{name}: {t1 - t0} sec")
    return res


def all_etl(train, train_meta, test, test_meta):
    """Run ``etl`` on the training and the test data and return both results."""
    train_final = etl(train, train_meta)
    test_final = etl(test, test_meta)
    return (train_final, test_final)


def split_step(train_final, test_final):
    """
    Turn the ETL output into arrays for training, validation and prediction.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, Xt, classes, class_weights)``
        where ``Xt`` holds the test-set features, ``classes`` the sorted unique
        targets and ``class_weights`` maps each class to 1 (2 for 64 and 15).
    """
    X = train_final.drop(["object_id", "target"], axis=1).values
    Xt = test_final.drop(["object_id"], axis=1).values

    y = train_final["target"]
    assert X.shape[1] == Xt.shape[1]
    classes = sorted(y.unique())

    class_weights = {c: 1 for c in classes}
    class_weights.update({c: 2 for c in [64, 15]})

    lbl = LabelEncoder()
    y = lbl.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, stratify=y, random_state=126
    )

    return X_train, y_train, X_test, y_test, Xt, classes, class_weights


def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
    """
    refactor from
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order="F")
    y_ohe = pd.get_dummies(y_true)
    # Clip away exact 0 and 1 so the logarithm below stays finite.
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    y_p_log = np.log(y_p)
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = -np.sum(y_w) / np.sum(class_arr)
    return loss


def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):
    """Adapt ``multi_weighted_logloss`` to an XGBoost metric returning ``("wloss", loss)``."""
    loss = multi_weighted_logloss(
        y_true.get_label(), y_predicted, classes, class_weights
    )
    return "wloss", loss


################ helper functions ###############################


def read(
    training_set_filename,
    test_set_filename,
    training_set_metadata_filename,
    test_set_metadata_filename,
    dtypes,
    meta_dtypes,
):
    """
    Read the four PLAsTiCC CSV files.

    Parameters
    ----------
    training_set_filename, test_set_filename : str
        Light-curve files; read with `dtypes`.
    training_set_metadata_filename, test_set_metadata_filename : str
        Metadata files; read with `meta_dtypes`. The ``target`` entry is left
        out for the test metadata.
    dtypes, meta_dtypes : dict
        Column name to dtype mappings as built by ``create_dtypes``. Neither
        mapping is modified.

    Returns
    -------
    tuple
        ``(train, train_meta, test, test_meta)``.
    """
    train = pd.read_csv(training_set_filename, dtype=dtypes)
    test = pd.read_csv(
        test_set_filename,
        names=list(dtypes.keys()),
        dtype=dtypes,
        header=0,
    )

    train_meta = pd.read_csv(training_set_metadata_filename, dtype=meta_dtypes)
    # Build a filtered copy instead of popping and restoring "target" on the
    # caller's dict, so a failing read cannot leave it without that key.
    test_meta_dtypes = {
        name: dtype for name, dtype in meta_dtypes.items() if name != "target"
    }
    test_meta = pd.read_csv(test_set_metadata_filename, dtype=test_meta_dtypes)

    return (train, train_meta, test, test_meta)


def etl(df, df_meta):
    """
    Aggregate the light-curve frame per ``object_id`` and join it onto the metadata.

    Parameters
    ----------
    df : modin.pandas.DataFrame
        Light-curve rows; two helper columns are added to it in place.
    df_meta : modin.pandas.DataFrame
        Metadata rows keyed by ``object_id``.

    Returns
    -------
    modin.pandas.DataFrame
        `df_meta` without the ``ra``, ``decl``, ``gal_l`` and ``gal_b``
        columns, left-merged with the per-object aggregates.
    """
    # workaround for Modin_on_ray. Eventually this should be fixed
    df["flux_ratio_sq"] = (df["flux"] / df["flux_err"]) * (
        df["flux"] / df["flux_err"]
    )  # np.power(df["flux"] / df["flux_err"], 2.0)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]

    aggs = {
        "passband": ["mean"],
        "flux": ["min", "max", "mean", "skew"],
        "flux_err": ["min", "max", "mean"],
        "detected": ["mean"],
        "mjd": ["max", "min"],
        "flux_ratio_sq": ["sum"],
        "flux_by_flux_ratio_sq": ["sum"],
    }
    agg_df = df.groupby("object_id", sort=False).agg(aggs)

    agg_df.columns = ravel_column_names(agg_df.columns)

    agg_df["flux_diff"] = agg_df["flux_max"] - agg_df["flux_min"]
    agg_df["flux_dif2"] = agg_df["flux_diff"] / agg_df["flux_mean"]
    agg_df["flux_w_mean"] = (
        agg_df["flux_by_flux_ratio_sq_sum"] / agg_df["flux_ratio_sq_sum"]
    )
    agg_df["flux_dif3"] = agg_df["flux_diff"] / agg_df["flux_w_mean"]
    agg_df["mjd_diff"] = agg_df["mjd_max"] - agg_df["mjd_min"]

    agg_df = agg_df.drop(["mjd_max", "mjd_min"], axis=1)

    agg_df = agg_df.reset_index()

    df_meta = df_meta.drop(["ra", "decl", "gal_l", "gal_b"], axis=1)

    df_meta = df_meta.merge(agg_df, on="object_id", how="left")

    return df_meta


def ml(train_final, test_final):
    """
    Train an XGBoost classifier and return its weighted log loss on the validation split.

    Predictions for the test set are computed as part of the measured work but
    are not returned.
    """
    X_train, y_train, X_test, y_test, Xt, classes, class_weights = split_step(
        train_final, test_final
    )

    cpu_params = {
        "objective": "multi:softprob",
        "eval_metric": "merror",
        "tree_method": "hist",
        "nthread": 16,
        "num_class": 14,
        "max_depth": 7,
        "verbosity": 1,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
    }

    func_loss = partial(
        xgb_multi_weighted_logloss, classes=classes, class_weights=class_weights
    )

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dvalid = xgb.DMatrix(data=X_test, label=y_test)
    dtest = xgb.DMatrix(data=Xt)

    watchlist = [(dvalid, "eval"), (dtrain, "train")]

    # NOTE(review): newer XGBoost releases appear to deprecate ``feval`` in
    # favor of ``custom_metric`` - confirm against the version installed in the image.
    clf = xgb.train(
        cpu_params,
        dtrain=dtrain,
        num_boost_round=60,
        evals=watchlist,
        feval=func_loss,
        early_stopping_rounds=10,
        verbose_eval=None,
    )

    yp = clf.predict(dvalid)
    cpu_loss = multi_weighted_logloss(y_test, yp, classes, class_weights)
    ysub = clf.predict(dtest)  # noqa: F841 (unused variable)

    return cpu_loss


def main():
    """Run the read, ETL and ML stages on the four files named on the command line."""
    if len(sys.argv) != 5:
        print(
            "USAGE: docker run --rm -v /path/to/dataset:/dataset modin-ray python plasticc.py "
            "<training set file> <test set file> "
            "<training set metadata file> <test set metadata file>"
        )
        return

    dtypes, meta_dtypes = create_dtypes()

    train, train_meta, test, test_meta = measure(
        "Reading",
        read,
        sys.argv[1],
        sys.argv[2],
        sys.argv[3],
        sys.argv[4],
        dtypes,
        meta_dtypes,
    )
    train_final, test_final = measure(
        "ETL", all_etl, train, train_meta, test, test_meta
    )
    cpu_loss = measure("ML", ml, train_final, test_final)

    print("validation cpu_loss:", cpu_loss)


if __name__ == "__main__":
    main()

================================================
FILE: examples/jupyter/Modin_Taxi.ipynb
================================================
{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "cc4bd9e9", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# To run this notebook as done in the README GIFs, you must first locally download the 2015 NYC Taxi Trip Data.\n", "import urllib.request\n", "url_path = \"https://modin-datasets.intel.com/green-taxi/green_tripdata_2015-01.csv\"\n", "urllib.request.urlretrieve(url_path, \"taxi.csv\")\n", "\n", "from modin.config import Engine\n", "Engine.put(\"dask\")\n", "from dask.distributed import Client\n", "client = Client(n_workers=12)\n", "\n", "from modin.config import BenchmarkMode\n", "BenchmarkMode.put(True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "97b245e5", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "import modin.pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "id": "b65b121c", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.57 s, sys: 683 ms, total: 2.26 s\n", "Wall time: 14.2 s\n" ] } ], "source": [ "%time df = pd.read_csv(\"taxi.csv\", parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)" ] }, { "cell_type": "code", "execution_count": 4,
"id": "c48193b2", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 138 ms, sys: 27.3 ms, total: 166 ms\n", "Wall time: 404 ms\n" ] } ], "source": [ "%time isnull = df.isnull()" ] }, { "cell_type": "code", "execution_count": 5, "id": "1d32ed7c", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 175 ms, sys: 28.4 ms, total: 203 ms\n", "Wall time: 663 ms\n" ] } ], "source": [ "%time rounded_trip_distance = df[[\"pickup_longitude\"]].applymap(round)" ] }, { "cell_type": "code", "execution_count": null, "id": "3ef271dc", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.11" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/jupyter/Pandas_Taxi.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5d674ce8", "metadata": {}, "outputs": [], "source": [ "# To run this notebook as done in the README GIFs, you must first locally download the 2015 NYC Taxi Trip Data.\n", "import urllib.request\n", "url_path = \"https://modin-datasets.intel.com/green-taxi/green_tripdata_2015-01.csv\"\n", "urllib.request.urlretrieve(url_path, \"taxi.csv\")\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "27f7321c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "id": "8de98215", "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "CPU times: user 30.7 s, sys: 4.25 s, total: 35 s\n", "Wall time: 35.3 s\n" ] } ], "source": [ "%time df = pd.read_csv(\"taxi.csv\", parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)" ] }, { "cell_type": "code", "execution_count": 4, "id": "14422c3f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.37 s, sys: 300 ms, total: 1.67 s\n", "Wall time: 1.67 s\n" ] } ], "source": [ "%time isnull = df.isnull()" ] }, { "cell_type": "code", "execution_count": 5, "id": "f8f87974", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.07 s, sys: 305 ms, total: 3.37 s\n", "Wall time: 3.37 s\n" ] } ], "source": [ "%time rounded_trip_distance = df[[\"pickup_longitude\"]].applymap(round)" ] }, { "cell_type": "code", "execution_count": null, "id": "2c7d62bf", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.11" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/jupyter/integrations/NLTK.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating NLTK Modin Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## All the examples in this section are taken / adapted from https://www.kirenz.com/post/2021-12-11-text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import nltk\n", "from nltk.tokenize import RegexpTokenizer\n", "from nltk.corpus import stopwords" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import some Tweets from Barack Obama \n", "modin_df = pd.read_csv(\"https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv\")\n", "modin_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df['text'] = modin_df['text'].astype(str).str.lower()\n", "modin_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "regexp = RegexpTokenizer('\\w+')\n", "\n", "modin_df['text_token']=modin_df['text'].apply(regexp.tokenize)\n", "modin_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Make a list of english stopwords\n", "stopwords = nltk.corpus.stopwords.words(\"english\")\n", "\n", "# Extend the list with your own custom stopwords\n", "my_stopwords = ['https']\n", "stopwords.extend(my_stopwords)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Remove stopwords\n", "modin_df['text_token'] = modin_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])\n", "modin_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df['text_string'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))\n", "modin_df[['text', 'text_token', 'text_string']].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nltk.download('punkt')" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [ "all_words = ' '.join([word for word in modin_df['text_string']])\n", "tokenized_words = nltk.tokenize.word_tokenize(all_words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "\n", "fdist = FreqDist(tokenized_words)\n", "fdist" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df['text_string_fdist'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))\n", "modin_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#lemmatization\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.stem import WordNetLemmatizer\n", "\n", "wordnet_lem = WordNetLemmatizer()\n", "\n", "modin_df['text_string_lem'] = modin_df['text_string_fdist'].apply(wordnet_lem.lemmatize)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check if the columns are equal\n", "modin_df['is_equal']= (modin_df['text_string_fdist']==modin_df['text_string_lem'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# show level count\n", "modin_df.is_equal.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_words_lem = ' '.join([word for word in modin_df['text_string_lem']])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "from wordcloud import WordCloud\n", "\n", "wordcloud = WordCloud(width=600, \n", " height=400, \n", " random_state=2, \n", " max_font_size=100).generate(all_words_lem)\n", "\n", 
"plt.figure(figsize=(10, 7))\n", "plt.imshow(wordcloud, interpolation='bilinear')\n", "plt.axis('off');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Replicating NLTK workflow with pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import some Tweets from Barack Obama as pandas df\n", "pandas_df = pandas.read_csv(\"https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df['text'] = pandas_df['text'].astype(str).str.lower()\n", "pandas_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "regexp = RegexpTokenizer('\\w+')\n", "\n", "pandas_df['text_token']=pandas_df['text'].apply(regexp.tokenize)\n", "pandas_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Remove stopwords\n", "pandas_df['text_token'] = pandas_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])\n", "pandas_df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df['text_string'] = pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))\n", "pandas_df[['text', 'text_token', 'text_string']].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_words = ' '.join([word for word in pandas_df['text_string']])\n", "tokenized_words = nltk.tokenize.word_tokenize(all_words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "\n", "fdist = FreqDist(tokenized_words)\n", "fdist" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df['text_string_fdist'] = 
pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))\n", "pandas_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.stem import WordNetLemmatizer\n", "\n", "wordnet_lem = WordNetLemmatizer()\n", "\n", "pandas_df['text_string_lem'] = pandas_df['text_string_fdist'].apply(wordnet_lem.lemmatize)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check if the columns are equal\n", "pandas_df['is_equal']= (pandas_df['text_string_fdist']==pandas_df['text_string_lem'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# show level count\n", "pandas_df.is_equal.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_words_lem = ' '.join([word for word in pandas_df['text_string_lem']])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "from wordcloud import WordCloud\n", "\n", "wordcloud = WordCloud(width=600, \n", " height=400, \n", " random_state=2, \n", " max_font_size=100).generate(all_words_lem)\n", "\n", "plt.figure(figsize=(10, 7))\n", "plt.imshow(wordcloud, interpolation='bilinear')\n", "plt.axis('off');" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } 
================================================ FILE: examples/jupyter/integrations/altair.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Altair Modin Interoperability\n", "### Currently Altair is not interoperable with Modin. Each visualization is created with a Modin and then pandas dataframe for comparison." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import altair as alt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from vega_datasets import data\n", "pandas_cars = data.cars()\n", "modin_cars = pd.DataFrame(data.cars())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "alt.Chart(modin_cars).mark_point().encode(\n", " x='Horsepower',\n", " y='Miles_per_Gallon',\n", " color='Origin',\n", ").interactive()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "alt.Chart(pandas_cars).mark_point().encode(\n", " x='Horsepower',\n", " y='Miles_per_Gallon',\n", " color='Origin',\n", ").interactive()" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/bokeh.ipynb ================================================ { "cells": [ { "cell_type": 
"markdown", "metadata": {}, "source": [ "# Demonstrating Bokeh Modin Interoperability\n", "### Currently Boken is not interoperable with Modin. Each visualization is created with a Modin and then pandas dataframe for comparison." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "from bokeh.plotting import figure\n", "from bokeh.models import ColumnDataSource\n", "from bokeh.io import show" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: `from_dict` is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", "Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.\n", "2023-04-06 12:14:58,510\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n", "UserWarning: When using a pre-initialized Ray cluster, please ensure that the runtime env sets environment variable __MODIN_AUTOIMPORT_PANDAS__ to 1\n" ] }, { "ename": "ValueError", "evalue": "expected a dict or pandas.DataFrame, got x_values y_values\n0 1 6\n1 2 7\n2 3 2\n3 4 3\n4 5 6", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/qj/jybppsbd2jl75s8y2q8s2xx80000gn/T/ipykernel_5953/1336630338.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# create a ColumnDataSource by passing the dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0msource\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mColumnDataSource\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodin_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/opt/anaconda3/lib/python3.9/site-packages/bokeh/models/sources.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[0mraw_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data_from_groupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 231\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"expected a dict or pandas.DataFrame, got {raw_data}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 232\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: expected a dict or pandas.DataFrame, got x_values y_values\n0 1 6\n1 2 7\n2 3 2\n3 4 3\n4 5 6" ] } ], "source": [ "# Create a visualization with Modin df \n", 
"modin_data = pd.DataFrame.from_dict({'x_values': [1, 2, 3, 4, 5], 'y_values': [6, 7, 2, 3, 6]})\n", "\n", "# create a ColumnDataSource by passing the dict\n", "source = ColumnDataSource(modin_data)\n", "\n", "p = figure()\n", "p.circle(x='x_values', y='y_values', source=source)\n", "show(p)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df \n", "pandas_data = pandas.DataFrame.from_dict({'x_values': [1, 2, 3, 4, 5], 'y_values': [6, 7, 2, 3, 6]})\n", "\n", "# create a ColumnDataSource by passing the dict\n", "source = ColumnDataSource(pandas_data)\n", "\n", "p = figure()\n", "p.circle(x='x_values', y='y_values', source=source)\n", "show(p)" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/huggingface.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Hugging Face Modin Interoperability\n", "## All the examples in this section are taken/ adapted from https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import numpy as np # linear algebra" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "import sklearn\n", "from tqdm 
import tqdm" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import urllib.request\n", "url_path = \"https://modin-datasets.intel.com/testing/IMDB_Dataset.csv\"\n", "urllib.request.urlretrieve(url_path, \"imdb.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "modin_df = pd.read_csv(\"imdb.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(modin_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df.sample()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import BertTokenizer, TFBertForSequenceClassification" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Loading the BERT Classifier and Tokenizer along with Input module\n", "from transformers import InputExample, InputFeatures\n", "\n", "model = TFBertForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n", "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# changing positive and negative into numeric values\n", "\n", "def cat2num(value):\n", " if value=='positive': \n", " return 1\n", " else: \n", " return 0\n", " \n", "modin_df['sentiment'] = modin_df['sentiment'].apply(cat2num)\n", "train = modin_df[:45000]\n", "test = modin_df[45000:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# But first see BERT tokenizer examples and other required stuff!\n", "\n", "example='In
this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'\n", "tokens=tokenizer.tokenize(example)\n", "token_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "print(tokens)\n", "print(token_ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def convert_data_to_examples(train, test, review, sentiment): \n", " train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case\n", " text_a = x[review], \n", " label = x[sentiment]), axis = 1)\n", "\n", " validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case\n", " text_a = x[review], \n", " label = x[sentiment]), axis = 1,)\n", " \n", " return train_InputExamples, validation_InputExamples\n", "\n", "train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'review', 'sentiment')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):\n", " features = [] # -> will hold InputFeatures to be converted later\n", "\n", " for e in tqdm(examples):\n", " input_dict = tokenizer.encode_plus(\n", " e.text_a,\n", " add_special_tokens=True, # Add 'CLS' and 'SEP'\n", " max_length=max_length, # truncates if len(s) > max_length\n", " return_token_type_ids=True,\n", " return_attention_mask=True,\n", " pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length\n", " truncation=True\n", " )\n", "\n", " input_ids, token_type_ids, attention_mask = (input_dict[\"input_ids\"],input_dict[\"token_type_ids\"], input_dict['attention_mask'])\n", " features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
label=e.label) )\n", "\n", " def gen():\n", " for f in features:\n", " yield (\n", " {\n", " \"input_ids\": f.input_ids,\n", " \"attention_mask\": f.attention_mask,\n", " \"token_type_ids\": f.token_type_ids,\n", " },\n", " f.label,\n", " )\n", "\n", " return tf.data.Dataset.from_generator(\n", " gen,\n", " ({\"input_ids\": tf.int32, \"attention_mask\": tf.int32, \"token_type_ids\": tf.int32}, tf.int64),\n", " (\n", " {\n", " \"input_ids\": tf.TensorShape([None]),\n", " \"attention_mask\": tf.TensorShape([None]),\n", " \"token_type_ids\": tf.TensorShape([None]),\n", " },\n", " tf.TensorShape([]),\n", " ),\n", " )\n", "\n", "\n", "DATA_COLUMN = 'review'\n", "LABEL_COLUMN = 'sentiment'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_InputExamples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)\n", "train_data = train_data.shuffle(100).batch(32).repeat(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)\n", "validation_data = validation_data.batch(32)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), \n", " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \n", " metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.fit(train_data, epochs=2, validation_data=validation_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pred_sentences = ['worst movie of my life, will never watch movies from this series', \n", " 'Wow, 
blew my mind, what a movie by Marvel, animation and story is amazing']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf') # we are tokenizing before sending into our trained model\n", "tf_outputs = model(tf_batch) \n", "tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1) # axis=-1, this means that the index that will be returned by argmax will be taken from the *last* axis.\n", "labels = ['Negative','Positive']\n", "label = tf.argmax(tf_predictions, axis=1)\n", "label = label.numpy()\n", "for i in range(len(pred_sentences)):\n", " print(pred_sentences[i], \": \", labels[label[i]])" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/matplotlib.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Matplotlib Modin Interoperability" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "# Example modified from 
https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/xcorr_acorr_demo.html#sphx-glr-gallery-lines-bars-and-markers-xcorr-acorr-demo-py\n", "\n", "# Fixing random state for reproducibility\n", "np.random.seed(19680801)\n", "\n", "x = pd.DataFrame(np.random.randn(100, 1),columns=[\"Col_1\"])\n", "y = pd.DataFrame(np.random.randn(100, 1),columns=[\"Col_1\"])\n", "\n", "fig, [ax1, ax2] = plt.subplots(2, 1, sharex=True)\n", "ax1.xcorr(x[\"Col_1\"], y[\"Col_1\"], usevlines=True, maxlags=50, normed=True, lw=2)\n", "ax1.grid(True)\n", "\n", "ax2.acorr(x[\"Col_1\"], usevlines=True, normed=True, maxlags=50, lw=2)\n", "ax2.grid(True)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "# Example modified from https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/xcorr_acorr_demo.html#sphx-glr-gallery-lines-bars-and-markers-xcorr-acorr-demo-py\n", "\n", "# Fixing random state for reproducibility\n", "np.random.seed(19680801)\n", "\n", "x = pandas.DataFrame(np.random.randn(100, 1),columns=[\"Col_1\"])\n", "y = pandas.DataFrame(np.random.randn(100, 1),columns=[\"Col_1\"])\n", "\n", "fig, [ax1, ax2] = plt.subplots(2, 1, sharex=True)\n", "ax1.xcorr(x[\"Col_1\"], y[\"Col_1\"], usevlines=True, maxlags=50, normed=True, lw=2)\n", "ax1.grid(True)\n", "\n", "ax2.acorr(x[\"Col_1\"], usevlines=True, normed=True, maxlags=50, lw=2)\n", "ax2.grid(True)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "# Example modified from https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "modin_df = pd.DataFrame({'names':['group_a', 'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", 
"plt.subplot(131)\n", "plt.bar(modin_df['names'], modin_df['values'])\n", "plt.subplot(132)\n", "#plt.scatter(df['names'], df['values'])\n", "#plt.subplot(133)\n", "plt.plot(modin_df['names'], modin_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "# Example modified from https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "pandas_df = pandas.DataFrame({'names':['group_a', 'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.bar(pandas_df['names'], pandas_df['values'])\n", "plt.subplot(132)\n", "\n", "plt.plot(pandas_df['names'], pandas_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "# Example modified from https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "modin_df = pd.DataFrame({'names':['group_a', 'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.barh(modin_df['names'], modin_df['values'])\n", "plt.subplot(132)\n", "#plt.scatter(df['names'], df['values'])\n", "#plt.subplot(133)\n", "plt.plot(modin_df['names'], modin_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "# Example modified from 
https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "pandas_df = pandas.DataFrame({'names':['group_a', 'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.barh(pandas_df['names'], pandas_df['values'])\n", "plt.subplot(132)\n", "plt.plot(pandas_df['names'], pandas_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.hlines(pandas_df['values'], 1, 3)\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "# Example modified from https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "modin_df = pd.DataFrame({'names':['group_a', 'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.bar(modin_df['names'], modin_df['values'])\n", "plt.subplot(132)\n", "#plt.scatter(df['names'], df['values'])\n", "#plt.subplot(133)\n", "plt.plot(modin_df['names'], modin_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "# Example modified from https://matplotlib.org/stable/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py\n", "\n", "names = ['group_a', 'group_b', 'group_c']\n", "values = [1, 10, 100]\n", "\n", "pandas_df = pandas.DataFrame({'names':['group_a', 
'group_b', 'group_c'],'values':[1, 10, 100]})\n", "\n", "plt.figure(figsize=(9, 3))\n", "\n", "plt.subplot(131)\n", "plt.bar(pandas_df['names'], pandas_df['values'])\n", "plt.subplot(132)\n", "#plt.scatter(df['names'], df['values'])\n", "#plt.subplot(133)\n", "plt.plot(pandas_df['names'], pandas_df['values'])\n", "plt.suptitle('Categorical Plotting')\n", "plt.show()" ] } ], "metadata": { "interpreter": { "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" }, "kernelspec": { "display_name": "Python 3.9.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/plotly.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Plotly Modin Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Currently Plotly is not completely interoperable with Modin. Each visualization is created with a Modin and then pandas dataframe for comparison." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import numpy as np\n", "import plotly.express as px\n", "import plotly.io as pio\n", "pio.renderers.default = \"notebook\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df = pd.DataFrame(dict(a=[1,3,2,4], b=[3,2,1,0]))\n", "pandas_df = pandas.DataFrame(dict(a=[1,3,2,4], b=[3,2,1,0]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig2 = px.bar(modin_df)\n", "fig2.show()\n", "# py.iplot(fig2 , filename='jupyter-basic_bar')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig2 = px.bar(pandas_df)\n", "fig2.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.line(modin_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.line(pandas_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.area(modin_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.area(pandas_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.area(modin_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.area(pandas_df)\n", 
"fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.violin(modin_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.violin(pandas_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.box(modin_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.box(pandas_df)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "fig = px.histogram(modin_df, opacity=0.5, orientation='h', nbins=5)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "fig = px.histogram(pandas_df, opacity=0.5, orientation='h', nbins=5)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with Modin df\n", "# Example from https://plotly.com/python/mapbox-county-choropleth/#choropleth-map-using-plotlyexpress-and-carto-base-map-no-token-needed\n", "from urllib.request import urlopen\n", "import json\n", "with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:\n", " counties = json.load(response)\n", "import modin.pandas as pd\n", "modin_df = pd.read_csv(\"https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv\",\n", " dtype={\"fips\": str})\n", "fig = px.choropleth(modin_df, geojson=counties, locations='fips', color='unemp',\n", " color_continuous_scale=\"Viridis\",\n", " 
range_color=(0, 12),\n", " scope=\"usa\",\n", " labels={'unemp':'unemployment rate'}\n", " )\n", "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a visualization with pandas df\n", "# Example from https://plotly.com/python/mapbox-county-choropleth/#choropleth-map-using-plotlyexpress-and-carto-base-map-no-token-needed\n", "from urllib.request import urlopen\n", "import json\n", "with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:\n", " counties = json.load(response)\n", "import pandas\n", "pandas_df = pandas.read_csv(\"https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv\",\n", " dtype={\"fips\": str})\n", "\n", "fig = px.choropleth(pandas_df, geojson=counties, locations='fips', color='unemp',\n", " color_continuous_scale=\"Viridis\",\n", " range_color=(0, 12),\n", " scope=\"usa\",\n", " labels={'unemp':'unemployment rate'}\n", " )\n", "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n", "fig.show()\n" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/seaborn.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Seaborn Modin Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### All the examples 
in this section are taken / adapted from https://seaborn.pydata.org/tutorial/introduction.html. Each visualization is created first with a Modin DataFrame and then with a pandas DataFrame for comparison." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import modin.pandas as pd" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: Distributing object. This may take some time.\n" ] }, { "data": { "text/plain": [ "modin.pandas.dataframe.DataFrame" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Apply the default theme\n", "sns.set_theme()\n", "\n", "# Load an example dataset\n", "pandas_tips = sns.load_dataset(\"tips\")\n", "modin_tips = pd.DataFrame(pandas_tips)\n", "\n", "type(modin_tips)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", "
" ], "text/plain": [ " total_bill tip sex smoker day time size\n", "0 16.99 1.01 Female No Sun Dinner 2\n", "1 10.34 1.66 Male No Sun Dinner 3\n", "2 21.01 3.50 Male No Sun Dinner 3\n", "3 23.68 3.31 Male No Sun Dinner 2\n", "4 24.59 3.61 Female No Sun Dinner 4" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "modin_tips.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwEAAAFcCAYAAACQkLIVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACZvElEQVR4nOzdZ4BU1dnA8f+902dntvdCWXrvKEXBggXEAhp7iRo1+qoxMUaNpvhGo4mvRGOKJiZGYxcQsKNYqErvvSxsYXubXu59PywsLLsLy/bdeX5fYO7MnHvO7O6Z+9xzznMUXdd1hBBCCCGEEBFD7egKCCGEEEIIIdqXBAFCCCGEEEJEGAkChBBCCCGEiDASBAghhBBCCBFhJAgQQgghhBAiwkgQIIQQQgghRISRIEB0GbfeeitlZWUA/OhHP2LPnj0dWp+HH36YV155pc3Pc+ONN/Lpp5+2+XmEEN1TZ+w7zzrrLC677DIuu+wypk+fzq9+9SuKi4sBKCws5JprrunQOgoRCYwdXQEhmmr58uW1///HP/7RgTURQoiuozP2nbfccgu33XYbALqu89JLL3H77bczb948UlJSePvttzu4hkJ0fxIEiC7hkUceAeDmm2/m5Zdf5vrrr+f555/H4/Hw3HPPkZaWxv79+7HZbNxxxx28/vrr7N+/nwsuuIBHH30UgCVLlvC3v/2NYDCI1WrlF7/4BaNGjapznj179vCzn/2s3vlvuukmZs+e3aS65ubmMnPmTNavX1/v8bx581i8eDGqqpKTk4PVauWZZ56hT58+FBcX8+tf/5p9+/ahqirXXHMNN910EwBffvklr7zyCiUlJUyYMIHf/e53qKoM5AkhTq4r9J2KonDXXXcxf/58li9fTnZ2dm2f+ec//5m8vDyKi4vJy8sjJSWFP/7xjyQnJ3PuuedyxRVXsHLlSgoKCrjsssv4yU9+ctI6//nPf2bDhg0UFRUxYMAAnn322Vb4lIXoonQhuoj+/fvrpaWluq7r+jnnnKNv2rRJX7VqlT5o0CB969atuq7r+m233aZfffXVut/v10tLS/UhQ4bohw8f1vfv369fcsklellZma7rur5r1y590qRJutvtbnZ9fvGLX+j//Oc/6x0/dOiQPnLkyAYfz507Vx8zZoxeUFCg67quP/HEE/pDDz2k67qu33PPPfozzzyj67quV1VV6TNmzNAPHDig33DDDfqPf/xjPRQK6R6PR580aZK+evXqZtdbCBFZukrfee+99+r/+Mc/6vSZL7zwgn7eeefp1dXVuq7r+p133qk///zztW15+umndV3X9cOHD+vDhg3TDx48eNI6v/DCC/qFF16oB4PBZtdfiO5CRgJEl5eZmcngwYMB6NGjB06nE7PZTHx8PFFRUVRWVrJ69WqK
ioq45ZZbat+nKAoHDx5k4MCBtcdaYyTgVIYMGUJqaioAgwcPZvHixQCsWLGCn//85wA4nU4+/PDD2vdMnz4dg8GAzWajV69elJaWtkpdhBCRq7P1nYqiYLPZ6h0fP348DocDqOkzKysra58777zzAEhJSSEhIYHKyko2btzYaJ0BRo4cidEolz9CyF+B6PLMZnOdxw117pqmMWHCBP70pz/VHisoKCA5ObnO6/r27cuCBQtaVB9FUdB1vfZxMBis87zVam3wtUajEUVRap87dOgQcXFxtc81Vr4QQjRHZ+o7dV1n69at3HDDDfWea6zPBLBYLPWeO1mdFy9ejN1ub3Y9hehOZFKx6DIMBgOhUKhZ750wYQLLly9n7969AHzzzTdceuml+Hy+1qwiANHR0QSDwdoMHB999FGT6zh37lwAqqurufnmmzlw4ECr108IEVk6e98ZDof5y1/+QlxcHOPGjWtxee3Z3wvRlclIgOgyLrroIm688Ub+/Oc/n/Z7+/btyxNPPMFPf/pTdF3HaDTyt7/9jaioqBbVac6cObz44ou1j8855xyee+45fv7zn/OjH/2I+Ph4LrrooiaV9atf/Yrf/OY3zJw5E13XufPOOxk6dGiL6ieEEJ2x73z11VdZuHAhiqIQDocZNmwYL7/8covKbOs6C9HdKLrMKxBCCCGEECKiyHQgIYQQQgghIowEAUIIIYQQQkQYCQKEEEIIIYSIMBIECCGEEEIIEWEkCBBCCCGEECLCdLkUoaWlLjSt+yY0iouzU17u6ehqtLtIbHckthmk3a0hKcnZrPd19/4TIvP3KxLbDJHZ7khsM7R+u5vbh3Y3MhLQyRiNho6uQoeIxHZHYptB2i3aViR+zpHYZojMdkdimyFy293WJAgQQgghhBAiwkgQIIQQQgghRISRIEAIIYQQQogII0GAEEIIIYQQEUaCACGEEEIIISKMBAFCCCGEEEJEGAkChBBCCCGEiDBtGgS4XC4uueQScnNzAVixYgUzZ87kggsuYM6cOW15aiGEEEII0Y0oCpg1d+1jix55G6e1pjYLAjZu3Mi1117LgQMHAPD5fDz66KP89a9/5eOPP2bLli188803bXV6IYQQQgjRTSgKmMr2UP3Rc1j8pVjceVR+8Hss7vyOrlqX1WZBwLvvvsuvf/1rkpOTAdi0aRM9e/YkKysLo9HIzJkz+fTTT9vq9EIIIYQQoptQtRDBgl0ECvZSsfBZyuY9Q7D4EFp5PorS0bXrmoxtVfCTTz5Z53FRURFJSUm1j5OTkyksLGyr0wshhBBCiG4irBgxDjqfKFc57g2LAYg55ybCmaPQ9Q6uXBfVZkHAiTRNQzkuVNN1vc7jpkpIcLRmtTqlpCRnR1ehQ0RiuyOxzSDt7iiR0H9Cx3/OHSES2wyR2e5IbDPUtNtfsJfKXd/XHvNu+ZrkAWMxJ2R0YM26rnYLAlJTUykuLq59XFxcXDtV6HSUlrrQtO4b8iUlOSkuru7oarS7SGx3JLYZpN2tVVZzdPf+EyLz9ysS2wyR2e5IbDPUtLu8pJzgxq/QPJXETrudcHUZ1avm4d6zkSo9+rRGAyI1kDpRuwUBI0aMYP/+/eTk5JCZmcmHH37I7Nmz2+v0QgghhBCiiwrpRswjZpDUZwyB2N4Y9TBJWQMJxfSU6UDN1G5BgMVi4emnn+bee+/F7/czZcoULrroovY6vRBCCCGE6MKCRgfB2L4AaIqBUFz/Dq5R19bmQcCSJUtq/z9hwgQWLlzY1qcUQgghhBBCnITsGCyEEEIIIUSEkSBACCGEEEKICCNBgBBCCCGEEBFGggAhhBBCCCEijAQBQgghhBBCRBgJAoQQQgghhIgwEgQIIYQQQggRYSQIEEIIIYQQIsJIECCEEEIIIUSEkSBACCGEEEKICCNBgBBCCCGEEBFGggAhhBBCCCEijAQBQgghhBBCRBgJAoQQQgghhIgwEgQIIYQQQggRYSQIEEIIIYQQIsJI
ECCEEEIIIUSEkSBACCGEEEKICCNBgBBCCCGEEBFGggAhhBBCCNGuFKWjayCMHV0BIYQQQggRGTyBMPsLqjlwuIqEaCv9s2KJd5g7uloRSYIAIYQQQgjR5jz+MC8v2sqmPSW1x6KsRn55y3hSY60dWLPIJNOBhBBCCCFEm9uTX1knAABw+0J8uHw/egfVKZJJECCEEEIIIdqUoijsOlTR4HOrtxfiDYTbt0JCggAhhBBCCNG2dF0nKdbW4HNJsTZMRrkkbW/yiQshhBBCiDY3uFc8FpOh3vEfnNcPkyrpgtqbBAFCCCGEEKLNJcdYeOyH4xjeLxGApFgr9101gsE9Yzu2YhFKsgMJIYQQQog2p+uQEW/nvlnD8QTCmI0qZoOMAHQUCQKEEEIIIUS7URVwWOpPCxLtS6YDCSGEEEIIEWEkCBBCCCGEECLCSBAghBBCCCFEhJEgQAghhBBCiAgjQYAQQgghhBARRoIAIYQQQgghIowEAUIIIYQQQkQYCQKEEEIIIYSIMBIECCGEEEIIEWEkCBBCCCGEECLCSBAghBBCCCFEhJEgQAghhBBCiAgjQYAQQgghhBARRoIAIYQQQgghIowEAUIIIYQQQkQYCQKEEEIIIYSIMBIECCGEEEIIEWEkCBBCCCGEECLCSBAghBBCCCFEhJEgQAghhBBCiAjTIUHAggULmDFjBjNmzOCZZ57piCoIIYQQQggRsdo9CPB6vTz55JO8/vrrLFiwgDVr1rBixYr2roYQQgghhBARq92DgHA4jKZpeL1eQqEQoVAIi8XS3tUQQgghhBAiYhnb+4QOh4P777+fiy++GJvNxrhx4xg9enR7V0MIIYQQQoiIpei6rrfnCXfs2MHDDz/MK6+8gtPp5MEHH2T48OHcfvvt7VkNIYQQQgghIla7jwQsW7aMCRMmkJCQAMCsWbN48803mxwElJa60LR2jVvaVVKSk+Li6o6uRruLxHZHYptB2t1aZTVHd+8/ITJ/vyKxzRCZ7Y7ENkPrt7u5fWh30+5rAgYOHMiKFSvweDzous6SJUsYNmxYe1dDCCGEEEKIiNXuIwGTJ09m27ZtzJo1C5PJxLBhw7jjjjvauxpCCCGEEEJErHYPAgDuuOMOufAXQgghhBCig8iOwUIIIYQQQkQYCQKEEEIIIYSIMBIECCGEEEII0QkcTaPfHjpkTYAQQgghhGhYSNPJL/OQX+zGYjbQOy2aWLupo6sluhkJAoQQQgghOomQprN4zSHeW7Kn9pjNYuSXN4+T/PadiNvt5pFHHiEnJwdVVRkyZAgzZszgT3/6E2lpaezfvx+bzcYdd9zB66+/zv79+7ngggt49NFHAXjnnXd4/fXXUVWVxMREHn/8cXr37l3nHE899RQ7d+7kr3/9KyaTiWeffZbVq1cTDocZPHgwjz32GA6Hg3PPPZfhw4ezc+dOfvrTnzJt2rQmtUGmAwkhhBBCdBL5ZZ46AQCA1x/i3x9tw+0NdlCtxIkWL16M2+1mwYIFvP/++wDk5uayefNm7rjjDhYsWIDD4eDll1/mpZdeYt68ebz55psUFhaycuVK/vnPf/Laa6+xcOFCLrnkEu655x50vWYzR13XeeKJJ8jPz+cf//gHUVFRvPzyyxgMBubNm8fChQtJTk7m2Wefra1Pv379+OSTT5ocAICMBAghhBBCdBr5Je4Gj+/Nq6Sw3IPTJPdvO4MxY8YwZ84cbrzxRiZOnMjNN99MWVkZmZmZDB48GIAePXrgdDoxm83Ex8cTFRVFZWUlS5cuZfr06cTHxwMwa9YsnnzySXJzcwF49dVXKS0t5YMPPsBsNgPw9ddfU11dzYoVKwAIBoMkJCTU1mfs2LGn3QYJAoQQQgghOgmLydDgcaNBwWKUAKCzyMrKYvHixXz33XesWrWKH/7whzzxxBO1F+1HGY31L7U1Tat3TNd1QqEQAOPGjWP06NE88sgjvPPOO5hMJjRN49FHH2XKlClAzXQkv99f+367
3X7abZDfJiGEEEKITqJ3WjQ2S/0Lx4sn9iIt0dEBNRINefPNN3nkkUeYPHkyP//5z5k8eTLbtm1r0nvPOussPv74Y8rKygCYO3cusbGx9OzZE4ChQ4dyww034HQ6efHFFwGYPHkyb7zxBoFAAE3TePzxx3nuueda1AYJAoQQQgghOolYu4lf3jKOvpmxQM0IwMyzejNtbBaqqnRs5UStyy+/nHA4zPTp05k1axbV1dUMGDCgSe+dNGkSt9xyCzfffDMzZszggw8+4KWXXkJVj12WK4rCU089xZtvvsm6deu4++67ycjI4IorrmD69Onous7DDz/cojYo+tFVCF1EaakLTetSVT4tSUlOiourO7oa7S4S2x2JbQZpd2uV1Rzdvf+EyPz9isQ2Q/dvd0jTqXAHMBlVYuwm0Lt/mxvT2u2WLEs1ZE2AEEIIIUQnY1QVEp2WmgfdO3YXHUSmAwkhhBBCCBFhJAgQQgghhBAiwkgQIIQQQgghRISRIEAIIYQQQogII0GAEEIIIYQQEUaCACGEEEIIISKMBAFCCCGEEEJ0Erm5uQwYMIDly5fXOX7uueeSm5vbaueRIEAIIYQQQohOxGQy8fjjj+NyudrsHLJZmBBCCCGEEKfh67WHeO2T7ZSUe0mMs3HTxYOYOiar1cpPTk5m4sSJPPPMM/zv//5vnef+/ve/s3DhQgwGA5MmTeLnP/85BoPhtM8hIwFCCCGEEEI00ddrD/HiexspLveiA8XlXl58byNfrz3Uqud5+OGHWbZsWZ1pQd9++y1Llixh7ty5zJ8/n5ycHN5+++1mlS9BgBBCCCGEEE302ifb8QfDdY75g2Fe+2R7q57H4XDwv//7v3WmBa1atYoZM2Zgs9kwGo3Mnj2blStXNqt8CQKEEEIIIYRoopJy72kdb4nJkyfXTgsC0DSt3mtCoVCzypYgQAghhBBCiCZKjLOd1vGWOjotqKioiDPPPJOPPvoIn89HKBRi7ty5nHnmmc0qV4IAIYQQQgghmuimiwdhMdVdiGsxGbjp4kFtcr6j04KCwSBTp05l6tSpzJ49mxkzZpCens4NN9zQrHIlO5AQQgghhBBNdDQLUFtlB8rMzGTJkiV1jk2ePJmdO3cCcPfdd3P33Xe3+DwSBAjRzRkMCgDhsN7BNRFCCCFAA0IhDbNJhS761TR1TFarpgTtCBIECNFNufwh9uZVsXp7IaqqMG5QCtlp0URZTj+XsBBCCNFSYV1nV24lC5buo6jMy5lDUzl3TCaJTktHVy0iSRAgRDdU5Q3ywvsb2ZdXVXts2cZ8BvWK567Lh+K0yp++EEKI9rXtQAVz3llf+/jTVTl8t+Uwv7p1PDF2UwfWLDLJwmAhuhlFUVi2qaBOAHDU9gNlrNtVjKJ0QMWEEEJErEBY47+f7ah3vNzlZ09eZQfUSEgQIEQ34/aH+HD5/kafX7h0H75Q/TzDQgghRFvxBzWKKxrJr1/Z+vn1xalJECBENxMK6/gC4Uafr/YEZJGwEEKIdmUzG+ibGdvgcxmJjvatjAAkCBCi27FbDPTJiGn0+aF9ErGa5U9fCCFE+zGqCtdd0B+DWnc+6sCecfROc3ZQrTqfJ554gvvuu6/OsWXLlnHeeefhcrla9VxyJSBEN2NUFa48t2+jz18yqTcqsihACHGMMeTGGD42JcMarmrXtUOyTiky9Ep28Nvbz2TW1D5MGp7G3bOGc/cVw4iySLKKo372s5+xZcsWvvzySwA8Hg+/+c1veOqpp3A4WnfERD51IbqhvunR3HfVCP790TaqPUEAYh0Wbps5mN4pMuwqhDjGFHLjW/0+itGCZezlqK4iyhbNIW7GfQRie6O34exBRQFT0VYwmgkl9MdUeQDNVUYobSS63KzoltLjbWRM6IWqdt39a/RwkMPvPg1AyuwHKZz7LACpP3gYxdCyLEdRUVH87ne/49FHH2XChAm88MILnHvuudhsNq699lp8Ph9xcXH89re/JSsri3//
+9/Mnz8fVVUZPnw4TzzxRJPPJUGAEN2QQVEY1TeBfndO5HC5B1WBlDg7drPsESCEqMvgL8OzbSmEQ2ieCvwHNqH5PXg2fYll8o2ElLbL4W4NlFK86HkA4s7/IaVLXkMP+km+6Wm8luQ2O6/oWLquE2586Vqnd/jdp/Ed3AbAwRfuQA+Hao+nXft4i8ufOHEikydP5pFHHmHfvn28+eabXH/99fz9738nPT2dpUuX8vjjj/PKK6/w0ksvsXTpUgwGA7/85S8pLCwkJSWlSeeRIECIbkrXIcpioE+qzLUUQjTO7+hBwuUPUjr/Wbw7VwFg6zsW64QfEGzDAADAa0og/tIHKP3gWco+fQlQiJ95Pz5LUpueV4jWoIcC6KEAAIrR3KplP/zww0ydOpW//OUvFBQUcOjQIX784x/XPu9yuTAYDIwaNYorr7yS8847jx/+8IdNDgBAggAhhBAioum6jmK2o6gq+pHswWpULLraPps3KRY7iqKiEwZFQbU5auYJdc2ZIiICpMx+sGYE4EgAAKAYjKTM/nmrncPhcBAdHU1GRgYul4vMzEwWLFgAQDgcpqSkBIC//vWvbNiwgW+//Zbbb7+dZ599lvHjxzfpHLIwWAghhIhgVnceZfOeRg8FMCX3BtWIe+MXBNcvwqD72/bcgVJK5z2DroVxnnk5KCql8/6A1VvYpucVoiUK5z5bOwXoKD0conDuH9vkfNnZ2VRWVrJmzRoA5s6dy4MPPkhZWRnTp0+nf//+3H///UyaNImdO3c2uVwZCRBCCCEimGZ2YErri2q0EDX1ZvTyXMoWPIc5axAh1dymd+QD5lhiz7kJxWRFyxhBQvoAwuWHCVgT2u6kQrQSxWhGMRjrBQStzWw28/zzz/Pkk0/i9/txOBw888wzxMfHc/XVV3PllVdis9no3bs3s2fPbnK5iq635br/1lda6kLTulSVT0tSkpPi4uqOrka7i8R2R2KbQdrdWmU1R3fvPyEyf79ao83mQAW6ohI0RaMoCtZACT5zPLre9hl6FMKAWpsNSCWMxqmTGMjPOnK0drub24ce1ZbZgdqTjAQIIYQQES5gjq39v67reE0J7TYnXz/hgr8pAYAQHUkxmOpkAWqNjEAdQdYECCGEEEIIEWEkCBBCCCGEECLCSBAghBBCCCFEhJEgQAghhBBCiAgjQYAQQgghhBARRoIAIYQQQkQERQGjUUVV2z71qRCdXYcEAUuWLGHWrFlcfPHF/O53v+uIKgghhBAigpS6Any2JpenXl/L3G/3UVDuRZFYQLSizZs3c99993V0NZqs3fcJOHToEL/+9a957733SEhI4Oabb+abb75hypQp7V0VIYQQQkSAMleAp15dTbnLD8COnHI+W5XD47eOJyPe3sG1E93FsGHDeOGFFzq6Gk3W7kHA4sWLmT59OqmpqQDMmTMHi8XS3tUQQgghRCdX6Q1iNKg4LEZ0vXm7lykKbNpbUhsAHBUIaXz+/UF+ePHAdtsYTXQfbrebRx55hJycHFRVZciQIcyYMYMnn3ySDz/8kNtuu42SkhIAPB4Phw4d4tNPPyU9PZ1nn32W1atXEw6HGTx4MI899hgOh6Pd29Du04FycnIIh8PcddddXHbZZbz55pvExMS0dzWEEEII0UlVeIK88vF2fvrCUh76y3K+3VRAMKw1qyxVVdl2oLzB57buKyMQal65IrItXrwYt9vNggULeP/99wHIzc2tff6VV15hwYIFvPfee6SkpPDTn/6UXr168fLLL2MwGJg3bx4LFy4kOTmZZ599tkPa0O4jAeFwmDVr1vD6669jt9v58Y9/zPz585k1a1aT3p+Q0P6RUntLSnJ2dBU6RCS2OxLbDNLujhIJ/Sd0/OfcEbpTmwOhMP/4aC0rNxcA4PWH+PdH20hOsDN5REad1za13QN6xrFme2H94z3iSE50YDIaWl7xdtKdftano7O1e8yYMcyZM4cbb7yRiRMncvPNN1NWVlbnNZqm
8eCDD5Kdnc0dd9wBwNdff011dTUrVqwAIBgMkpCQ0O71hw4IAhITE5kwYQLx8fEAnH/++WzatKnJQUBpqQtN677jdklJToqLqzu6Gu2uPdtdXOVnT14FBoNK/8xYYu2mdjnvieRnHVlas93N/TLs7v0nRObvV3drc7knWBsAHO+bdbkMzoolfGRE4HTaPaJPAvOtRjy+UO0xVVW48MweVJR7Wqfi7aC7/aybqrXb3RoBRVZWFosXL+a7775j1apV/PCHP+SJJ56o85onn3wSr9fLnDlzao9pmsajjz5auxbW7Xbj99edqtZe2j0IOOecc/jFL35BVVUVUVFRLF26lPPOO6+9qyEiVEm1n9+88h1ef80XQXKcncduGYvD0u5/CkIIIRpgVBUsJgP+YLjO8VinpdnrAhKdFn71w/F8uzGfDbuK6ZMRw/nje5CVaJf1AKJZ3nzzTdauXcuzzz7LWWedRWlpKdu2bat9/uWXX2b9+vW8/vrrGAzHRpomT57MG2+8wYQJEzAajTz++OPY7fYOyZbZ7lc+I0aM4Pbbb+e6664jGAwyadIkZs+e3d7VEBFqf0FVbQAAUFTuIbfIzcAsWZcihBCdQbTNyJXn9OWNz3fWHjOoChOHprVoJCs5xspVU7KZdXY2BkWpCSgkABDNdPnll/P9998zffp0bDYbaWlpDBgwgE8//ZTCwkKee+45evfuzQ033ICm1Yxe3Xfffdx9990888wzXHHFFYTDYQYNGsTDDz/cIW3okNufV155JVdeeWVHnFpEOJOx/lp4k0n2zBNCiM5C1+Gs4Wkkxtr4dkMeCTFWzhqZQWYr3LXX9ZqMKM0dURDiKLvdzp/+9Kd6x6dNmwbAjh07Gn3vr3/967aq1mmRORAiovRJj6FPRgx78yoBmDA0lczEqA6ulRBCiOOZjSojsuMZ3S8RXddrRgDkul2IViVBgIgoTquRn10zkrwSNwaDSnqCHbNBRgKEEAJAOTpNhpr8+h19wzzczLSgQohTkyBARByryUCftOiOroYQQnQq5mAleuEu9IzhGDzF6D43gYQBHV2tRgVDYUqq/fj8IRKirdjMXSfNpxCdgQQBQgghhECpzKP0478QNXQq3j1rMMam4LzkAQJq59tfwuUP8dbcjXyx+hC6DrEOC3ddMYz+mdEybUiIJpJ5EEIIIYQglDiA6AmzcG/5Gj3oJ/biuztlAKAo8NW6PBZ/f6h2ulKFy88f3lhLYbmvYysnRBfS5CCgsrISl8vVlnURQgghRAcxugpwrf8c1WJHDwfxbluKQet8F9XVvhAfLd9f77im6ewrqOyAGgnRNZ0yCNi3bx+zZ89mwoQJnHHGGdxwww3k5+e3R92EEEII0U4UdExJWSRc9ztip90GWrhTTq3RNJ1gIwuGAyFZSCy6B5fLxSWXXEJubm6bneOUQcAjjzzCVVddxcaNG1m/fj0XXnghv/zlL9usQkIIIYRofz5HFo6L7sdnikfvPRHDiBmEDdaOrlY9TpuJCUPTGnyuV4qznWsjROvbuHEj1157LQcOHGjT85wyCPB6vVxzzTWYTCbMZjM33ngjJSUlbVopIYQQQrS/gFJz0a9hIKx2vgAAQAGumNKHlHh7nePXXtBf9n0R7SoUClFSUkIoFGrVct99911+/etfk5yc3KrlnuiU2YGys7NZt24do0ePBmDXrl1kZma2aaWEEEIIIRqT4DDz9D2T2bG/FI8/RFqCnaQYKwZF6eiqiQixceNG7r//fgKBAGazmeeff54RI0a0StlPPvlkq5RzKqcMAvLz87nxxhsZMGAARqORbdu2kZSUxMyZMwFYtGhRm1dSCCGEEOJ4ibE2BmTGdHQ1RAQKhULcf//9tQlzAoEA999/P19++SUGQ9fZr+KUQcCDDz7YHvUQQgghhBCi06uoqCAQCNQ5FggEKC8vJzExsYNqdfoaDQL27t1Lnz59iIpqeH7dkCFD2qxSQgghhBBCdEaxsbGYzeY6
gYDZbCYuLq4Da3X6Gg0C/vCHP/DSSy9x1VVXkZaWhq4fyxPm9XpZuXJlu1RQCCGEEKIzUBTQO2HaVNG+jEYjzz//fL01AV1pKhCcJAh45plnqKiooE+fPrz++uvouo6iKASDQW644Yb2rKMQopWoqoKiKIQbybEthBCivgpPkN25FRSVe8lMdtAnPRqH5ZQzqkU3NmLECL788kvKy8uJi4trkwBgyZIlrV7m8Rr9Df7Zz37G8uXLURSFCRMm1B43GAxceOGFbVopIUTr0tE5WOxh1dbDFJV5OGNIKoN7xuGwypeYEKJ1WL2HCVpiCatWDJoPk78Cny211cpXFHD7wxRVeDEaVGxRllYr+2RKqv089Z/VVLiOTf3okxHDvVcNJ9pqapc6iM7JYDB0qTUAJ2r0CuCVV14BajYL+/3vf99uFRJCtL5duVX84Y21tcPY63cVM7h3PPdcMQybuWsNX4ruRVF0LMFKfMZYjJoXJRwgaJKML12NpTqHkrlP4xg5DevIi/Bt+oTKjV+SMOsX+J09W+EMOlsPVvKPBVuoPHIx3ivNyR2XDSM1tg33M1Dg63V5dQIAgL15lWw/UM4ZA9s2j7sQbemUm4VJACBE1+YPabz68bZ681i37S/jQJGrYyolxBHm0l2Uvv0rrK5DBNYuwPPNq5jD7o6uljgNigK6z4Ue9FP9/ULK3/kN1d8vQg/60X0uWiN1/8ESD//35rraAADgQEE1T/1nNVXeYMtP0IhASGPV1sMNPrd+VzEGwykvo4TotOS3V4huzuULUVjmbfC50sqGjwvRbqISMUTFUvzm47jXf4p94ERCBltH10qcBl2HYPJg4mf+BIBQZREA8TPvJ5g8uMULaVVVYcXmggbLcXmD7C+obtkJTsJkUOvtTHxUepIDTZNVwqLrkiBAiCYqdwdYv7eUFdsK2ZlXiTcY7ugqNYndYiDW0fDc2Zh2mlN7SgqUuQLklrip9oVQTnHr8PinFYVWudMoOkbYZMec2rfmgcGIIS4NTb6amkeBck/N31GFJ9iufxdq2E/w8O46x4IFe1DD/haXrSgK+/KqGn2+3NXyczR6buCys7LrHTeoCmMGJAMSBJwOX0hjZ14VK7YVsn5vKWXuwKnfJNqMrAoU4lQU2H6okj+9vZ5g6FhWnYwkBw9cPZJ4h7ndq2RQwoR1Q73/N8RuNnD9RQP4y/ub6hxPT4yid5qzTevZFC5/iI9X5fD5dwfRNB2bxcjV5/fjzMEpmBsYajcHK9EObUTpOYaQMQpT6U5AJZjQT1L3dUFqyV7cm5cQe/6tuDd+QcUnfyNm9qP4lYb3qBEN8wTCLFmXx8Kl+wiFNcxGlSum9OHskenYTHX7B0WpuRhToMG/sdNhUMLoihElZy3V3y8C1UD0GZdRteoDqr9fSFxcCmrvyS26Y67rOkOyE9iTW9Hg8zbzsUsZTdc5XOHjcKmHWIeZzKSoFrexT7qTB68bzZuf7yS/xE3/HrGcMyaLtxbvYNygVEb1SyLaJpdTp1LmDvCndzaQe9w0VJNR5SdXj2JQjxiJpzqA/NYKcQqFFT6ee3Md4RO+xPKKXfxj0VYevGYUhna842YKuQhu+gTLoLMA8G9fimn4xQSNjgZfr+swIjuBn18/mg+XH6CsysdZI9KZMDStw1Pc6cDcb/bxzbrc2mNef4hXP9qO1Wxk/ICkOq83aT7cy/6Lb/dq7MP2Y+s7ltKFc1BQSLz2N/jsGe3cAtFSWmJfkq77LQFHFjEZQyAckADgNGmazpJ1ucz7em/tsUBI450vd6OoKheNy6y9CA/rOut3l/LmZztRVYVbpg9icM841Gb0YRZfMf6tS7CMnI4am4IlcxCOsTMIpw4lIaUPrnUfo6YNJtTCKTOapjN2YDIfr9hf50YMQEq8HU3XUFUFTddYsbWIfy3aWvv8xOFp3DCtP1ZT8xMgGBSFwT1i+fWt4zlY7OLDZQd4+YPN6Dps21/Oyh4F3H/ViHrBljgm
rOu8smhrnQAAIBjS+L+31vHknRNIiWnDBd6iQRIECHEKO3LK6wUAR+3MKSe/1ENWYsNzRtuCGvLg2b0az47lNQcMZuIGT4VGggAAo6owKCuWgdeOJBzWMRnUOhsAdpSiCl+dAOB4b3+xi2HZ8XW+WEMGK1EjL8S/fyOezV/h2fwVAPZRFxKydK2dGkWNkGol5KjJHuO3JHRwbbqmvGIXC5fub/C5+d/s4czBKbV3qg8Vu/nr3GOjgs+9vZ7/vWMCGfHNWIfhq6R6w2J8B7cSLM0jbur16Am9CGugpQzGMa0XAUPrBHQZCTZuu3QIi787yN68SlQFRg9MoVdaNAo1gUJJdYBXP9pW530rNhUweXg6AzNbnnEqENR44Z0NVHvqLkTedbCCg0UuBmRIVqvG5Jd52X6gvMHnNE1ne04ZKcPT27lWndeLL77IJ598AsCUKVN46KGH2uQ8MvFSiJNQVYX8kpNnKqn2tO+cRr81mfiL7iTsqiDsqiD+orvwW5NO/UZA0WsCgs4QAABUneSzq6j24wvUXXeh6xBK6EvM2dfUHlNtTuxjZxIytF8gJkRnUuUOEGpkA0B/IIzbd+yitbC8fjKA4gpPs84bjO9LzMQrCRYfBEXB1GsUQUPNzQhdV1otAKgpENLi7cQ4zFx6VjaXTM6mqNzDt+tyGdgzHoDyan+D045KK32tUgW3L1gvADiqvKrt1iV0B65TfE/mF7tRmzMc1Q2tWLGCZcuWMX/+fD744AO2bt3K4sWL2+RcEgQIcRKappOZ1PgddoDoqPZdE2DxFVH20YsYnIkYnImUffwiFl9Ru9ahtcQ4zI0uXoyPttaZ6ws1c5mNJTup+ObN2mOatxrPqnmYJK2kiFCxDjNmY8Nf5zaLkajj5qunnpDpRlEgJa55AbSpdDeVy9/DnNYHgOpv38AUaru0w1mJUcya0pdwWGP3oXLOHZPFL24cQ6y9ZsOueKcVQwMXkomxrZNtymEzNdrfx0fLVJaTcdpP/j2Zkdz1Mi2Vl5fzt7/9jXvuuYe///3vlJc3PNJxupKSknj44Ycxm82YTCb69OlDfn5+q5R9IpkOJMQpDOgRi9GgEArX76CGZMeT1pxh9BbQjFFEDZ2KuX/NTt6B3SvRTF1zDnVStJXzxmbxxepD9Z67dlp/rKa6FzbGsBfv5q8gHCJq1IXY+oyh9INn8exYjn3E+QTtXfNzEKIl0pMcXD6lD+9+ubvec1ee25cYm7l29C8z0c79PxjJm4t3YlJVrr9oIKlxzbyAtcfhHHcJ5mEXQHkumquMsKFtL4bT421cNbUPqqoQFxdFcfGx9KAJTjO3XzqElxdsqU0SMHV0Jj2TW6dfiLIYuHn6IP783sY6x4dkx5OVfPKbRZEuPd7GsD4JbN5bWu85o0FhUM+uNZ2zvLycq6++murqaoLBIOvXr2f+/Pm8/fbbxMW1rC39+vWr/f+BAwf45JNPeOutt1pa5QYpemeZF9BEpaWuLhctno6kJGedTi1SdPZ2786v4rm31+M/bnpKr7Ro7r1yOHHNHAloSZsNikZYV+v9vys4sd1uf5gv1h7io+UHCIU1HDYT110wgDEDkjA1cFfPFKpCz98KGSMIG+2YyvagqCqB2OxOnR2oNX/Hk5Kal9Wpu/ef0Pn7kraQlOTkYH4FSzcdZv43e/AHwtgsRq48ty8Th6RiaWCUIKjpKNRMD2yJo/2PogC6ht6OEwwa+lnr1CRzKCr3EOMwkx5vx9SKG3ppus6+w9UsWZtLeZWfs0amMyw7Aae1fe6pduXf73J3gBfnbmJ//rF0rxazgQeuGUX/9OiTvre1293cPvSov/3tb7z22msEg8emh5lMJm666SZ+/OMft7R6AOzevZs777yTe++9lyuuuKJVyjyRjAQI0QT9M6J5+scTOXi4Gk8gTEK0laykqAa/XNvD8Rf9XSkAaEiUxcBlk3oxZWQGXn+IaLsZh9XQ6AV90BiN0nNizZ1NvWZesq4j
6eVERLOZDFw0LpMzB6fg9gVrpq7YTI2u/2kowG6Oo/1PzWk6vi9SgNRYK6mxbTMioSoKfdOiGXDZUADCjazFaAtGjl1wKgqoWpCwYmq387dUXJSZh64bTW6xm5IqHzazgZ4pTmKjTJ36Bk5DtmzZUicAAAgGg2zZsqVVyl+7di333Xcfjz76KDNmzGiVMhsiQYAQTaDrEGMzMax3fEdXpXvSIdZuqp3be6ovhOMvbLral4cQbUXTdKJtxtpMQG090K/pUFzpI7/UjcVkoEeKo8PTDreX9rz4BzAFK/Gvno9//HQUUwqm0l0EDm3BOOwiQmrX2WHbYlTpk+akz3F71HTFPnzYsGGsX7+eQODYgmez2cywYcNaXHZBQQH33HMPc+bMYcKECS0u72Qi469VCCGEEK1G03WWby3k1Y+21V7ExTksPHTjmIjL964oCsGwhtGgtMmIpKrohHYtxb3la7z71hN7zo2UffoSejhIQmo2SuqILnkh3ZVdffXVzJ8/n6qqKoLBIGazGafTydVXX93isl955RX8fj9PP/107bFrrrmGa6+9tsVln0iCACGEEEKclsPlPv79Yd2c/OUuP699up2f/WAkamNpv7qZSm+Q5ZsKWLoxn8xkBzMn96ZHYusmKNB0BXP/yVgObcd/cCtlH70IQNSI89ES+0oA0AHi4uJ4++23efvtt9myZQvDhg3j6quvbvGiYIDHHnuMxx57rBVqeWoSBAghhBDitBwqajgV6Pb95ZS5AiQ6Le1co/YX1nVe/3QH63YWA1BY5mHj7mKe+NGZpLZSWtKjguZYHGOm4z94bDdk+7Bz8LXmXgzitMTFxbXaIuCO0vGreIQQQgjRpZiMDd/pV1UFQytm4+nMiip8tQHAUaGwzq5DFa16HkWp2ZOhbOGfah4batZOlc77AxZP2+SPF5EhMv5ShRBCCNFqeqZGN7hB2TljMomzd52MNS3R2MLrcGun4dU1dG8VejiEc9Q0km7+A5YeQ9ADXgjU3wFaiKaS6UBCCCGEOC2JTjMP3zSWv83bTHGFF0WBScPTmTmxV0dXrVYwrJFX6sHtDZIUZyMp2kprrlRIjrUxsGccO3KO7RSrKtA/K7YVzwI6KqHMUSRd+2tsKVmUeww4zvsRqr8Kv7OHpEcWzSZBgBBCCCFOi65Dr2QHT9x+BsWVPiwmA4nRlla9yG6JSm+QfyzayrZ9ZUDNxfm1Fwxgyoj0Fm+QdpRRVfjRpUP57Psclm7IJz0ximum9Scj3t4q5R9P11V8zl44o5zgqSZgigVTrAQAokUkCBBCCCFEs1iMKpkJrX/R2xKKAl+vy6sNAKBmT4M3PttJr7Ro+qS2bLfY48VFmbj23H5cNjkbs1HF0FmiICGaQNYECCGEEKLb8ATCfPpdToPP7cwpp7Wzl+q6js0kAYBoPc8//zzTp09nxowZ/Pvf/26z88hIgBBCCCFaRFFAVVU0TevwvPUKCoZGpvyorTQVSAgAn89HSUkJiYmJWK2ts0ne999/z6pVq1i4cCGhUIjp06czZcoUsrOzW6X840kQIIQQQohmK6n2s2F3CXvzKhjQI57hfROIjzJ3WH1sZgMzJ/XmnS9313tuUK/4Dg9SRNcXCoV44YUXmDt3bm3wO3v2bO677z6MxpZdWo8fP57XXnsNo9FIYWEh4XAYu71tptxJECCEEEJEAFVV0HW9VS+Ci6v8PPGv73D7QgB8t7WQOIeFR28ZR4KjYwIBXdeZNCyNvBI3yzbW5NG3mAzcMmMwWYmda/2C6JpeeOEF5s2bh9/vrz02b948FEXhgQceaHH5JpOJF154gX/9619cdNFFpKSktLjMhsiaACGEEKKNmANlWMp2oipgqdyPxXu43esQ0nS251byn093sGR9PpWeYKuUaw6UUVFUQFjTeeTSNPql1Vxgl7v8rN1Z1Opz70+Hw2rklosG8NSdE/jlzWN5+scTOXNQMmpHVkp0Cz6fj7lz5+Lz+eodf//9
9+sdb6777ruPlStXUlBQwLvvvtsqZZ5IggAh2pE3qLE7v4otOeXsPlgu2d2E6MZUFcL711Iy9xnY9iml857Gs/ZDTErrXIQ31a7cSv7437V8tS6P1z/dwT8XbSPUwg2tzJoH11f/Jm3XPH4zI5q075/nx+M1HLaajcI27Czu8J2DVUUhNc5Gn7RoYuymRjf3EuJ0lJSUoKoN/26rqkpJSUmLyt+7dy/bt28HwGazccEFF7Bz584WldkYmQ4kRDtQFNhX6OLP722kotpfe+ycMZlcflY2Dov8KQrR3WgamPtNxpKzmcqlb2OMTsI+4Sr8evvtqGswqHyzIa/Osa37Symt9pMS0/yFjEGDHee4SymZ9wy23E1oSX3YWObA5a25AOrbIxattXfOFaITSExMRNO0Bp/TdZ3ExMQWlZ+bm8sLL7zAW2+9BcCXX37J7NmzW1RmY2QkQHQIgx7E5slDQcccrMBfeKCjq9SmCit9PPPamtoAAGo221myJpdPvztIp9lhRwjRahQFlOoCAnk7QTUSqi5FK9qP0o5jgJqmk5XsqHPMYjZgM7fsxoOuA0YTypHpNbrRgjtQc2FkMqpMGJIqQYDolqxWK7Nnz66XDaix46drypQpTJ06lcsvv5zZs2czatQoZsyY0aIyGyO3H0WHMJbtpWjeH4i76C5c276lsrqM2FmP4lcdp35zF6MosHlvKYFQw3cOPl2VwzmjMklwdlw2DSFEW9DR3BWYU/sSM+02XKsXEa4sQk0NEVbaZzRA13UmDk1j9fZCDhW6MBlV7rp8WIunx5h1L1VL38KUmo111AyqP/kzZ/WfTPnYLKaOziA93iZZeES3dd9996EoCu+//36d7ED33ntvq5R/7733tlpZJ6PoXWySXGmpq1vfXUhKclJcXN3R1WhzprAb38q38WxbCopK6nW/wuPo1S2/NAwGlefnbmTdjuJGX/PLm8fSJy26HWvVcSLld/xErdnupKTm7Xja3ftP6Hy/Xwo6prCbgMGBMeRGN5gIK60b8Delzf6QRkmljyibibgoU6v0tWZfCagqQUs8Vk8BbnMcqtFKew10dLafdXuIxDZD67e7uX3oidpin4D2JCMBokMoWpCw68iW7rqO5qmCKJ3uOC9G13XS4qOAxoMAm6X95ggLIdqPjkLAUDPCGTJGneR1OsVVfkorfaTE2Ulwmpt9oe4LahwqcWEyqKQn2DEbVCxGlYyEmuw9rXWzJWBNPFp5vLa0mvnF3TvGFKIOq9VKZmZmR1ej2SQIEB1CqS4iULCHxKsexbNjBZXfLcI5oy9+5WgOZx1FUTrlyIAOhDUds1Ft0l1VTdMZPSCZj1YcaPD5Qb3jSI3rencQhBCtQ1Fgy/4K5ryzHl2vmVP/6M3j6JnUeNBwIkv5bjBH4TLEoh3cxLaDFhasKeesEelcf0F/zB2cqUcI0fl0aK/wzDPP8PDDD3dkFcRpMFGT1k5RwKgHTvraoKbjD/ix+Q6jKDoKOjbvYVTCAATi+5F0/ZME4vtjHXcVCRfeRnDHt5hCLhRFx3R4M6aSHbWLzjqDkKaz41Alz769gV/+YxUfLDtAmevkn8NRPZKjuOGiAfWOx0db+OH0wZK7WohmMOjHUm0e7Z86+zmNR96j6iFUvWaDLW9Q45+Lttbe9AiGNN7/aneTb6oHq0qp/Pq/lM1/Bn3dXMJf/52ze2iYjSpLN+aTX+I5ZRlW72GMmhcAi68IU7DytNrlC4bZmVfJ9zuLySl2EW7kBomR9v+ZdUaBsMbew9V8v7OYfYerCYQbXjMmRFvqsJGAlStXMn/+fKZOndpRVRCnwRwow/vdXKLGz0L3VuDbvwHTiOmEVFu91+aXeZj/1S6u7V1EYO3bxF76U1Rdo2jRn4i74EfoPc9ER8FrTgQdFFWlbPEb+HI2Yy/Lw9prJGUf/wVUlaRrn8BnT++AFte39UA5z7+7ofbxB9/uZcXmAn5581ic1pP/KRkUhSnD0+mfGcfO
Q+VUuQP0y4qjZ7IDp00G5IQ4XUbNT3j7YszJ2Shx6XhWvod93OVA68z1bfSc2xZjTslGiU3Hs6rmnH5rUpPeb3Hn4t28BNv42YQObqy5ydFjHGFNwesP1XltlSuApoOhCfcHTNEJxM24l6LXHsa/5Uu04Zfw8nfh2mQEgVD4pO+3unMpef8p7EPOJmrYuZQt+D+MCRnYp/6QoPHUn6c3GOZfH21n7Y6i2mPXTuvP+WMy60zwNIVcBNYtwDpoMhgteNZ9hG38bAKm2FM3shsJhjXe/3ovX6w+VHvsojN7csVZ2Zia8gMXopV0yNVHRUUFc+bM4a677mLHjh0dUQVxmpSQD1/OFvyHtqH53JgzBmAO+cFcNwjwBTX+9M4G3L4QroFJxBtNVCx4tqYMsxVDXBqhE+b9Bw024s66ksP5u/BsXYpn61IAnOMuJ2RrWb7d1uILabz+af3f1aJyD/vyqxiRHX/KMgyqQmaincxEO4qikJjoiMgFXkK0BkULECzJpWrlfAzOeDSvC/uoC9vhnIeoWnX8OS9q2nsV0L1VuLd8g//QVkLlhTjHz8SQFcJhsTJral/e+WJX7etnTe3bpAAAIOzz4N2xArRQTT+7+1suH/kj/ljgJi3BTmaSA1WtmV7ZUC4QzRSFKTEL97pPca/7FICoUdPQDE2bpnig0FUnAAB4+4tdjOibSPJxexEoYT/+w3txb1uGYjSjGM3Yx/ghwpZE5ZV66gQAUJMl7syhqfRIbPoUMCFaqkOCgF/96lc88MADFBQUnPZ7ExK6XwrJE7XWqvXWNQBl2i0UL3gegMTzbsSSllXvVfvyKimprNky+5XvAjw4/BJYNxeAuLOvwZk9hGil/iw0PWEwsZOvovyr/wJgiIohdtyFmKIT2qpBp+VQYTVlVQ1vBV7lCTT7Z9Y5f9ZtT9rdMbpX/+nEPuUacnd+R7iymIRpt+LsNRhoy8/ZiX3qteTuOnLOC27D2WsQ0U2czqfFjiIw4jyq13+OanUQO346ppiaUYRLz+5DdmYMRWUespKdDOmTgLWJufzD7ioIeki44HZsvYdSuPBFeiXbufeqEQzqlUCFy8+n3x/CZFSYNDyDAT3j6rXLMu0W8l99BABjfDoxI87B6Iht0vm/21k/6YGugycQPuFn4cR8wW0UvPZL9ICXlB88QlRW3yadozEd/TfVHJtzKho87vKGmtSertjm1hCp7W5L7R4EvPfee6SlpTFhwgTmzZt32u/v7inuOmv6L6s7l5KP/oY5rS+hymIKP3ie2MsexG+q+2ViNSj0THVS7fZx97gwrJqPYqxJh1f25WsojiQCyYM5PguQouhYS7ZR/vWbtcfC7kpKl/wX6xlXEzR2/IWLQdfJTHKQW+yq91xCtLVZP7PO+rNua9Lu1imrObpT/2kOVVH9yYsoZivmlN6Ufvkf1PgsnH2Ht9nv17Fz2mrO+cWrqHGZ+GJ6n/K9iqJgzFtL9frPsfYeiS9nM6VfvoF10g0E1Zq75f1SnfRLrfnZVld6aWorkpKiMY68lJDBTKVuJOaSn+FXbIwAiiq9PPbyKkJH5px/ujKH/73jTOKjjqUptXiLKF84BwDV6iBUlk/pt+9hHn0ZIUP9KZ8nSoiuP2JgNChE20x1fhYWfwnl8/8PQ3QiqiWKogXPk3jV4/jsaU1s6Ynt7pp9SVwje8LEOcynbE9XbXNLddYUoW3tmWeeoby8nKeffrpNym/3IODjjz+muLiYyy67jMrKSjweD0899RSPPvpoe1dFnIawOYboCbMw9p2AEnQTPryHUAMX5yaDwj2zh/PlmoMYDSWoZiuxl/8cFZ3SD56lofxxxrAX9/YVoGs4z5yFpccQSuf/Ad/edUSNnt4pggCTQeGm6QN5+rU1HH8NNbBnHL1Su0ZnIkR3EjLYiRoxDUNcKro9AeuuZWj2E+9wt8U5z8cQl3ba59R1HTU+i9ipN6D2P4uo4j2AQkBtncxgIYO99v9+5diF+8HC
6toAAMDrD5Fb7CI+6rgpjAqg68RMvQFL9lgqPv87NX110wLGHklRXDKpNx8u3w+AqircecUw4h11L3ZDZifOsTMwZg1DV42E9q0mbI6M/VGOlx5r4/oLB/DGZzuBmqliN140iORYyRLXlbhcLj777DNycnLo2bMnF154IQ5H612vtMfa2Q7dLGzevHl8//33pxXhdKc7WQ3pzFG+qlB7AawoOrp+kiFwBcKahiNUidcUh6KAJVCG3xzX4PviLAHcOdsIpwxCUy2YK/aBwYTfUX/KUUc6VOLm+22FFJS5GT8olUE94065KLgxnfln3Zak3a1TVnN0t/5TQUc/Mqp4tH9q69+vhs55Ok71HgUN/bjEfQZFI6yfPJHfydq8/VAFf3xjXZ1jj90yjuwTbl5YAqWETNGEFRPmQDmawVInsGiwrke6cl2vSZtcWOGlwhUgOc5GgtOM0sC+L8e3vzmf3/G6cl+io1NY4aOsyk9CjJXkaCtNmVXWldvcEp1tJGDNmjU88MAD6LqOz+fDarWiKApz5sxh7NixLa5fRUUFd9xxB9OnT2fHjh3dZyRAdF3Hd9YnDQAAdDAoKt4j04V0HXym+EZvLBmjEwimjaxJkaeDPya7dSrdyrISo+g5tQ+KohCWlG5CdCj9uIvM9optWnrOk71HJYx6aC3+6EyqjIkkmP2Et3yGeej5zc6g0zvVydTRmXy9LheASyb1IrOB/Qf85mPrrwLmk49uhHWdQ8Vuth8oR9d1BvaMp0dyFOnxdtLjTx44HN/+bhSPnjYFhdRYG6mxp55uJToXl8vFAw88gNfrrT3m89WsGXzggQf45JNPWjwi0JK1s6ejQ4OAWbNmMWvWrI6sguhEOuPGYA2puZPaRSorhOhwvmAYtz9MjN2EUW38BoqlOpedVUb+9vEBCst3MLhHNDcOjCPxu7lYJ99EsBlpdKwmA9dP68eFZ2TVZCVzWjlJFU5Nge+2FfHPhVvrHL5p+iCmjkiFU90gEqKL++yzzxrMsgU10/4+++wzZs+e3ezyW7p29nTISIAQQgjRJnS2Hazknwu3UuHy0zPVyY8uHUp6fMN3f6ttqby8Mp/C8pq7itsOVvFVfCI3nzsCfwvyaBoUhZSY1rnjXO4K8OpH2+sd/++nOxjaO55Ep6VVztNZKGhYqg4SticRNEZhDlag+irxR/fsMjeuROvKycmpvfN/Ip/Px8GDB1tUfnuunZUgQAghhGgD+WU+/u+tdbUXizmHq3nmv2t48s4JOCz1v35dQZX8Em+dY9vy/fjMiQ3Mru8YxZW+OguNj9I0neIKb7cKAhR0jPkbKP7wz9iHTcUx7lIql7xCIHcn8bMeIhDXr6OrKDpAz549sVqtDQYCVquVHj16tKj8f//737X/P7p2tq2S55x8tZEQQgghmuVQUXW9u8XVniCHSz0Nvj7e6GNUn9g6x87ub8NatqvB17cXDQiENdyBMKFw47e/o2zdbNcvRUG1x6AYTXg2f0XRf36OP2cLisWOYpZNvSLVhRdeWLPbdwMUReHCC9t208LWJEGAEEII0QYsZkMjxxsehDf7K7mmTxkzxiTSM9XJjedmMTq0nkDeTgx6oC2r2gidPQXV/PGt9fzy5VV8uHw/+cVuRvRLqvfKkf0SSTvFouCuRtchENeH+EsfqDkQDgGQcMVD+KPSO7BmoiM5HA7mzJmDzWbDaq1J62q1WrHZbMyZM6dV04TOmjWrzTIDgUwHEkIIIdpE77Ro4qMtlFX5a4+NHpBEaiNrAnyOTDL7hbluiB2/NRmT5kUtDRKKzyasNLzBVFs6WOzh96+trh3N+Oy7g4wdlEL/HrEkxdlYvjEfXde5eEIvpozMwNSiFcedkylQgWvtR3WOuTd9gXXclQSNMhoQqcaOHcsnn3zCZ599xsGDB+nRo0er7xPQHiQIEEIIIdpAjM3EIzeNY+3OIvYcqmD0gCSG9E7AoCi4/GEsJrXehXPYmUEII+g6umrEnzi4
fsFK7f5e9Z9SQNWC+HUjvqBGlClMTkmASpefuGgr6fH2k2YoOlaOwvrdxfXOsXZHIRlJUazZXsjkEenEx1iZPr5Ht0yZrCoQ2rMKf84WVHs0cefeRNmnL+PZ/BW2XsNR0kdF1OJgI0FCRxaoG7QgmsF46nTh3ZjD4WhRFqDOQIKAbk1H1cNoihEDIcJt8OP2BsPsL6ii3BUgK9lBVmJUp1nAJoQQHS3BYebCsZlcfORCubDSxz8WbqW4wsuFZ/ZEUSAh2krfjGhivLn4dq7EPGomatCNZ91H2MZdUbtHgKLAwRIPC5buw+0JctnZ2QzIOLbjrqKAuXwvnv0b+cw1iPE9jBgPLmdzYATzV5cDcNbIdK45tx92k8b+Ej95RS6yU+0kxDowG06YIXzcBa5BVZh2Rk8sJgOxTgtnj8zg8+9yGDsopdF0iV2dpoO1z3iiynIxJ2TgWr+Y2EmzCbur0FMGRlQAYAq7CWz4EGu/MwlHpxPetQRTfCbBlMERHQh0dRIEdFOKomMq3EbYU46l11gCmz/H0nskfkfLVq3XPQl8vDKHj1YcqD30yE1j6ZceedvACyFEY3QdwmENtz/M06+vodIV4LoLB/D6J9trLyTvubQvI4o/wbtzFTZ3JYHDewlXFmHLHoVyZCPF0uoAT/77ewKhmrvuf/jvWn5163iSk2v6XJPmpWrVPAIHt3JWnxIMS7cQ9FQx+ZwRfLheJRjSWLohn5F9Exjm/R6UDAweIzHb11HW6xzSsrJq66PrOiP7J7Jw2T50Ha6Y2pcvVh+korpmapPNYmT2uf3ITo/uVrtQn8hnjscx4SqK/vtLdJ8LS88hmMZcTkDvZougT8HgLcW17jPcm7/GPmgS7o1fYErMJOayX+A3tGz3XdFxZGFwN6WGgwQP76Xi839SueBpqlbOI1x+GEVpeWd9dFV8hSvAJysP1HnuoxUHULvhvFAhhGip/FI3la4AvdKi2bqvrM6d5L8u2oMy7geYMwfh3bmScGUR0WddSzhlSO3rCso8tQHAUXnFrtr/BxQb/rE3QFwmyt4VaJ4qvKOv4YWlPoLHve+rdflolSUkrvk7Q/b/F33vKnzVlbj94Tpl90iK4uEbxzJuUDKHy9y1AQCA1x9iw64ieiQ1PgfaoPux+IsBUPUQtkAxjSRV6bRMYTdVS99CD3ixZA2iasV8KNrV5drRUn5HFvEz/gc94MW98QsMzgRiZvxEAoAuToKAbiqsmrEMOx9jdBLBwgPY+o9D6TG6RcN2bl+I73cWM2/pPnbmVoKqYDxh+Nhpj6y7I0KIrqW02s/WnAoOV3jbfd9vo6Gm//UHwlhPyBxkNKiYtADhqpLaY6GKAtTwsaxA8Q3k4I+PrrvI2KoEUHxVtY9N7mLSY+v2y3abkcCg89EDPqg8THDcdbyywlPvBo6CQr/0aH506RD251Vxop055fiD4XrHAQyan/DmTymf+xRW32HUg99T/MZjmCv2Nfj6zipksGPrfyYJl/0Mx0X3ET35KpTo5IiaCgSg6kFCFYW1jzW/BwLuDqyRaA0SBHRTBj2Ab8MnhKqKsWQMwLtrNXrOmmaPBOjA3G/38ff5m1m0bD/P/HctBaVubpo+qPY1FrOBi87s2a2HhoUQXVdOkZtH/r6C/3trHY/+fSUb95W26x3d9IQo+mTEUFDqJjsjBttxG4Y9cs1g/KveJlxVTPTkq7H0GoFn89coRTtr65gSa+XHs4ZhsxgxqApXTO1DdtqxO7Ehn4vw8v+ge6swT7oeQ2pfjDu/5KrhKibjsa/7c0akYlj2T1SLHeIyMH7/X359eTI2U8OXBCZVYUS/xHrHB2cnYG3kPbpqxOCIJ+wqp+TNX1H+2csoJiuYrM345DqOjkIobQSBpMEEFBvKkIvwW+qnSO3uzK58qpa9gzE+nbgL70AP+qn4+EWsWnVHV61buvHGG5kxYwaXXXYZl112GRs3bmyT88iagG4qrJgx9xhCXEImao+RWLd/
hSEhi1AzRwIqPEG+WZ9b59jCb/fz8+tH0TPVSaUrQFqCnXiHOeLukAghOj8deHfJ7jqbXf1r0TaeuXtSoxe/rc1iVLln9nBWbT3M2h1F3DZzCAZVITHGSlqcDVvSDdgHTEDLHI2j30TseZvRjluAqioK4wckM7hXHOGwTozdVPtcSbWfJ/61jjumziI9u5znvjNyz8W3EFO9j39t1AiGNCwmA9dc0J/+6U4M3omQ0p9qzYotZxmW6FiCx/fdCri8IVRVIcpi4NzRmazZVkhRRc2OxlE2E1ef1w+1kShKw4C5zwRsBzbi3bMGgPjpd+O3d738+vpx6S40PTLvnQYcmcRf/GPU5GwClkQSLnWgOOLxqZE5HSgUCrFw4ULeeustiouLSUpK4tprr+XSSy/FaGzZpbWu6xw4cICvvvqqxWWdigQB3VggYSBKgkYIFXXwBfhb0HmZDSp2ixG3L1R7LDXRjlFRyIi3k3FkkxgJAIQQnZGug8cfrHMsEAyjtXOnFWs3cfH4Hlx8Rg8UqB05VRRw6QlYeiWhhXXCphjU7LPqjazquk7Ukc3Gjq/6pr0luLwh/vRpEdEOCxXVXn75xh6eu/8sfpAR5uKzg0RHmYmLOhI4DDwPTVeJAgwJVxA87vvBH9L4cm0uH3y7F4vZyG0zBzMiO4HHbx3PoSIXYU0nKymK6JPsEKzqIcIHvq8JAAxGCIco//TvxF3+EH57Wmt9nKKdhBUjes8zCR75fQymjui2WaFOJRQKcd9997Fp0yZ8Ph8ALpeL5557ji+++IIXXnihRRfv+/bVTJm79dZbqaio4Ac/+AE33HBDq9T9RJEZ0kYQ/ciPuKV3L+JNPu65cjhGg8IV4xLonxnNRWc0PPUnpOnkl3nYk1+FO9DwfFEhhGhPqgKzp/atc2z2Of1wWBre1bfJ5arKaSdD0HUdXdNr+89qX4gPlh/gl/9YyWuf7aTcXbMO4HSmVhYfuUOv6dQu4A2GNFzuIHFRZnokRRF73MjB0e+EkKaTU+JjT0EVniP99b6CKt7/ag+hsI7bG+TP722ksMJLlNnAwMwYhvSIPWkAAKDoYcIVhaj2GJKu+19ip92G5nND0Nv0D0p0Ksf/PkZqAACwcOHCOgHAUT6fj02bNrFo0aIWlV9VVcWECRP4y1/+wquvvsrbb7/N8uXLW1RmY2QkQJySxVdM5cd/Zvg5N/HS/4zE/80rXD71HLQYS72FdZqus3hNLu8t2Q1AcqyNh24YQ7yj/Xe7FEKI4w3qEcuvbh1PXombhGgr2WnOFo1eFlb6WL6pALcvyNkjMuiZfPo7yCoKLFmXy4Jva+7+LVmTS4XLz92XD0Vt4q4rIVc5o/on8+36fC4eFc+8VUUA9Ex1Eh9dfzFx7fs0nYUrDvDhsv0AZCRF8bNrR1NSWffiRtehvNpPamzDOx03JKxaMAy/mIRBk/FZklB7J5OYPgC/NZl2X5EtRCt666236gUAR/l8Pt58802uuOKKZpc/atQoRo0aVfv4yiuv5JtvvmHSpEnNLrMxEgSIU/NUECzLp2z+HzDGJhMsySUYl4Y5fTAhte6XQkmVvzYAACiq8LJhTzHnjsxo71oLIUQdBkWhV7KDXsmNp7VsqkpvkN+9uhq3t2aK0Tfrcvntj86snRrZVIGwzopNBXWOrdtRjMcfxmFp/Cs6pOkUVXhJMLop2PAdg/qM5I9XJaGvnUv81Ev5fLufH88aXm9H4uMVVfhqAwCAvGI323PKyUysG8yYjSopcafXLoCwaiVsqVkIrGHAZ2n/AMAcrCRstBFWzFjDlfiNMTJtVbRIcXFxi54/lTVr1hAMBpkwYQJQM+rSVmsDZDqQOCV/fD8SLrkXPRQgWJKLpddwrONn1wsAAPzB+lvHuzzB2r0FhBCiO8gvcdcGAFAzDSfncE2mFIuvEEvlflQFLOW7MAfKGi3HZFAY3rdu5p3+PWKxmRv/0g+GdT5Yto9vN+bzyaZqfvt9InM+LeRw
RQCjM55xg9N49KaxJJ9kFABoML2nxxeiR4qDn107iuyMaEb0S+SXt4wnwdn80VxNC1NZWcnGfaWUl5cTDLfPNFFzsBLXFy+jbfsCa3UOpW89jqUqp13OLbqvpKSTZ4c61fOnUl1dzR/+8Af8fj8ul4v58+czbdq0FpXZGBkJEKdkDlbgWv957eNA7g6oOIQS36/eHZWUOBuDe8Wz7UDNl56qKozsnxTR8weFEN2Pw1b/othpN2NSQnjWfYx3x0qiJ86iZOm7xEz+Aergi2hwir8O0yf0pNIdYM32Qvr3iOW2S4ZgOMl9k8PlXvblV2E1G9mwq+auY2EZbN6v8MRNV5Iek9Sku91pCTayM6LZd2QPAKNBYVDPOAyKwpCecQy6cSyqUlPH5nbhgWCQ0L7v0PN2oyRPRt37BZ6sURh7jyHK0rx9Zazew2gGC+DEoAcxewvx29PRTrivGTZFYe0zmsqvX4flYE7rh2apyWajoGMOVeM3RmMMe4CaPQGEOJVrr72W5557rsEpQVarleuuu65F5Z9zzjls3LiRyy+/HE3TuO666+pMD2pNEgSIU1Iq8/Ef2krUiPOx9h5F6cI5uNd/hn1qJsETRgPMBoU7LhvKrtwKXN4AfTNiyUy0yxxQIUSXp6hKbUaftHgb10wbwLtf7kLTdM4akU6f9GiCuoGoM2YTyN1B5bdvY+kxBOPAKQSO9IEuf4i9eVVUeQKkJUTRM9lBXJSZ+2b0JDQthbAtAXuglLAGIbXhO/m+QIgBPeJZ8O3eOsfDms7m3YVkx+l4LSmnbI/VaODeK0ew61AlHn+QfpmxpMcd69NVaHHffbDITbRPw7bnW/ocWo0W8OFOHcnuncWcPSL9tIMLi6eAsvnPYIhPxz7zHrSdyyn69i3ip9+NnjmmNhkGQBgj1rR+x97bYwhhkwNFAdPhLVR8+xbxl/4E94bPAQXzuFkNjnALcbxLL72UL774ot7iYKvVyvDhw5k5c2aLz/GTn/yEn/zkJy0u51QkCBCnFEwcQNI1vyIUlUrAYCfp6sfQrHEEGukso21Gxh6/sYwEAEKILkxHZ29+NYuWH8BuNTBjYm+yEu1MG5PBmAFJgE5slBmDoqCgEy4+QKiyGFQjgfzdKJX5KHF9qPKGmPPOBg4UHNt99+rz+jF9XCrB9QsJHN5H7Dk3UbroeZxjZxDMnopBVTlxWn96YhT7C+ru4BtlMzF6QDKpGfEErXFN7ndjbCbG9a+/EVhrUFWFZZsLyYxOZEJ8BlpZLmpqf74rcbJsxwHOGJKKxahiCVbgM8YCNXfoLaHK2seNCRzaRt6/foHmc0EjC6it4SrKFs3BnNYPa59RVC17l6SeQ/HF9kV1JqIHvBT95yFAIX7G/xBS228jM4OiET6SoUlFQ1GofXzi86JzMRqNvPDCCyxatIg333yzdp+A66+/nksuuaTNc/u3pq5TU9HuKj1B9uVX4QuG6ZGcSrrBhgL4nL2aXaaCjqoFajJHaD40gwW9kQ3MFEWh2hdE0+puilP/dTV1VVUFp9UkU4+EEK0qt8TLc++sZ8LQdJx2E+98sYtbZgzGZlY5UFBFhTtAdno0PeONOPCgqeAYdwlRQyYRLjqA7qsGXWdvfhUHC6uZPqk3FpNKOKzTK9FMiUcndfBZhN3lFL35K9ToRNzx/Vi4dB+KonDGkFSSYqzYTDXpTB0WI+MGpbJ1Xymb95ZyzphMLGYj3289zJa9JRRP7MWEgQnYbVaMmr/REYX20CfVxijXt2jl+SiDp6Ft+4JzU3dRmNYXg0HFVLqT0o9eJP6ynxGM6YUhfyNlS/5D/OUPErIlElItddrgt6eRMOthiv776JEAAOIuuI1w1hj0Ey6afYZo4i/9KZo5irA5muTMQQQdNZuVha2xWLIG4tmxEtVqx5DUk2ATszG1lMVXRGDXSkxDzidssGI8vIlQWT6WHsPxO7KweAsJ7FmFafA0go1MUTIRIIgZBQ1VCxLuwJ9xJDIa
jVxxxRUtygLUGUgQIBpU6vLz1H/WUH4k37SiwAPXjGJYr7hmzw1V0DEe3kgwfzfWURfjWbsIa8/hBJIH13ttWNf5fnsRr32ynVBI44opfThvTCYWY91O3h/SWLwmlwXf7sVoVLn54kGMH5jU6C6WQghxug6XebjynH58uHw/FdV+RvZPosLl58udRVQdyemfrpZh08NUbfoKa88hmKMT8G5dij9/L9FnX4uOQkW1j5mTs1m6IY+yKh+3T00ka+d8VOeVKEoVChA18EzcRXkcLguSmeJk0+4SfvPP74h1WLh79nCy0xys3l7Eqx9v58IzejK0TyI7c8pZt/PYju5vfLaTfQcTuGtmPwKr52HtM4ZA4qB2/9w0TSczNY5DhSOIn9iP1zcZuWVSH/ICUYwYlEpUqJKyT/+O5q2mdN7TRE+YTdm3b4GuUb30LaKGnY01fTCeNQuw9h5FIGkwBj2IL2cT6MeSULi3r8CZPoSAOa5eHXyOrNr/e6N7AzU3mMjbiGfHKmLPvZnq7xZS9cU/cUz/CQHadjTAormo/OgFgqW52N3lmNP6ogU8VC57D9XyEfEz7qV88SuEq0uItUShDjwf7YR8GxZPPq4V7+GcchPhor0ES/MwDL2QcDuOZIjuQcaaRD2KAmt3FNcGAFCzKOyVRVtbuPmXjh7wU73mI8refAz3+s/Rgz6UBsatC8q8/GPBFvyBMGFN5/2v9nDgSOaN4+0/XM28r/cQ1nT8gTAvL9hCQfmxzWg0HQ6VuPlyQz6L1+axv9BF6DQ24BFCiKQ4G/O/3lO7CdeGXcXsya1kxaZ8kuPsrNpcgMmgo3kq8eXuoGr1R5R98W/CrgrMKb3QLNEAZCU78QdClFXVzCO2mRS0inzCHz9NyQd/IuypRut7FoqrhFTPLipdAb7fVghAhcvPn95Zz+EKHy99UNM3Lly6D13XWbezqF6dV24vZf+mTbg3LUEP+hvsZ9tDz2QH1dZU/vh1iG05lTy5JESOL5oR2fH4jDHEXfYgqs2BHvBR+c0boGuYknriHDed0o/+WvNdseEL9FAABR1TZQ5V374JKMSfdxOGqFgCh7bhX/8RRqVp30+6rkPaEJKufhy931Tir3iI6Kk3t3kAABAwOIg574coRjOezV9T8fk/CZbmYes9Es3voWTeM4SrSzCn9cXQa3S9AABA0cL483ZS9u5vKPvoL2h+N4rewAuFOAUZCRD1qKrKnrzKescrXQG8/hB2U/N22NRRMWQNx5Tci2DRAcwZ/VEyhtZZyHXU8QHIUaVVPiCmzrGjX6Z13lvlJyPejqIorN9Twl/e31jn+ZsuHsSUEWntNPArhOjqTAYVty9U59ieQxX07xFHtSeApsOrK108frYRx/BzqF77CYrJgiUtG0NKX3ymWAB6pzlZvOZQbRmvLq/gN+dfgXHZyzXnGTeb+948zAPT/odd5Qb2l9Tthz2+EBXVgTrHTqzX8ar9Oqk9BqOkDULvoB5PVRXOGJjMwB5xVLj8RNmMJDgttWsWgo4MoideScWXr9a+J+bs69ATemBKzCJYcghL1pDaNoSiM3COm4kpqQcxw89GTRuEa+mbWEdehF9v+ndT0OgkGOMEHXz2tFZudeN0HUJxvYgacT6utR8D4MvfTcLFd+Pdv6H2dTFnXYvPHN9gGQFnBtHjL6Ny6VugGogaeSFeyWwkmkFGAkQ94bDG6P7189z2THUSbW9+rmiDEsa/8ROCRTlEDTmbQN4uQlu/QFXq36FKT7BjNNT99cxKdtZ7XWZS3U1/TEaVtCMb3VS4A/xz4ZZ67/nvp9spqaofZAghREMSYyxknNDXnDksjfPHZpEQbUVVFX56bjROc5jqtZ9g6z0CUHBvX4lv5wosnprNwFSo07f+/MJ4TCv/DakDMMQk4/78r9x6dgK/W1jI3OUFDOtTd8FuaoKd1HgbpuOmRaqKgqGBDcFUVSE5MRp/zhZCO7+t7WctngIs/hIURcPqysGoeeu9
t7Xpuk60zUiPpCgSHMcCAAUdQ8FGKpa8Vuf1pYvmQP5WQpXFRA05G/+hrYS2f42qaIRUG4YRlxDOGoNiMOK3pxF14f/gt7TN4ubWphKGvStqAwAUFcegSZQseK7O60oXPofFdaiBEsBUupvKpW9j6zcexWSh4tO/YQ672rrqohuSkQDRoKG94xk3KIXV22uGoqOjzNxx2dCT7j55KmHdgKXfGZhSekPGcCy9RqDGpePXFSpcfg5XeHHazUSZDSRGW3jsh+NZuGwfPn+ISydn16QaPUFmYhS/uHEsC77di91m4rLJ2SQ6zeg6lFT58DcwfUnToajcS9IpNtIRQggAi0Hl/h+M4PPvD7Int5ILz+jB0F5xWIwqyXE2EmNt7CovJqVXInFTr0Pze3COuQjN66Jy3WdYhpxXW9aQXnGcOTSVVVsO881uH5ePvxpTz5FEmzUCh/eSm2umb2YMZwxNQ1F0brx4EEs35NE7PZqLz+xJgsPMY7eMZ8GyfXi8QYb3SSDGYea1j7fXqfO15/YirV9PFMs9qAlZ+HUFi+6h8rO/omsaztEXUbz4XyRc+hOUtBEdsouuOVRFxXFTgGoyI81B87pwb19B4qwHCcf1xtJzGIaELHxHFv6GVUud7EcBuk5fbgh58R7cDEDstNsxJmYSLthVM3UsrS+xF95F9bK38e5ZQ7g8H8WZWS95hu5MIW7arSg9xxLlLkb3VBA0RDV0OiFOStG7WCqV0lIXWjee052U5KS4uP7c944Q0nQOl3vxB8KkxtuJsjRvGtCJFEVB1/Xaf4ur/Tz/znrySzxER5l54JpR9Ew60qEpoKCcMuOPoiig6MevFeNwuZdHX1rZ4Ot/9cPx9EpxNPhce+lMP+v2JO1unbKao7v3n9C2v1+KqtRcLDfQHykqqIqKEvahBL2EzLGoeghjqBr/Cekuj+9b0xPttVl/1CM3WVz+EIeKXOw4UEZ8jJURfROJsZvRj/vZKSqg1/SNDqeNDbsK2bC7BHSdEf2SyE5zYlAUVFWp8zO3evIpefs36KEA9iFTsJx5dYdukmXxFuJe8S5Rk6/Db0nA4s7DvXoh9knXEjTHoet6vTYc1VX7ElOoGrUyj2BCP3TFgLnqIErAheZMJWCOxxSsQq0qIBjfB02pe6/2aJuP/0yOfpd2Z639s25uH3qivLy82hShGRkZrVImwJIlS3jxxRfxer1MmjSJxx57rNXKPp6MBHRnioJywoXx6TCqCpkJrf/lcLSz0nUdFPhw2X7yS2p2bKxyB/jb/E389rYzsBjUmp0qm7CgTdf1enmxk2KtTByWxorNBXWOD+gZS3qi3DURQpwe/SQBlK5BGA0wg8kMuk4YA+EG8t0f7VuDYR1/SEOH2k3IAOwmAwMyYhiUFVt77MRz1/TrNcdsViP9MmLonxkLul7ngvn4/yuKRri8AD0UBCBweA/2kLtDgwC/LQXbeXfip2aqqT8qA9vU2whgrg22ulvgGjQ6IWFgzQMd/M4edZ83RUNC9EnLOP4z6e4BQGe0bds2nnrqKfbv34/JZCIYDNK7d28effRRBg+un/HwdBw6dIhf//rXvPfeeyQkJHDzzTfzzTffMGXKlFaq/TESBHRDIU1n64FyPly+n5R4O5dM6k1qbNtlPWjsLk1TaDrsy6+76U1xuZdAUKsJAlrAoChcfV4/UhPsfLTiAOGwzrTxPZg2PguzQZYFCyEapygK3mAYg6Jgau3+QoE9+VW8+tEO8ktcjB6QxNXn9yfJWXdaS5P6VQXW7Szi/SW70TSdy8/Opk9qw3c5jSEv3l2riBp5PvahUyib/yx6dQmKNalDpgMdFcR80sdCdCbbtm3jjjvuqN0t2O+vWWO4Y8cO7rjjDl5++eUWBQKLFy9m+vTppKamAjBnzhwslraZ8iZBQDe0t6Ca59/dUPP/vEo27y3hyTsn4LC07o+7whNg455Siso9jB6QTO8Ux2nn5zcocMEZPfj3h9tqj501IgOn
tXXq6rQauXRiL84dnYkONVOa5KaJEOIkAmGNlVsLef+rPURHmfjhjCH0TXe2Wt9RUObl6dfX1l7kr9tZTG6xi9/eeka9vVBOpbDCx2//uaq2rO37S3nijgmkx9Xf0T1oiMI2+UZ01YjPYCf+6t/gN8Z0aAAgRFfz1FNP1QYAJ/L5fPz+97/n9ddfb3b5OTk5mEwm7rrrLgoKCpg6dSo/+clPml3eyUh2oG5GUWDXwfI6x6o9QQrLWzcDRCCs8dd5m/nPx9v5ZGUOT766mv2Fp5+dQNdh/MBk/ueqEYzqn8T1Fw5k1tTsVq2rpunYzQaizBIACCFObW9+Ff/5eDtub5CCEg9/+O8aSk9IzdkSOYer693lLyrzklfiPu2y8kvcdaf/6JBX3HhfHDRF107/8RljOyx1qBBdUV5eHvv37z/pa/bt20deXl6zzxEOh1m5ciVPPfUU77zzDps2bWL+/PnNLu9kJAjoZnSdeqnsDKpCrKN1h5KKK3zsya2bw3r19sLahW2nw2JUufDMXvzkqhGcPzqdaKuptaophBCnLf+Ei/FQWKe0svVupJyY/vioPbmV5JZ6TqusOGf9vj0+WnaOFaItFBcXYzKd/BrFZDJRXFzc7HMkJiYyYcIE4uPjsVqtnH/++WzatKnZ5Z2MBAHd0MAeMVxwRg9UBZx2E/dfPYoER+vOsbRajPVyUyfF2lq0QCkc1mRYWgjRbCGtZvfc73YWc7DY3aSkAg058UaK0aAS42i9C+vs9GjsJ0x5HNAjjt25FTzz37W4TrIB2ImyEqO4beYQLCYDJqPK9RcMoEdSx2Y+E6K7SkpKIhgMnvQ1wWCQpKT6ey011TnnnMOyZcuoqqoiHA6zdOlShgwZ0uzyTkbWBHRDdrORa87ty8Vn9sRkUImyGFr94jrBYeLWmUN4ZdFWNE2nZ6qT0f07dnFZR1MUsATL8BnjMehBjMFq/I3s+CiEaF2KovD9jkL+uXBr7bGfXjuaoT1jT7usvunRXHfhAD5ZcQCn3cyU0Zks35THrLNaZ6pivMPM47eMZ/Hqg+zLr2Jgzzh04LNVOQAUlHnol37y7DBHGVSFy6b0YUSfBHQg2mZs1X5Y03WqfSHsFmOL9okRojvIyMigd+/e7Nixo9HXZGdntyhd6IgRI7j99tu57rrrCAaDTJo0idmzZze7vJORIKCb8IU0yqp8OO3m2i+BGFvNkFXbXJgrTBiURL/MSXj9QZJjbae9oK27sbjyKHn/SeIuvptweQEVm74k9rKHJBAQoh14AiHe/HxnnWPvfrGLgbeOw3iaCQs0XeebdbkM65uI2xvkjc92oCpw0Rk9sZtbZ7+UlFgrYwcmU1ThZdnGfFzeY3cXDQaFjfvKCGs6WckOEqPNKCeZu68oCo4jIwut2d+XuQL8++NtbN1XRq+0aO66YijJMtWoW1MUMGh+QkrNNDMTAcnWdIJHH320Tnag41mtVh555JEWn+PKK6/kyiuvbHE5pyJBQDdQUu1nztsbKCh1Y7MY+ck1o+iX1nCKuLCms+9wNTmHq0mJt9EvIwarqfEvNVXRsFTloKhGQtZYjJ5iNJMdnzUVUEh0msEpHQRA2BKNtdcIyj54FoDos64hZJJheSHag6Io9ebam4wqiqKgoGNxHSRkT0bRNAy+UvzOrNqdWF3+EAVlHnIOV2NQFcZmR/PoxXHMWVzGpL52pvdP5L2NIaq8QVZuPUyc00L/zNjaC++m1Q9UVSUc1mofa8DhUnedAGDsoBQ+XZnDmh1FAKgK3HvVSEb0iW9SYoOQpuMNhLGZDRhbcudegUXL9rN1XxkABwqqeGXRNh66bjSSYbl7UhQwl+/Du/UbLGdchRqoxr3qfeyTrycgN7NqDR48mJdffpnf//737Nu3r3afgOzsbB555JEW7xPQniQI6OIUFT777iAFpTUL2bz+EH+du5Gn7pyIzVT/zvzmA2W88O7G2sdXTOnDpRN7Nnr3yOIppOitJ7APGA+K
gmf3GhKm3Yol04FflQvc42kGK6akLLw7a3YpNiZkElQkQBKiPdhMKrfMGFTbv6kKXHNBfwyA1VdI0Vu/JWrE+eihAJ5ty0i+8Sm8lmTCus6mvaW8v2QPFS4/qqow+soEQl88xz2jr0TN3YDhYBk/mflL7vnrCo5cwzNuUAp3XDoYQxNGGaq8QdbtKmbXoQrGDUphSM84jEaVd77YxcRh6QRDGkXlHjKTHWQmO/nL+8f6aE2Hv8zdyNM/nkT8SdZ26cDuvCre+HwHhwpd9Ex1cv2FA+mT5jjpKEJjAiGNzftK6xzbk1uBLxButd3jRediCPvxbPkKz7alhDwVhIoOEnaXYx8yBSU1PqKn+55o8ODBvP766222Y3B7kSCgy1M4UFB3s61KVwBfIFwvCNCBBd/uq3Ns0bJ9nDMqo9E7Wn5bCnHTbqV88SsAxE6ajZrQE58EAPWYPYcpWvYu0ROvIliaS/nHfyHx+ifxmeQOihBtTddhRO94nvmfyeQXVZMSH1W7SaLfmkzcBT+i/LOXAIif8T/4LDUL98qq/RRX+Khw1Wz4o2k6/13r50ejpuNf9z4Asdf8hv+uKa8NAKAmG9qsKX1IOcVGjLoO73y5m5VbDgOwasth7rpiGOMHJpGdHsPCpfuwWYzEOi3kF7soq/LXKyMU1skvcZ80CDhQ6OLp19fUPs45XM3vX1vNr249g55Jp79DusWocvaoDD74Zm/tsTOHpGKXAKDbCqkWrBN+QNhTiX9/TSAae+7NhFKGSADQiIyMjC558X9UZE/i7gZ0TeeCM+puOT5haCqx9voprBQF4k6Yz2m3mjCcZGdeQ9hLIG9X7eNgWQG6qwRVb3r2ikjht6eRdN0TqEOmYZt0HYlX/VLWAwjRjhRFYXDvBEZkJ9TZJd0Q8uLPP9aP+fN3Y9Bq5vOajAbUE7rAoekWwoV7ah+HS3LISKibitOgKpiasA6q0husDQCO+uy7HEBhxsReZCQ58PpDuL1BbrlkCKu2FDRYjvkk0zbDms7n3x+sd1zX4dsNeRiaMX9H1+HcURlceW4/0hOjmD6xF1ed2092FejmDH4XoeJDtY/9uTsxhE4vba3oOmQkoBsY0SeB+68eyfdbC+mXFcPo/kk0OEKtw+wpfdhxoAxfIIyqwI8uG4rdrDYa5RsC1XgPbCT2rB+g2qKpXPkBlqxBGMJeNGPD6w4ilaYY8TlqArKw0UzQ2Ho7jAohms8QdOHft574S+5HDwepWvYOtmHnE7LaiI0y0TstmhH9kti4uxhVVRiRYUDbl0/SNb/Gd2Aznh0rOWPKMOY5LZRX19ypv+GigcQ7Taf8G7eYVGIdltqRBoDs9BgUBRKdFh6/eSyFlTUjERt2F9MnI4YdOXU3fIx3Wsg4yd38UEijoJGNxnKLXIBCczojh9XIjDN6cOG4LAxqs4oQXYiJANUr3yPsLif2nJvx5+3Au2sV9gHjUdJHy2hAN6ToLUns3gFKS131dlrsTpKSnBQXV5/0NRo6Fa4gFpMBh/VY+k+DQUXTTp5rX1Gg3BOkqNxDnMN6yqwTANZQBaAQNEdj8ZcRxEjQFHN6DTuFprS7u4nENoO0u7XKao7u3n9C45+zNVSB3xiDgo45VIXPGFv7nK7rFFcHKKn0YjKq9Ex2EK3XvMaoB1CDHgLm2JoFxKUeHDYTKbFW1CZmHdpzuJo/vb0ejy9EWkIUD1wzksQjm3y5/SH++OY6Dh7Zcf2KqX3ZfaicHQfKuXhiL2xmA3arkd5p0WQmRjXYWyclOfnvJ9t454vd9Z676eKBnDMyvVtewEViX9LWbbYEytDLDhFKHYoh6EEp3kU4dQhhtWOzQrV2u5vbh3Y3MhLQxVT7Qrz9xS5WbjlMlM3Ej68YxuAesQC1WSdORtch1mYi1tb0i3ivMYZ9h118uHwzFpOB6RN70TOprVKPCiFE6zt60a+j1AkAoGYaUXK0heToY1N+fNS8JqSYwVwzF99hMTY5f//x+qU5
+f1dE6n2BkhwWuukU95fUF0bAADM/3oPA3vFce9VI3hl0Vaq3IHa5x64ZhTDe8c12PeOH5TCN+vyOFx2bOpGRpKDkf0ie/8WcXr85niUtJpFwJrJiZIxRn5/ujEJArqYTXtLaueXur1Bnn93A8/cPanBNQCtJb/My1P/WV3bEazbWcxTd02ovZMlhBDdgaIoLdr1vE5ZKuSXesk5XI3ZZCA7PZr0OHu915VV1881nl/sZvO+0joBAMC/PtzaaOa3uCgzj9w4ltxSF8GQTpTNRFqcrdX2NRCR4/g/AQkAujcJAroQVVXIOXxsOExVICPRRrUnQKz96I+y9Zdt5Ra563QEobBGfom7UwUBqqqgKEqTRkOEEOJE5kA5evE+9IwRGCprFtkGYrNPOb3SH9IwqirHp+QvrvaxN6+KfyzYUvv+mCgzj94yjqQT+s34BjbfinNaKCqrvxiz0hXA4w9hMzWcJchsUtmTW8nCb/dhMqrcPH0Q4wYkNXnakhAiskh2oC5E03RG9qtJa6cqcN+FSfxs6GF6xqmYCjdjKtyMorR+2B7XwMV+bCcJABQF8su9/HfxLn732hqWbz2M2y+Zi4QQTacoCnrRHso++jOB79+mdN4zuFYvwqjVv0t/VCCk8e2mAn758ir+750N5B25aHcHwny1Lp/3l+ypE0BUugN8t/UwygkX5L1To+mTUXeKUX6Jm9EDkuuds09GDDEnGfXdV1DN/K/3EtZ0fIEwL32whcPl3qZ8BEKICCQjAV1M/8xo7v/BSJZuzKO/00fwy/n4inbgz9uJffBkLEn9CKm2Vj1nrxQHs6b2ZcG3e1FVhesuGEB6fP1h7Y5QWOHjf//1Pf5gGIDdhyqYMiqDGy8YQEs2yxRCRA5d19GzRhE1/DzcG75AtTqIPucWvErjiyF3HKrg3x9tB6C00sfTr63hqbsm1i4uLquqH0DsOliBYbJCKHQsOoiyGPifK0ewaU8pyzblkxpvZ+roTBJjrew8mMbKzTUpQ+Ojrdw2c8hJdwFu6Jzl1f5O018LIToXCQK6GKOqMiI7nlH9EjCgESi7ANf6z1GtDqImXImvlQMAALNR5ZIJPThreBqKohBjN3aaeYI7D1bUBgBHfbM+j4sn9CS5gWF2IYRoiKHiIJ7ty1CtUWg+F/69qzH0m0JYqX/nXVUVdh6sqHPM7QtRWuXDaTeTX+yid3o0+/PrbuQ4fnAKoVD9KYsxNhNnD09l6sh04FiSh1unD2TGxF74A2FS4+0NrgU4XmZy3TSiZqNKWsLpbxQmhIgMEgR0UbqmoxRtw7XhC0yJmQRLcvGs+xjT6MvbJpWXTu0wdGcJAAC8jUz9aeiLVgghGqXrWDIHEX3OLfh2r0IPh1AaSYyvaTp9M+pmWLOYDSSavCSYPEwclkaFy095tZ+KI/sKjOyfxIi+iSc7fb01TQZFIT2u6Td2shId/OLGsSz4di8Om4mZZ2WT4DR3qj77lBSo9oYwGBSizMZWW6gthKhPgoAuStdBiYrDMepCrGNmouVtBSCsNr6tfHMEwzrBsEaUxdApv0gGHEmPerzs9GiSYlt/REQI0X0F4voQdd6deFUbhoHnAhBWGu9PB/aM5cpz+rBw6X4SYyzcfkEWykf/i3/wZMaNvYpyT5hhfRKpdPkxG1VS4u0EQxpBTcfUSnMVPYEwhuPKUhUYkBHNL64bDYqOrrXspo2igKqF0FQjqh5CUWrWQvg1A1aTodXTUFR5Q3y86gBfrD6E1Wzg6vP6ccbgFMyN7GofKM1DJQqNmgxI1mAZflMcuuxrLESTdEgQ8OKLL/LJJ58AMGXKFB566KGOqEbrU6DCFUCnJl3b6TLrXvSQFT3gxR8MYrcY0U02dL2mQ9OAclcAk0Ehxm7Cb0/HNG42pT4IxwwjyezCEPJgNULQ48ZliCHW4MFrjEdRwBT24FdtlLuCGAwKBlXFFwgT5zBhUBQUNMxlu9EdSWgGC5TsY68vga15AUb3spOU
nFgnv3VnkJXk4KfXjOI/H2+ntMrHuEEp/OC8fq32JSuE6CQUqHAH0TSdOIf5pJd5iqJjDHsJqnaMmhdNNddeKJ7sBKUBE/6AnzinhUp3AE33owDJdo3ygJFgWCfFoaOHdaJMJmac2ZOZY+Jxbf2W0OJXMCRmYBk5A7+m4LQacaLj8YXYlVvBlv1lhMM663cVMWtKHwZlxaIe108Zwj50gwkNA2bdS0Bp/EZGSXWAT1YdYOmGPKJsJmZN6Utqgh2zSSUjIQpjE3f3VRQwaTXnMug16UiPBj6KAqbSnYQK92POGIhWeRh/4QFM6QMorAjzVX4054/vcVrTLlU9hKKFCBusmDQvIYO19vsNBT5edYDPv6vJzOTxhfj3R9uJdlgY0Tv+WBlomF25KEYThz/8M/Hn3oAWnQGeckrm/4G4C+4glDZCAgEhmqDdg4AVK1awbNky5s+fj6Io3H777SxevJhp06a1d1VaVTCss2R9Hu8v2Y2u68w8K5uLx/fA3MSLZouvmOov/4l6xqUYq8sJ7F1POGMAhtgUSB+O2x/mva/38c26XMxGlR9eMpjxA1PYfNDNX+Zu5Ipx8Zx5+F3iz5hO+foviBpwJk6bneq963GOmU44FMa1ehGHsi/jqbkHMKgqF0/sxeY9JaQnRnHNef1INFRRsuh5TAmZGGJT8W5bysDLH8QXYyJhw7uYxs2CpL5t/EmeHlWBob3i+N2dZxIIakRZjZLySohuRtN0lm09zH8/3UlY05g2vgeXTe6NzVT/wl7XNYwFm/Ht/g7HxKvxrl2EMSUbQ+ZQDL5K/I7Mmqk3mk5RpY9gWCM13s6OnDJe+mALg3rFE+s089XaXAyqwvUX9MOgBXn9yxyCIY3bz09nAhswJmRhzhhI1eKXcPYbQ4XRRLD4EHrJfpSUIei6wvbcKp59Y23t3Xir2cBlZ/fh/95az8+vH8OgrJopRcawh8D6RZgSs7BmDKDq85dxTrkRvyOzXvtc/hB/fGMtxRU1WX8qXQH+/dE2LjqzJ99vK2T6hF6cOzq9SUGAuXwf1d/NJ+a82/Dt+R7CQQyDpxFWzFj9JRR/8H/ooQDR42YQLMvHu3c9ht2rSTvnh+z9rpTvtxfxm9vOaNI+NaoeQsn5nmDJIaJGXYxr+VtYB0wkmDIUUKj2hvhi9aF671uy5hCj+yYQDusoChiLtlE8/1kcI87DOWwKRfOfw953DCgKut9DxZf/IuGa3+Izxp36AxAiwrV7EJCUlMTDDz+M+cgOjH369CE/P7+9q9HqckvcvPPFrtrHC77dx+Ce8fTLaOLukoqCFvBQsugF9HAIa9Yg9LJDVFkScSoqu3LL+WZdLlAzHPvygi30SHXywnsb8AfCfLqxkiFnXUR0VQlhVxkVK+aiB3zY+4+neu0nePeuw5wxiF2HqtD1mlz/i5bu44qpfZn/9R6G901k/IAkEi5/kJJ3noD8XURPuZ6qr16ll89NGAhqYFU615qAoywGFUsjQ8ZCiK6toNzLq0cy8QB8/t1BhmUnMKRnQxd6CorBgGfnKnz71qP5vcSfewPBzYspX/cpSTc8RZUxgQXL9vPxigMADOmdQGqiHa8/RK/0aOZ9tQeAUFinyqsx/+v9taVv3F/FuN4GXIv/gWqxg2oASzQFEx4k9dBiAq4K1GQNf1jhtY+31ekvfYEwBwurSU2wM/frPTxywxgMCuiKimo0UvH5y0fKNNbcim9AzuHq2gDgeF+tzeW88T14+4tdjBmQdNJUorVUlUDBbkreeBTN7yH67Os4uteMz5xA3MU/pmzRC1St/qjmkzWaiRk/k0/2m9hfWFOH/QVVjOqTcMpT6YqKajDgXvcJ3q3foAX92AadXXs+g6pgMRnqrfNy2s1HXqPXTIONSceUko1r45c1LzAYsaT2pvK7D1FtTuKv+IUEAEI0UbsHAf369av9/4EDB/jkk0946623mvz+hARHW1SrxbYeqqh3zOULkZTkBCBYWYwpJolg
ZTFGZzyKWvcOlq47CJ9xBeUfvgBAVL+xlO9cy2qjk0tHWalw+U94Pbi8QfyBmsw4gZCGplpxbVmKY8hZVK5aAIBj9AUUvfMk6BrGkdNZ+GZRnXKOLkQrrvASZw9T/v0KQAFVxbPre+z9x1O1+iP0QdPICcQyKdHZ4s+qMUc/q0gSiW0GaXdH6az956nsPm6TxKOqvaFGP09Hr8G4MwfiO7gVU0I6YVc51d9/SNLMe4lK68H+fWW1AQDA1v2l9D8SUJyYVODExbqr97q49fzJsOEzNL8H+5mz+Tw/hv9+k8P5w8ZRuBp+NtCCIaRRWFb/Yr2wzENijI39+ZWYrUbio22Ak8DI86he8zGa30PsWT/AkdmHaGP9C3nv7pIG2+wPhjEc2fHYajU16XdNi+mLf8jZVK//HNViJ2bIREzxx6behGzDqU7pSbCw5rOyZY/Cu38DYx1lrMnIYFeeG4+/8Z/DiUKWUVTHpREqL8DaeziOXoMw2I+997oLBvDKoq21jxUFLp7Yi/j44zMcOVEnXUHRvP8DwJo1iEDhATSfi9izrsKRkU10I5updRcd3Y90lEhtd1vqsIXBu3fv5s477+Shhx6iV69eTX5faakLTet8t6KTY2yYjCrBI18gBlUhNcFGcXE11nAl5XOfImbK9VQue5v46f+D15Ze5/1WTx7lH/8VS8+haNVllC9/n5jxMzkrsAtPdV96ptb95XfYTKTE2emR4uRgYTWzz0ggZutrxJ51FSWL/oKt9wgCpbmUfvRX4s+/BdeWb/F++jw/vegn/H5hzciL9bjt5Af0iMNdUUWgNJ/4S+5FsUdTufgVVHs0ev8pKNsXM6jPEEpKEttkJCApyUlxcf0v+u4sEtsM0u7WKqs5Omv/eSpJ0VZsFmPtXWJFgYykqAY/z8QEOxWrP8V3cBuOMRfhWr+YkKsca9Zg/KX5hIrLqaz213vfURazAVWBox+TAkRZjbh9Nee+7qwUtK//hmqxYes7FvequYw65z7eNap8sbkSgPwiF8mxVsYOTGbNjro3Xgb0iOOb9bmMG5xK2B+iuLgai+aicsEcVEsUtr6jqVj6LsaUPvjjB9RvX0zDc/CT42xUuv1cdnYfLAZO+bumKArGvDVUr/+cqJHT8GxbStHHL+G44G4Cig1T2I1v1Ts1AYCigq7j2fUdjnEzCccN5NDKmmAkKcbWpN9rI0ECq94mVFmEY8zFuNZ+QsXaz2DQhbWf9bgBSThsI/lyzSGio8ycPy6L9DhrnfIt5bspWfA8isGEGhWD78BmHEPPxpI5kIql74EtFr3XmU1YA9I1Sf/ZeuWJDgoC1q5dy3333cejjz7KjBkzOqIKrS45xsLjPxzP4u8PEgxrXHBGD9LibKCD3xhL9OSrKV3wHI6R5xO01d8JMmSNJ/7iOzHEpRPyenBXlhOyR2GNTiSkKfRMiuKRm8by2Xc5xEdbOW9sFtFWI/deNYKlG/NZd6iSsefeg+o0E3/h7YRjs7CiYaguRE8dgDNjGHrJPtyWNMYNCmO3mRjeJ5FVWwp46PoxZKc4CKgKjgvuJmiIAkUhdvrd5Be72OwcwDkXD8WQ1Itw17t+EEJ0cbF2E4/dMo4v1hzC5QlywRk9yEpoeOGsohowZY8lISEDY1w6Jmc8poQsAhWFVK9aQFy/CWQkxhIfbaGsqiYYMBpUhvdJIBAIcbjExU+vG8O3G3IxGQxkpTr45TUD+XJLBRWuAP16JhE74BqwxRC2J0LqAD7apxA4cgMoKdZKrMOMCvzg/H4cLvWQW+wCYFifRAIhDUVRuOiMHrV19qsOos++FsXqJGxLxNJjCFp0WoPty0iwc/64rDrz52s2cRyI3WqgR7KjSesBdF1HScwmfsa96Bn/396dx0lVn/ke/5xau3pf6I1eAWn2HRVwAST2NTar0YhmMIbMJLnXV5wk9w4xhmtmMiZRbl5xifEmNyZmcV7jK8YtMmKcIRCNC8giKCog0DQNTdP0vtZ2
fvePhoJm7dbqharv+y+quuqc8zTVT53n/H7n+U3BN/YqsG0Cju7vLSvURfB4NZbbS9ZN3ybQ2kjLy4/RVlfD68dH0ekPMWdSPiW5vRtdCuHGN2kBCaOvIJw9hqyCsVgZw/GfdqweV/c6ONNHZwHWWaMwAJY3GVdSBmlzlhFuqaN97xa8hWMI292j4q2bXiSjYAJ+d3qvjksknllmgJvw1tTUsGzZMh566CFmz57d5/cP9StZzhPz0k9PXt5QM00vPIC3YAydezaTdctquhKHn/Xe+jY/2/bUkZeZyIQRmTg5e/690+nAGNPjd2BOdM041tBBXmZid+eMyHxSQ/jEmbt1Yqj45Dag+8vjQj31T8Zj23a/3gsQj1c34jFmUNzR2tYnMdTz58U4HBaWde6Tw5NO/p6tE/cvJYQa8LsycJgw7mAzXZ7u+ev1bQHe23ecrkCYyaOGUZDlw7KsSE7szn0msj/L6t539+/PRPJhVyjMX7Ye5tXNVZTkpXD79WPITT91tT4Qsqmqa8cfDNPY2oXTYVFWlEFWcs8pK9Zp91udzNXn4w/ZVNa2sqeqieREN5MvyyYr2d2rk/8zXWhf3kADlr8Vf2oJFgZHw34q23xsPRSgrCid0ryUc96cfeH9nWhx/SnuL0sIt2DjJLDtRRInzqPLl4cr2ELXOy/gm34j/oSzL7TFCuXP6G1PBqEIuP/++3n22WcpLj51FWT58uXcdtttvXr/pfglZlk2Ce01BJNycbXXEkrIJHTGyr7tgTD3/b+3aTwx9/8fFk9g9vjcXmwc/vbeUX699gMAEhNcfPfOy8m/xPrkx2Nii8eYQXFHa1ufxKWYP/vqXL9np90JlpOw5cFjtxN0Jp12wv3pmx1YFnQFbdxOBxfqTnyhfVkWBG2D02Fh9fJ4Tm5voP+movE7i4asNDf1zcHIY5cVImRie/kj5c/obU8GYTrQ6tWrWb169UDvdlAZ46AzsQAMhBILzvp5IGRz+Hg7c2cUcriujXc+qOXI8faLXg2C7hGA3637KPK4oyvEX7ZUs6K8LOa/7EVEzsXpdERGC5x2J6HtL+FISMZXdiWNLz9G2twv4E/vbnccjZNZY+jVGirn21cwbLNt73FefH0/2ek+Pr9gNAWZiZ94e/1tKBQAAA5PAnCqCIj1AkAk2vQXM8g6A2F+9R8fsG13HQDjSjOZP72QOZPze7VcejhsEzpjaLy5PXBiOpDBckAoDC7HxQsKEZFLWXNHkG1vV+IPhMnPSqQwKwmnw4UzJYvmjb+nddMLWG4vuIfWSOmewy384oX3ATha38G+w8388KtzSPXpK1pE+o8aqw+yg7VtkQIA4MPKBqaNyen1dJ6MFC/XzTy1oIxlQfkVxYTDNo3tAZ7+y8fc98QmXt5cRdsZ/ZdFRGJFyDZ8cLCRjVur+eWL7/PYH3fy7v56QpYbz4ipWO4ETChA4rg5hJOGzpxxy4KPq5t6PNfRFeJ489ntRUVEokmXGQaZPxg+67lQ+OznztTSGaLqWCsel5NFV49g8qhsmtv9lOSmUJSdhG0M//bq7kiB8cz6vWDgxiuLNSIgIjGnpTPIurcqqT7WRrKvu53o48/u5Gd3zyCw/v9iudwkTy+nddOfyCoch5U/NarTWtzBZhwtRwhlleFqPwrGxp9cdNH3GQPFuT3nJ7ucDjJSvNE7OBGRc1ARMMhK8lJITfLQ0h4AICXRTUnehVcZbmgL8MPfvkPDiV7XM8fm8PcLx+M5bU5qmz/MgcPNFA1L4NDxLsYUJvP2rhquv7wQ13lWohQRuVQ1tfmpPtZGWrKbVfM8BC0Pf620cB/aRur8FYQtF6GkXIYNL8POKI5qAWBZFtTu5vjLj5N61S00bF2HO7uI5Bv+kYB17r7+pxtbnM6yuaN46W/7SUvy8pWlE8lIiu0Fr0Rk8KkIGGTpiW7+952Xs/tQEwaYMjqbZPeFZ2ntrW6KFAAAWz46xo1zSinNOdWv2ed18N3PeLA7
mtncXsBV7g8wwyfhdlhD5qYuEZFoCNk2gZDN4mtGMj7PRcpH/w61e1laOJnOv24hcen/JJg7CWMgnD0x6vs3xmAKppAwYS4tbzyDIyEJz9yVBB0JvWrb6XM7WTSnhOtmFOJyWnidmqkrIv1PRcAAuVBLtawUL3NOtAM9sw3W6R2CTv7bPseGzuwE5DUh0v01tG95nrnDSrGPV5I2vAiDoXsNzMExVFrLiUhsMMB/bT3MH9bvBeBVj5MH7/g7rJf+BSq3kDhzIeHcCRfNO582N9ktxwjs2woOF7a/k/rdO3CVXU1a0sVHAk4GkuSJvVVue9PlTkQGh4qAftbUEeS9/fU0tvqZMCKTEXnJOHpxEt7eFWLXwUYO17UxtSyb9s4gew41UZqXyojhaST53LR3drdGG1eayfBhST3eH3Z4SJx8Pf4P/0boeCW+sstxlswgaAanAGjpDPH+gXrqmjqZNHIYpblJODQtSUQ+pca2AH/8y97I44lFPhwfv4Yd7MKdXYhduw9XUxXhtFK8/uMQ6CCUXoLn+EeQkErAm0VlbQs7DraTluRl0qgs0hPdfToGy4JgezMmOQt71p0MsxppPHIEn78WLy78SWe3ho51YWM4cLSN9/fXk5fpY8KILFISdMohMpToLzLKDBAM2SR4nHQEwjz6zLtU1nRf2X/xtX38w9JJTCzJIPkCydAAz762j43bDuPzunA6Hbzw132Rn980bxTf//srOVDTgsftZEReKgln9Kh2mgBd764j1FKHp6CMzj3vkDByOlbJLMwAFwJB2/DLP+1i14F6AF58bT/3fvFyLsvXYh0i0pPDYbBti+4ux2EM5786HrINbf4Qpw+EdgQMeJNh8b+wsQp2flzP1YfczEgMYG9bS8fut0m/ZjnHNj5F6rRyavOu4p9/vzsyClCcm8w9K2aelVMvxBiodpWwv2A5M1qO0vDaL3BdcQtdr/6UkC+V1MX/hN8aWm1J+9vHh1t48KmtkcfTx2bz35dMxKmLPyJDhoqAKKpr8fPMho/5uLqJz8wsYvLoYZECALq/KPZVN+HAcMWY87eoa2oP8tfthwGYdNkw3n6/psfP//T6fuZPL2T6ZcPOu42w5cFTPIGMrEIcxVMJfbgBZ1YxoUEYCTje3BUpAE56670jlBWM1YJmIhLh9dcT2PMGngkLoOkIdnsDdtFM7HMUAu3+EL97ZTcNrV3MnpTPW+9158mPDnfgXzyPZ/96kLfeP9r9XFUTQdtB+RXL8B/6iMb1v8GTfxkGw+bd9T2mAVXVtnG0oaPHPVa9kZ+ZxNu73PyxxsPnRl1FaPMzWO4E0pauoivOCgCn08GG7dU9ntv2UR318/3kpPVyepSI9DsVAVESCNs8+swODte1AfDHDR/j9bhI8rlo7zzVnz/B42JfdROzxuWe9wTY43KQ6HXR3hWiuc1PZmoCR+s7Ij9PT/bivtDa9CePKWssVpZNCAeO8eX4zcDfbBYMGyzL4pqpw3nrvaORhc2yMxI1T1REeupooOXtF/AcfJ9g7QE8xeNJGj4B23n2CfnHh1t458NaADJTElg27zIyUryMK83E6bQiBcBJ//XOIeaPKCDUchyAYH01SWOuJDN4dl70efr+1ehxObh57igcbTV0vLwbGzDBLkLH9uMozjpnIROrjDHkZfacoupxOfC64+d3IHIpUAuCKGluD0YKgJPe+bCWlQvHc3L0s/DElaVJo7IveAU8OcHJV5ZOwuW02H2wkcvH55J0YvqQx+XgK0sn9WgHeiHmxH+xPQgFwOGGDn74+y3c+/M32XOwkeXXl5Ge7CU/K4krxuXqBmER6SGYNZq0a24lcGQPxhjS5t9J8BwFAEDbiXuioDvXPr/xY4alJTC2NBOfy0FeVmKP15cVpcHxSnwjp5J354O4M4eDZTFt/HBKc7tfa1lw+/VjyE77ZD36XU6LhGALDoeTnC+uIXnGjfgPfYgj7L/4m2OIbRuunpxPdnr3VX+Hw+LL
iyeQntS3ey1EpH9Z5hK7HFtf3zYkp5D4wzb3/XITdU2nVnm87foxXD9zOFV1HVTVttLQ3MXw7GQmj8zEc54WcJHuQBbUtwZobQ+Qk+EjbBuON3eRkewh/RLoH93SGeTeX7xFR9epURCnw+K7X7qc3HQfvjOuCJ3ZFSkexGPMoLijta1PYqjmz5O8zfup/+OPcKZlE2o8im/MLHxXfYGgM+ms1x5u6OT//NtWFlzevUK61+NkzsQ8LivOpK6ulUP1HTz89HYaW/2U5qfyP26aTG4yOMJdBF2p+AJ1hNypBC0v4UAn1c02Po+T7DQv1qfooGZZBm+olS5nKi67E8sOE3T1bWpRXw3Vv6nOQJjapk6SfW6GpXiIdme6oRp3f4rHmCH6cX/SHBprNB0oSrxOB99cPo1fr/2Ayppm5s8oYtaEXDAWxcOSKMlJ7r7y3duay0BWsoes5FMn/CkJ/ftFEk1Hjnf0KAAAwrahvrmL0uxLJw4RGUDeZJImziVh5hJMfRXhlmPYjnNflS/I8vH1W6aw5qmtBEPd0wxDIZuS4ekAFGUlcv9XZtHWGSI92YPbYREGwq7unNrpyY5sy+nxUZJ95h4+GWMsupzdCz6GHL64Hm/3eZx9vrdCRAaOioAoyktP4NtfmI4/GMbncUauefhDNofq2uj0hyjOSSGtj+3nLkXu80xX8ro0J1REzs2fkIN71m34jRMrdzxWzhjs83xNOSyLN9+riRQAcyblY4DXtlczPCuJo/VtJHhdFGUn9+oeKhGReKMiIMqcFiSetuBLyDY8vX5vpNtPSqKb1V+6guyUTzbn9FJRmJ3E6OJ09lY1RZ7LSfdRkqchOBE5v5Dpzp/GWJiLfEWdnNo0a2Ie9S1dvHmiQ1BWWgKzJ+Wz9m8HmD+9kNuuH41LrSlFRHqI44HKgVHb2BkpAABaO4K8v7/+Au+IDV6Xg7uWTeKOz45l4sgsbv3MaFb93QwtFiMiUWHbhutmFOFyWuRlJbH7YGPkZ/XNXbhO3He1YVs1tY1dg3WYMkQ57S4cdI8ieUx83bgtcpLOyPqZfY57AMLhoXtjXjSl+tzMnzqcz8woJBw2agkqIlFVnJ3Ev35lNofr2i/4OuUeOZ3L7iT07lrcOaV4ckfSsv7XpFz7BfyJwwf70EQGlIqAfpaX4WPm2By2fHQMgASPk0kjswb5qAaOMd0364mIRJsxhty0BFITPYwuTGNvdTMAqUmeSA+Gy8flkpveu8W6LMvgCPsJOxJw2n5shwcT5Y42MgQYgx0O0fDy4zh8SZHnROKNioB+5nY6uPPGscybXkinP0RJXgrZqV7lGxGRKPG5HXz95ikcONqCwSI73UfN8Tb+1+3TKc1Lwe3szYm8wV37AV0Hd5I4czFdO9bhzh9NKG+yCoEYE3ImkjS1nPZ3/xO7s420a28nmJQ/2IclMuBUBAyARI+L8cXpkccqAEREois5wcWk0sxIP/H8EwtV9ZaFwQS7aN/+Kl17NhFubyLjhq8BNsTRar/xwBtupfE/HsWRkEhCyWSaX/t3huWOwJ8xerAPTWRAqQgQEZF+4TAhnLafoDMJd7idsMOLbQ29rx3LAne4EwomkjxlAaHGIziS0nEWTyHUDwVAbXMX1cfaSE30UJybjLeXK8BLdARcyaTOuQUrKQPbl4m3dBJ2cu5gH5bIgBt62VhERIYEb8cRcHq6hy/tYJ9unLSwcVS9Q2flTlKuvpXWvz1NQukUTMmVmDMa09U2d/He/nqy03yMK0k/74rq/cGywF2/h9Y3/kD61TeDHcSVkQ8NR/C/uw7ntCWETfQKgUPHO/jXJzcROtEgomJOKTddO+JTrVIsfWOMRTB3YmRU3iqZo5vHJS6pCBARkbM4LAgd3Uvb1nUAJM/4LI5Rw7F7vei5A0diGp17N9O1byvGDpM4/lpCZxQArV0hfvjbd2jtCALwj5+fypSRmVGN5YLHacDyJBJqrKHu+Z9gQgEyPvMlvCOnYyWm
449iAeBwWGzcXh0pAABefquSudMKGZbiucA7JdpOP+dXASDxSmOQIiJyFtuAp3AC4dYGwq0NeAon9LoAiMgqwT2sGBMK4B5WDFklZ72k0x+KFAAAxxo7P+WR952dlIOv7EpMKIDDm4iraBLBnIn4kwqivi/HGYuWWXSPRoiIDDQVASIichaHBf4D20iZ/TlSZn8O/4HtOPpwsuq0bAI71hE8XkXqVTcTPF5FYMc6nFbPlsEZyR7+25XdxUGSz82kUQPbQtmygCM7ad/5F1JmLQHLouU/f4nb7oj6vmzbMG96Ae7T7gFYfM1IsjQKICKDQNOBRETkLLYB1+hrMCdv5DUh+rLOYdg48I6bi7d0MsGsMobll0FiBn7T89qT2+ngpmtHMG9aAYkJrgFfVdwYsHJGM2zZPxHKGUvmyMvBhPFbvVtboK8KsxL5wVdnc+hYG6lJHoqyk0CzUURkEKgIEBGRcwo5TjsRttx9fr8/IRsSssEGf2bZeV/ndjrI7WNLz2gKuNMgOw0MhJML+3VfxsCwFC/DUrz9uh8RkYvRdCARERERkTijIkBEREREJM6oCBARERERiTMqAkRERERE4oyKABERERGROKMiQEREREQkzqgIEBERERGJMyoCRERERETizCW3WJijL+vWX6LiIcZzice44zFmUNzxuv+BEi9xni4eY4b4jDseY4b4jbs/WcYYLVguIiIiIhJHNB1IRERERCTOqAgQEREREYkzKgJEREREROKMigARERERkTijIkBEREREJM6oCBARERERiTMqAkRERERE4oyKABERERGROKMiQEREREQkzqgIGGRtbW0sXLiQ6upqAN58800WLVpEeXk5Dz300CAfXf947LHHqKiooKKigjVr1gDxEfcjjzzCjTfeSEVFBU8++SQQH3EDPPjgg9xzzz1AfMS8YsUKKioqWLJkCUuWLGHHjh1xEfdgiLccqvyp/BnrMSt/DiAjg+bdd981CxcuNBMmTDCHDh0ynZ2dZu7cuaaqqsoEg0GzcuVKs3HjxsE+zKh64403zK233mr8fr8JBALmjjvuMC+99FLMx71p0yazfPlyEwwGTWdnp5k/f7758MMPYz5uY4x58803zZVXXmm+/e1vx8Vn3LZtc/XVV5tgMBh5Lh7iHgzxlkOVP5U/Yz1m5c+BpZGAQfSHP/yB733ve+Tk5ACwc+dOSkpKKCoqwuVysWjRIl555ZVBPsroys7O5p577sHj8eB2uxk1ahSVlZUxH/cVV1zB7373O1wuF/X19YTDYVpaWmI+7qamJh566CG+9rWvAfHxGd+/fz8AK1euZPHixTz11FNxEfdgiLccqvyp/BnrMSt/DiwVAYPoBz/4ATNnzow8PnbsGNnZ2ZHHOTk51NbWDsah9ZvRo0czdepUACorK1m3bh2WZcV83ABut5tHH32UiooKZs+eHRf/3/fddx/f/OY3SU1NBeLjM97S0sLs2bP52c9+xm9+8xuefvppjhw5EvNxD4Z4y6HKn8qfsR6z8ufAUhEwhNi2jWVZkcfGmB6PY8nevXtZuXIlq1atoqioKG7ivvvuu3nrrbeoqamhsrIypuN+5plnyM/PZ/bs2ZHn4uEzPm3aNNasWUNKSgqZmZncfPPNPProozEf91AQD58vUP5U/uwWazGD8udAcw32AcgpeXl51NXVRR7X1dVFhrljydatW7n77ru59957qaioYPPmzTEf9759+wgEAowbNw6fz0d5eTmvvPIKTqcz8ppYi/vll1+mrq6OJUuW0NzcTEdHB4cPH47pmAG2bNlCMBiMfHkbYygoKIj5z/hQEA85VPlT+fOkWIsZlD8HmkYChpApU6Zw4MABDh48SDgcZu3atVx77bWDfVhRVVNTw1133cWPf/xjKioqgPiIu7q6mtWrVxMIBAgEAqxfv57ly5fHdNxPPvkka9eu5cUXX+Tuu+/muuuu44knnojpmAFaW1tZs2YNfr+ftrY2nn/+eb71
rW/FfNxDQaznEuVP5c9YjhmUPweaRgKGEK/XywMPPMDXv/51/H4/c+fO5YYbbhjsw4qqX/3qV/j9fh544IHIc8uXL4/5uOfOncvOnTtZunQpTqeT8vJyKioqyMzMjOm4zxQPn/H58+ezY8cOli5dim3b3H777UybNi3m4x4KYv3zpfyp/Bnr/9fKnwPLMsaYwT4IEREREREZOJoOJCIiIiISZ1QEiIiIiIjEGRUBIiIiIiJxRkWAiIiIiEicUREgIiIiIhJnVASIiIiIiMQZFQES01auXElDQ8Onfs2mTZtYuHDhRfc3ZsyYc25r/fr13H///QCsWLGCV155herqaqZNm3bRbYqIDAblT5HYpsXCJKa98cYbUXnNp7VgwQIWLFjQ7/sREYkW5U+R2KaRAIlZ3/nOdwD44he/yObNm1mxYgWLFi1i8eLFvPDCC2e9pqamhg0bNrB8+XJuuukm5s2bx8MPP9zn/T788MMsW7aMJUuWsGHDBgCee+45vvrVr0YlLhGR/qb8KRL7NBIgMetHP/oRzz33HL/97W/5/Oc/z6pVqygvL6e2tpZbbrmFkpKSHq/JyMhg1apVPPDAA5SWllJbW8v8+fO54447+rTfwsJCvv/977Nnzx5WrFjBunXr+ilCEZH+ofwpEvtUBEjM27dvH36/n/LycgByc3MpLy/n9ddf7zGn1LIsfv7zn7Nx40bWrl3Lvn37MMbQ2dnZp/3ddtttAJSVlTFq1Ci2b98evWBERAaQ8qdI7NJ0IIl5lmVhWVaP54wxhEKhHs91dHSwbNkydu3axfjx41m1ahUulwtjTJ/253Cc+rOybRuXS7W2iFyalD9FYpeKAIlpTqeTgoICXC4Xr776KgC1tbX8+c9/Zs6cOZHXhEIhDh48SFtbG9/4xje47rrr2LRpE4FAANu2+7TP559/HoBdu3ZRVVXFlClTohuUiMgAUP4UiW0qsSWm3XDDDdx55508/vjj3H///fz0pz8lHA5z1113MWvWrMhrVqxYwSOPPMK8efP47Gc/i8fjoaysjMsuu4yDBw/i8Xh6vc9Dhw6xdOlSLMviJz/5Cenp6f0UnYhI/1H+FIltlunrWJ2IiIiIiFzSNBIg0gdPPPEEL7300jl/9uUvf5nFixcP8BGJiFwalD9FhhaNBIiIiIiIxBndGCwiIiIiEmdUBIiIiIiIxBkVASIiIiIicUZFgIiIiIhInFERICIiIiISZ/4/+yMhMUUXWKIAAAAASUVORK5CYII=", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.relplot(\n", " data=modin_tips,\n", " x=\"total_bill\", y=\"tip\", col=\"time\", col_order=[\"Lunch\", \"Dinner\"],\n", " hue=\"smoker\", style=\"smoker\", size=\"size\",\n", ")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwEAAAFcCAYAAACQkLIVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACckUlEQVR4nOzdeXxU1d348c+9c2fNTPaVfd8RUFEBFVErCiIC7ntbH7X601bbWvXRLj5q9amP1K2ttrZWq+IGgigoihuLCMoi+x4IBJJM1klmudvvj2ggJkDInsz3/XrxejF35p57zmTmzP2eVbFt20YIIYQQQggRN9S2zoAQQgghhBCidUkQIIQQQgghRJyRIEAIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkCRIfxk5/8hOLiYgD+67/+i+3bt7dpfu655x5eeOGFFr/Otddey8KFC1v8OkKIzqk91p1nnHEGU6dOZerUqUyaNInf/va3FBYWAnDw4EGuuOKKNs2jEPFAa+sMCNFQS5curfn/3//+9zbMiRBCdBztse684YYb+OlPfwqAbds899xz3HjjjcyePZusrCxmzZrVxjkUovOTIEB0CPfeey8A119/Pc8//zxXX301Tz75JFVVVTzxxBPk5OSwa9cuvF4vN910Ey+//DK7du3ivPPO47777gNg8eLF/PWvf0XXdTweD7/5zW8YNWpUrets376dX/7yl3Wuf9111zFjxowG5TUvL48pU6awevXqOo9nz57NokWLUFWV3NxcPB4Pjz32GH379qWwsJDf/e537Ny5E1VVueKKK7juuusA+Pjjj3nhhRcoKipizJgxPPTQQ6iqdOQJIY6uI9SdiqJwyy23MGfOHJYuXUqfPn1q6synn36affv2UVhYyL59+8jKyuJPf/oTmZmZnH322UybNo3ly5eTn5/P1KlT+cUvfnHUPD/99NOsWbOGgoICBg4cyOOPP94M77IQHZQtRAcxYMAAOxgM2rZt2xMmTLDXrVtnf/nll/bgwYPtDRs22LZt2z/96U/tyy+/3I5Go3YwGLSHDh1qHzhwwN61a5d94YUX2sXFxbZt2/bWrVvtcePG2ZWVlY3Oz29+8xv7H//4R53je/futUeOHFnv47fffts+6aST7Pz8fNu2bfvBBx+07777btu2bfu2226zH3vsMdu2bbu8vNyePHmyvXv3bvuaa66xf/azn9mGYdhVVVX2uHHj7JUrVzY630KI+NJR6s7bb7/d/vvf/16rznzqqafsc845x66oqLBt27Zvvvlm+8knn6wpy6OPPmrbtm0fOHDAHj58uL1nz56j5vmpp56yJ06caOu63uj8C9FZSE+A6PC6devGkCFDAOjRoweBQACXy0VqaioJCQmUlZWxcuVKCgoKuOGGG2rOUxSFPXv2MGjQoJpjzdETcCxDhw4lOzsbgCFDhrBo0SIAli1bxq9//WsAAo
EA8+fPrzln0qRJOBwOvF4vvXr1IhgMNktehBDxq73VnYqi4PV66xw/5ZRT8Pv9QHWdWVZWVvPcOeecA0BWVhZpaWmUlZWxdu3aI+YZYOTIkWia3P4IId8C0eG5XK5aj+ur3C3LYsyYMfz5z3+uOZafn09mZmat1/Xr14+5c+c2KT+KomDbds1jXddrPe/xeOp9raZpKIpS89zevXtJSUmpee5I6QshRGO0p7rTtm02bNjANddcU+e5I9WZAG63u85zR8vzokWL8Pl8jc6nEJ2JDCoWHYbD4cAwjEadO2bMGJYuXcqOHTsA+Oyzz7jooouIRCLNmUUAEhMT0XW9ZgWO9957r8F5fPvttwGoqKjg+uuvZ/fu3c2ePyFEfGnvdadpmjz77LOkpKQwevToJqfXmvW9EB2Z9ASIDuP888/n2muv5emnnz7uc/v168eDDz7IXXfdhW3baJrGX//6VxISEpqUp5kzZ/LMM8/UPJ4wYQJPPPEEv/71r/mv//ovUlNTOf/88xuU1m9/+1t+//vfM2XKFGzb5uabb2bYsGFNyp8QQrTHuvPFF19k3rx5KIqCaZoMHz6c559/vklptnSehehsFFvGFQghhBBCCBFXZDiQEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkChBBCCCGEiDMSBAghhBBCCBFnOtwSocFgCMvqvAsapaT4KCmpautstLp4LHc8lhmk3M0hIyPQqPM6e/0J8fn5iscyQ3yWOx7LDM1f7sbWoZ2N9AS0M5rmaOsstIl4LHc8lhmk3KJlxeP7HI9lhvgsdzyWGeK33C1NggAhhBBCCCHijAQBQgghhBBCxBkJAoQQQgghhIgzEgQIIYQQQggRZyQIEEIIIYQQIs5IECCEEEIIIUSckSBACCGEEEKIONOiQUAoFOLCCy8kLy8PgGXLljFlyhTOO+88Zs6c2ZKXFkIIIYQQnYxl2xSURSiqiNK5tz5seS22Y/DatWu5//772b17NwCRSIT77ruPl19+mZycHG6++WY+++wzxo8f31JZEEIIIYQQnURplc6L721i3Y4iFAXGj+rGjPF9SXDLZmKN0WI9AW+88Qa/+93vyMzMBGDdunX07NmT7t27o2kaU6ZMYeHChS11eSGEEEII0VkoMH/ZLtbtKALAtuHTb/JYtaUARWnjvHVQLdYT8PDDD9d6XFBQQEZGRs3jzMxMDh482FKXF0IIIYQQnURV1OTz1fvqHP/k6zzOHJGDxAHHr8WCgB+yLAvlsFDNtu1ajxsqLc3fnNlqlzIyAm2dhTYRj+WOxzKDlLutxEP9CW3/PreFeCwzxGe547HMAKnJXtKTvRwIVtU63jXTT1pKAg6HrHVzvFotCMjOzqawsLDmcWFhYc1QoeMRDIawrM47FSQjI0BhYUVbZ6PVxWO547HMIOVurrQao7PXnxCfn694LDPEZ7njscxQXe5IVYxrJg7i8Ve/qTnuUBUmntqD4uLK405PtGIQMGLECHbt2kVubi7dunVj/vz5zJgxo7UuL4QQQgghOrBB3ZP57Y9P4dtdQVyag+F90uiS6m3rbHVYrRYEuN1uHn30UW6//Xai0Sjjx4/n/PPPb63LCyGEEEKIDkxVoFeWn97Z1UMb7c7dsdniWjwIWLx4cc3/x4wZw7x581r6kkIIIYQQopOSm//mIbMohBBCCCGEiDMSBAghhBBCCBFnJAgQQgghhBAizkgQIIQQQgghRJyRIEAIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkChBBCCCGEiDMSBAghhBBCCBFnJAgQQgghhBAizkgQIIQQQgghRJyRIEAIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkChBBCCCGEiDMSBAghhBBCCBFnJAgQQgghhBAizkgQIIQQQgghRJyRIEAIIYQQQrQKT2Ue7lgQAIcVwVu+CxWjjXMVnyQIEE
IIIYQQLc4d2kvRW49Q9v7TePUg5rr3KJj1B5Qdy1BtCQRam9bWGRBCCCGEEHFAc+PwJqIX7Kbgpd+AaYCi4gikEVMcbZ27uCM9AUIIIYQQosVFPZmkTv0lKGp1AACkTroVPWsIoLRt5uKQBAFCCCGEEKLFOawI4U2fg23VHKtY+S6uaHEb5ip+SRAghBBCCCFanLNsLxVfvQuKSuoFt6IlZ6MX5BJZsxBNMds6e3FH5gQIIYQQQogWpyf3JPHMK3GmdkPPGkJyRi+qvp6Pe+QkYrbMCWhtEgQIIYQQQogWZyouHAPPRlc1bFsh6snEffp1xGy5HW0L8q4LIYQQQohWYSpOsA89NiQAaDMyJ0AIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkChBBCCCGEiDMSBAghhBBCCBFnJAgQQgghhBAizkgQIIQQQgghRJyRIEAIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEGQkChBBCCCGEiDMSBAghhBBCCBFnJAgQQgghhBAizkgQIIQQQgghRJyRIEAIIYQQQog4I0GAEEIIIYQQcUaCACGEEEIIIeKMBAFCCCGEEELEmTYJAubOncvkyZOZPHkyjz32WFtkQQghhBBCiLjV6kFAOBzm4Ycf5uWXX2bu3LmsWrWKZcuWtXY2hBBCCCGEiFutHgSYpollWYTDYQzDwDAM3G53a2dDCCGEEEKIuKW19gX9fj8///nPueCCC/B6vYwePZoTTzyxtbMhhBBCCCFE3FJs27Zb84KbN2/mnnvu4YUXXiAQCPCrX/2KE044gRtvvLE1syGEEEIIIUTcavWegCVLljBmzBjS0tIAmD59Oq+++mqDg4BgMIRltWrc0qoyMgIUFla0dTZaXTyWOx7LDFLu5kqrMTp7/Qnx+fmKxzJDfJY7HssMzV/uxtahnU2rzwkYNGgQy5Yto6qqCtu2Wbx4McOHD2/tbAghhBBCCBG3Wr0n4PTTT2fjxo1Mnz4dp9PJ8OHDuemmm1o7G0IIIYQQQsStVg8CAG666Sa58RdCCCGEEKKNyI7BQgghhBBCxBkJAoQQQgghhIgzEgQIIYQQQgjRDny/jH5rkCBACCGEEKIdcUWDuCOFACjYeKv247D1Ns6V6GzaZGKwEEIIIYSoyxUrpnzhs9hGlJQpd2EG91Aw/2mSxl+FlXxBW2dPfKeyspJ7772X3NxcVFVl6NChTJ48mT//+c/k5OSwa9cuvF4vN910Ey+//DK7du3ivPPO47777gPg9ddf5+WXX0ZVVdLT03nggQfo3bt3rWs88sgjbNmyhb/85S84nU4ef/xxVq5ciWmaDBkyhPvvvx+/38/ZZ5/NCSecwJYtW7jrrrv40Y9+1KAySE+AEEIIIUQ7oVgmthHDCO4j+PrvKZ7/NNgWdiyKbVltnT3xnUWLFlFZWcncuXN56623AMjLy+Pbb7/lpptuYu7cufj9fp5//nmee+45Zs+ezauvvsrBgwdZvnw5//jHP3jppZeYN28eF154Ibfddhu2Xb2Zo23bPPjgg+zfv5+///3vJCQk8Pzzz+NwOJg9ezbz5s0jMzOTxx9/vCY//fv3Z8GCBQ0OAEB6AoQQQggh2o2oJ4OUKXcSfP33WOHqXXIDo6fgGHIuDo8PKuJvx+D26KSTTmLmzJlce+21jB07luuvv57i4mK6devGkCFDAOjRoweBQACXy0VqaioJCQmUlZXxxRdfMGnSJFJTUwGYPn06Dz/8MHl5eQC8+OKLBINB3nnnHVwuFwCffvopFRUVLFu2DABd10lLS6vJz8knn3zcZZAgQAghhBCinVCwMYN7sSKVNcfCO78hech4IL3tMiZq6d69O4sWLWLFihV8+eWX/PjHP+bBBx+suWn/nqbVvdW26unRsW0bwzAAGD16NCeeeCL33nsvr7/+Ok6nE8uyuO+++xg/fjxQPRwpGo3WnO/z+Y67DD
IcSAghhBCinfBU5VM8/ymwLQKjp6CldcUI7qPi479jVJW3dfbEd1599VXuvfdeTj/9dH79619z+umns3Hjxgade8YZZ/D+++9TXFwMwNtvv01ycjI9e/YEYNiwYVxzzTUEAgGeeeYZAE4//XReeeUVYrEYlmXxwAMP8MQTTzSpDNITIIQQQgjRTsS8GSSNvwo7FsUx5FySh4yn4uO/Exh/HZovESplOFB7cPHFF/PVV18xadIkvF4vOTk5DBw4kIULFx7z3HHjxnHDDTdw/fXXY1kWqampPPfcc6jqobZ5RVF45JFHuPjiixk/fjy33norjz32GNOmTcM0TQYPHsw999zTpDIo9vezEDqIYDCEZXWoLB+XjIwAhYXx9wWPx3LHY5lByt1caTVGZ68/IT4/X/FYZujc5a5eDtTGVKqHlriIEMPTqct8NM1d7sbWoZ2N9AQIIYQQQrQjpuKs9TiGp41yIjozmRMghBBCCCFEnJEgQAghhBBCiDgjQYAQQgghhBBxRoIAIYQQQggh4owEAUIIIYQQQsQZCQKEEEIIIYSIMxIECCGEEEII0Q48+OCD3HHHHbWOLVmyhHPOOYdQKNSs15IgQAghhBBCiHbgl7/8JevXr+fjjz8GoKqqit///vc88sgj+P3+Zr2WbBYmhBBCCCHEcfj06728tGATRSVh0lO8XHfBYM46qXuT001ISOChhx7ivvvuY8yYMTz11FOcffbZeL1errzySiKRCCkpKfzhD3+ge/fu/Otf/2LOnDmoqsoJJ5zAgw8+2OBrSRAghBBCCCFEA3369V6eeXMtUd0EoLAkzDNvrgVolkBg7NixnH766dx7773s3LmTV199lauvvpq//e1vdOnShS+++IIHHniAF154geeee44vvvgCh8PBf//3f3Pw4EGysrIadB0JAoQQQgghhGiglxZsqgkAvhfVTV5asKlZggCAe+65h7POOotnn32W/Px89u7dy89+9rOa50OhEA6Hg1GjRnHJJZdwzjnn8OMf/7jBAQBIECCEEEIIIUSDFZWEj+t4Y/j9fhITE+natSuhUIhu3boxd+5cAEzTpKioCIC//OUvrFmzhs8//5wbb7yRxx9/nFNOOaVB15CJwUIIIYQQQjRQeor3uI43VZ8+fSgrK2PVqlUAvP322/zqV7+iuLiYSZMmMWDAAH7+858zbtw4tmzZ0uB0pSdACCGEEEKIBrrugsG15gQAuJ0OrrtgcItcz+Vy8eSTT/Lwww8TjUbx+/089thjpKamcvnll3PJJZfg9Xrp3bs3M2bMaHC6EgQIIYQQQgjRQN+P+2+J1YEOt3jx4pr/jxo1irfeeqvOa2644QZuuOGGRqUvQYAQQgghhGgVqmpjWQoAigLYFnYHHJ1+1kndm/2mv7V1vHddCNEg7uItuCMFNY89Vfm4S7ZXV7pCCCFEK3PpZdjr3sWll6IoCs6iLWj7VqMoVltnLS5JT4AQnZCrcANFc59AC6STPPWXKJZJ8Zz/xYqESJ12N3paf2y7rXMphBAiXjjRCa94i6qNX+DeuwX/iecTnP8kmCYZV/6eSKBnW2cx7kgQIEQn47BjxPK3gWVilB2kZM5j2KaBVVUGgFGwEyWlN7YiX38hhBCtQ8eJ78QLiOxeR3TvBqJ7NwCQMGoihi+jjXMXn2Q4kBCdjKm4cAw7n8ApU6ofVwRrAoDEcZehDJyAJQGAEEKIVqb7u5I8/uqax4rmImHU+RgOXxvmKn5JECBEJ2SqHryDxtY+qGp4+o3GVFxtkykhhBBxS1EUtMJNFH/4fM0x24hR9tE/cOmlbZexOCZBgBCdkKcqn+Db/1v7oGVQ/M7jtSYLCyGEEK1BtaLE9m4E0yBh1EQyrnkE1ZdELH87arSirbPXruTl5TFw4ECWLl1a6/jZZ59NXl5es11HxgQI0ck40QmtmodZWQJAYOwloEepWPkuRtlBIusW4Tz1Cgzb0cY5FUK0F4oC5RGDotIIHpeDrG
QvaiuuJKYoYNrgUJBFCzopU3GhnXABaV36Y6X1JeLwkTr9N6BHiAa6g/zda3E6nTzwwAPMmzcPv9/fIteQIECITkbHie+0yzAry/D0GIY65FywLQKKgl6Uh/vEKcQkABBCHGZ9bil/m72OyogBwOjBWVx93gASvc4Wv3YwFOOz1Xl8s6WQYX3TOOek7mQkulv8uqL1GaoXMofXPI76ulT/p4MFALapc+CNRwHImvErDr79OADZl92D4mie70xmZiZjx47lscce43/+539qPfe3v/2NefPm4XA4GDduHL/+9a9xOI7/d12GAwnRCcVcKQTOvRl1yLmYigtT9eA4YRK+8dcT0xLbOntCiHbkQGmEma99UxMAAKzcdJCFX+1p8X1FqmImT7z2DfOX7mZ/USUfrtjDH19aSXnYOPbJQrSRA288SmTPRiJ7NrLnqZtq/v99YNBc7rnnHpYsWVJrWNDnn3/O4sWLefvtt5kzZw65ubnMmjWrUelLECBEJxVzJtWaBGyqHnRnUhvmSAjRHu0+UI5VT0vsR1/tafGb8bzCSvKDVbWOlYZi7DkoY8RF+2cbMaxoFbYRa5H0/X4///M//8MDDzxAKBQC4Msvv2Ty5Ml4vV40TWPGjBksX768UelLECCEEELEMdOsfyyGadlYLTxA37Tq3ynWqC8qEaKdyJrxKxRH7RH1ikMja8avm/1ap59+es2wIACrnu+MYTQuWJcgQAghhIhjPbMD9R4fO7xLi88J6JbhJ+CrfQ23y0HPrJaZCClEczj49uPYZu0bb9s0OPj2n1rket8PCyooKOC0007jvffeIxKJYBgGb7/9Nqeddlqj0pUgQAghhIhjXVK9/HTKUNTDlgPqnuln2pl9aOkFggIejXuvG83IARloDpUhvVO5/4bRpPplPxPR/imaC9XtQ9Fa9vP6/bAgXdc566yzOOuss5gxYwaTJ0+mS5cuXHPNNY1KV7HtjrUYVzAYwurE3YQZGQEKC+NvLGQ8ljseywxS7uZKqzE6e/0J8fn5ao4y29gEK2LkB6vwuTW6ZiTg0VqvndDCJhKzcDsdOBoYecjfOn40d7kbW4d+rzVWB2oNskSoEEIIEecUFNIDbtIDbbM0p4qCzyVLF4uOQXE4ybnygZrHh/+/I5HhQEIIIYQQQsQZCQKEEEIIIYSIMxIECCGEEEIIEWckCBBCCCGEECLOSBAghBBCCCFEnJEgQAghhBCdnqL88HFL74IgRPvWJkHA4sWLmT59OhdccAEPPfRQW2RBCCGEEHFCUSycB9bhjhQA4NJL0fJW4rBjbZwz0Zl8++233HHHHW2djQZr9SBg7969/O53v+Mvf/kL8+bNY+PGjXz22WetnQ0hhBBCxAFFAWfBJoJzZ1I69//wVu0n9NHfKX7vGdixBFXp3BvoidYzfPhwnnrqqbbORoO1+mZhixYtYtKkSWRnZwMwc+ZM3O622ZxECCGEEO2PooDDimIo1fcHTmLouBqVlm2DkpiNM7MXesEuCv5zHwCqLwmty2CitgwLEsevsrKSe++9l9zcXFRVZejQoUyePJmHH36Y+fPn89Of/pSioiIAqqqq2Lt3LwsXLqRLly48/vjjrFy5EtM0GTJkCPfffz9+v7/Vy9DqPQG5ubmYpsktt9zC1KlTefXVV0lKSmrtbAghhBCiHVIUcJXsILb8NVxmJe6q/YQXP48rVtzoNKPuNFIuuKXWsbQLbyfqy2lqdkWcWrRoEZWVlcydO5e33noLgLy8vJrnX3jhBebOncubb75JVlYWd911F7169eL555/H4XAwe/Zs5s2bR2ZmJo8//niblKHVewJM02TVqlW8/PLL+Hw+fvaznzFnzhymT5/eoPPT0lo/UmptGRmBts5Cm4jHcsdjmUHK3Vbiof6Etn+f20JnKrNZFaLoi8VUblyKFS4jmr8Tq6qMwIgJZAzqWeu1DS23XnKQgg9ernWs5MO/k3XpPbgzezRb3ltDZ/pbH4/2Vu
6TTjqJmTNncu211zJ27Fiuv/56iotrB6qWZfGrX/2KPn36cNNNNwHw6aefUlFRwbJlywDQdZ20tLRWzz+0QRCQnp7OmDFjSE1NBeDcc89l3bp1DQ4CgsEQltV5x+9lZAQoLKxo62y0utYqt4qFK5SH4ctCsXQc0RKi/u7YbfCRkr91fGnOcjf2x7Cz158Qn5+vzlhmz2mXY4RKCe9YDUDyOT8mmjqQ8GHlbGi5FQWcB3cQzV2P6ksi7aJfULr4JfSCXYS2rCSkJmN1kCFBnfFv3RDNXe7mCCi6d+/OokWLWLFiBV9++SU//vGPefDBB2u95uGHHyYcDjNz5syaY5Zlcd999zF+/HigelhRNBptcn4ao9WDgAkTJvCb3/yG8vJyEhIS+OKLLzjnnHNaOxsiTrmr8il49bcETp2KWREkvPUrMq55hLArva2zJoQQ4jtKrBK96NDQiti+rXh6noSlHX9vlm2DkTmYlPNvwZHeg4ivC0kX/D+MvetQ+pyG0UECANG+vPrqq3z99dc8/vjjnHHGGQSDQTZu3Fjz/PPPP8/q1at5+eWXcTgcNcdPP/10XnnlFcaMGYOmaTzwwAP4fL42WS2z1YOAESNGcOONN3LVVVeh6zrjxo1jxowZrZ0NEaeiviySzrqWsk+ru4VTp/yCiDsNOnfjqBBCdBhOooSWzsKqKiP5nBuI5m6gavNSvANGo+SMbFTPrYWG3fM0jO/OjbrTUPpPaJNeYNE5XHzxxXz11VdMmjQJr9dLTk4OAwcOZOHChRw8eJAnnniC3r17c80112BZFgB33HEHt956K4899hjTpk3DNE0GDx7MPffc0yZlUGy7Y30FOnt3tnT1tSynESKy4nWqNnwBQGDMdLShEzHU1l+hSv7W8UWGA7WOePx8dcYyu2NB7OI8jKyhOMwqlIJtmDnDMJVDdXVnLPexxGOZoX0OB+oMWr0nQIi25IhVENnxDakX3YlVWULFyvdIGTgWwyXL1AohRHsRdaWh5KRh22BpiShdT5JWeyGamQQBIq5EfDmkX/UQEWcyim2S0mMEUWdKW2dLCCHaDU1TsSy7zXuNDr/plwBAiOYnQYCIO2EtBWyw0SQAEEKI7+imzcY9JXy0ci+9sgOcdWI30vyN26CrNRjlQfhuAzHNioJtYjh8bZspITqQVt8sTAghhBDtz5a8Up58fQ0bdgZ5b9lunnxjDVHDauts1csd2sv+l3+Lu3Ifmh3D2PABsW/moplVbZ01IToM6QkQQggh4pzDobByU0GtY3kFIQrLInRLa1+t6xoxKlcvwCg9QPHsP+LtfyqV6z4GRcU3eByGv2Nt/iVEW2lwT0BZWRmhUKgl8yKEEEKINmBZNr1zaq+Y4nY5CHidbZSjIzNw4Tvtcrx9RmGFQzUBQOqUXxD1d2/r7AnRYRwzCNi5cyczZsxgzJgxnHrqqVxzzTXs37+/NfImhBBCiFZg2zCqfwajBmQAkODRuP3SESQntM85AZbDgyvzUIu/4tBwBNKwkY2/ROcQCoW48MILycvLO/aLG+mYQcC9997LpZdeytq1a1m9ejUTJ07kv//7v1ssQ0IIIYRofUk+J7dOG8b/3jaOP/5sLEN7JNMetxJyWDHMDR9S9uVcUFS01C7YRozg7D/irpJGStHxrV27liuvvJLdu3e36HWOGQSEw2GuuOIKnE4nLpeLa6+9lqKiohbNlBBCCCFan0NRSA+48bu1drssp6U60dK7geogdcovSJ52L+7eI9GSc7A1T1tnT8QRwzAoKirCMIxmTfeNN97gd7/7HZmZmc2a7g8dc2Jwnz59+OabbzjxxBMB2Lp1K926dWvRTAkhhBBC1MdGwcgZSdef/C8Vaio2Cv6zbgDbIuZKbevsiTixdu1afv7znxOLxXC5XDz55JOMGDGiWdJ++OGHmyWdYzlmELB//36uvfZaBg4ciKZpbNy4kYyMDKZMmQLAu+++2+KZFEIIIYT4no2CO6sX5YUVAM
ScyW2bIRFXDMPg5z//ec2CObFYjJ///Od8/PHHOByONs5dwx0zCPjVr37VGvkQQgghhBCi3SstLSUWi9U6FovFKCkpIT09vY1ydfyOGATs2LGDvn37kpCQUO/zQ4cObbFMCSGEEEII0R4lJyfjcrlqBQIul4uUlJQ2zNXxO2IQ8L//+78899xzXHrppeTk5NRaISAcDrN8+fJWyaAQQgghhBDthaZpPPnkk3XmBHSkoUBwlCDgscceo7S0lL59+/Lyyy9j2zaKoqDrOtdcc01r5lEI0QwUhXa72ocQQrRXqmLhKsvFTMhEdyTgipWgxiqIBnpInRrHRowYwccff0xJSQkpKSktEgAsXry42dM83BGDgF/+8pcsXboURVEYM2ZMzXGHw8HEiRNbNFNCiOblNCtRCrZgZQ3GUL24y3MBm2hir7bOmhCik4gaFnmFlRRXREgJuOme4cetHXMl8gZzGhVYDhem4gZALyvEgYZ57OmNjaZg4dj7DYXvP0vC8AkkjJ5C2cf/IJa/jbTpvyGa3LfFri3aP4fD0aHmAPzQEb85L7zwAlC9Wdgf//jHVsuQEKJ5aVaE6FdvUfntJ/hPmoS3/ykUzX4MsEm/7AEivq5tnUUR50JRg/xgFQleJ1nJHhyK7Pra0YQiBi/M38ja7Yf2ERrZP4OfXDgYv7vpN+lOo4LIslk4kjNxDD8fhxGmaNE/cfUcgaP/eEylhQIBRUVJSEJxaFR+u5jKjZ+DaeBISAaXr2WuKUQrOea3RgIAITo2Q/Xg6XcKlRu+IPT1+4S+fh8A74BTMV2JbZw7Ee+KKqL88d+rKAlFAbj2gkFMGNkFZJhFh7JuZ7BWAACwZlsh63dmctrgrCalrSgK9v71VG1eCkDA0AkX5hLN/ZbwrnVkdBmA6e/epGsciW2Dntqf1It+QXDOn8Cs3hQqddrdRHw5LXJNIVpL8/XTCSHaLT1zMMlnX1fzWPUkEDjzanQt0Ia5EvFOURS+3HCgJgAAeOWDLQQrokc5S7Q3qqrw6Td59T73ydf7UNSm9ezYto3S5QQShp8FQMWq+URzvwUgZcK1GP6mBRnH4oyWEFr1fq1jVd9+jNOsatHrCtHSJAgQogGcRghP1f6axw7bwFu1D1Wx2jBXDecqy6Xs89dqHluRSqpWL0Szwm2Yq0PcVmXN/xVFwW2GjnmOogKKgowc6bgUBcpCtdfatiwb3egY36v25vDvUfXjY3+PmksgwVX/cb+zWb6jltNLwsDTUBzOmmOu7N4YJftxFO9q+gWOQFVs9G1Lie7dgCMhmdQLb0fRXFSu/RilaBuKYksd1ECKYuGt2ofDNmqOear24zRa73MqapMgQIhjcBohIstfo+jNh3BX5OLAhB1LKPjP/Tj2ft0mgYCiQNS0iJnWMX+AnHaUyq/fxY6F8Q44lbSpd4GqEVr9AVo42DoZPgp3+W5KZz+EO3wARVFwFm2mbN6fcEcL6329adls3V/O02+v5w//+opP1uynPKy3cq5Fc7Asm7En5NT6DJ88KJP0RE/bZaqDclftp/Tth3CH9lQH0uW7KZ39MO6q/DqvdShmvf9vDEWByqjBuSfXPxzn3JN7YJlNH9vliJVTvvJdbPPQdz12YBegQCCj5phTL8MVra7XNLMKd6T+eqShLFtBG3AmCcMnkDLtN6hZ/Um74BYCp07FyhiAs3AzrpIdEggcg6JYaHnfUPCf+7G3f44DA3fFHorefIjIstckEGgjim13rAWugsEQltWhsnxcMjICFH63DXo8ac/ldpdspejNRwBQ3D78J5xNxcr5AKgeP2lXPkjEmXrc6Ta2zKZts25nMS8v3IxC9RjqE3qnoh7lV8ill2LuWIHabyy6FsBVuAmw0DOHtvoSd4eX22OUEZz1AFZVOQ5/Cknjr6Z44d/ANPD0HonvR7eic6iFUVHgm+1Bnn5zba00+3ZN4s7LR+Jztd81mpvzM56R0bhhXO2x/rSx2VtYxY79ZSQnuO
nfLQm/p/GTPNtzXdJSUjw6+1/+LUZJPqrbR/KPbqTkw+exYxGcmT1Jmvobokr1JFanUYG+9n3cQyeAZRDd+iXO4RPRHfVvDHo0VTGTxd/ksWB5LqMHZdK3ezKzFm0lHDXwujWumTiI0YMy0Jo4HEhRFLTc5dV1A5B48gUYZYVUbVsFKGRe/SDhhO449TKqPvs3ZlUZKZNuo3L1B0S2ryTl4t8Q9TZtyJCGjmrpVH78PLH8bQRGT0FL60pw3pMoqkr6Fb8n4uvSpGs0REf9fHv0kuq6Plx9sx8YfSGhdYuxo9VDqtIvuZdo6sAjnt/c5W5sHdrZSBDQznTUL3hTtedyq5gou5ZTuugftY97Ekidfi9Rf7dGpdvYMu8tquR3/1hR69gfbjyN7ulHX6lCVaA9fHUOL7eigLN4O8Vz/hfbODQsxJGYQcrFvybqyax1bli3uO+5ZXWGkAD8+uqTGNw9qWUz3wQSBLSO9lyXtJSMjADluzZQ/PajWNFD49RVr7+6jko4tAKYO3yQknceAxRsQ0f1+kmechdR9/Evc7hiSwHPzVlf87hLegJ3XTGKSMwg4HOR6HXSXLcYDiuCtfZdFIeDqu3f4EztAti4u/RH7X0yMWcqmhXGWPMeFavmg6qBZeDqNojAuTcTczV9J1dFAWdwK8Vz/lSrR8J/0iSco6ZgqN4mX+NYOvLn212ZV/0ZjdRu9U8+9yfYfcZhceRGnHgLAp555hkWLFgAwPjx47n77rtb5DoyHEiIY7BwoPY5FU+vEbWOp/zoRmKBxgUATXGwpO44/oKSY09Qa4/3frYNRlo/ks68otbx1AtuqRMAAFRG9HoDAICSikiL5FGIjiAW6Enyj26sdSzlvJuI+WsvARz1ZpEy8WbMiiBWuJyUiTc3KgBQVYUNO4trHdtfVElV1KBLqo+AR2u2AADAdrhwpHahatvXaElZJJwyFcXhpvybRSih6uE/hurFM+oCHP4UsKrHnSef85NmCQDgu/oqfSCJp19ac8yRkIz3xMmtEgB0dDF/N1LO+69axzy9huPoO+aoAUC8WbZsGUuWLGHOnDm88847bNiwgUWLFrXItSQIEOIYHLaBtW0pkd21h6AUf/h3XOW5rZ6frJS6Lf5ZqR1zvWpFAa1oC6WfvVrrePF7z+AJH6jz+gSPRnLAXW9aqTKOXMQxV9kuSj58vtaxkoV/w1Wxt9YxT/ggxe//pXq9fX8Kxe8/e8T5N0djWTbD+6bVOpaY4CLlCN/PprJsFbv7SQROm4b3zOuJ+LqSPPZiks+5gVhqPwA0K0zVqnmYoRIUrXoYYemHz+OKFR8t6eOiFWyk7Is3ah6blaVUffUOmqwUdEyu8j0Uf/BcrWOR3d9ibP2i1mThjqKkpIS//vWv3Hbbbfztb3+jpKSkWdLNyMjgnnvuweVy4XQ66du3L/v37z/2iY0gQYAQx6CV7qB08YtA9RCgxDHTAbCjVRTPfgyP3nw/MA3RNc3HXVeMIjvVR3aqj7uuHEWXDhoEuPUySt5/tnrzncQMUqf8HEVzYYZKqFj6Ok5qt/p7nQ5+PGlInXQG9UyhZ5a/tbItRLtiVBRT+uFz2LEIqtdP2tS7UN0+rGgVZR+9gNs+dINqOhNIGH4WKVPvJmXab0gYegamdvzzAQCG9U7lih8NJDHBxeBeKdx9zUkktOC8HFP1YHQ7Gd1Zvb+JO7v3d/OaquccKKaOVVmKq9sgMq57jMDJF2JWlaOYzbNwgNOqonLdx2AZ+E+aRPqMe1AcTqo2fYEWK2uWa3RWHr2E4jmP1cwBSBwzHdVTXWeXffISWsmOtszecSspKeHyyy/npZdeYsWKFbz00ktcccUVzRII9O/fn5EjRwKwe/duFixYwPjx45ucbn1kTkA705HH+zVFey6306gksvJNIttWkjr9Xgx/DuxaTulH/yT1wtsxckZgNyKebmqZ9e++B84mTrprbT8st7tiD2Ufv0DSxFuJ+bJwFm+jYukbJJ
53C1FXWp3zLdtm18EQn3ydR2FpmDNHduWEvmkEmjCZtDXInIDW0Z7rkpaSkRGgYs82Shc+S9J5txDzd8VVsYeyj/5J0vm31hla51AsTFut8//GUBSFiG6iOVQcrVwV1fe3durlKJZBzF09R8ChVzZquNORuPQyrPxN0G0EpsOLM7gVRXMRTezVbNc4mo76+Vaw0fLXUjz/KZLP+TH0GYtWmU/x24/i6XsSntMuO+rk9PY2J+Cvf/0rL730Erp+KMB0Op1cd911/OxnP2tq9gDYtm0bN998M7fffjvTpk1rljR/SIKAdqajfsGbqr2X22lU4tAriHizgerJwu5wARFvVqMCAGj/ZW4p9ZXbZVcR+271EkUBl1lFVD1674bDUf2+m2bHWFNegoDWEY/fq+/L7LaralYBAuo87mza6m+tKErNfAdFoVVXWHP7XARLqvC6tFYPuppKwcYTPkDUm1kzB8AdPoDlChxzdar2FgTcdtttrFixos7xU089lWeffbZJaQN8/fXX3HHHHdx3331Mnjy5yekdSftuOhOindC1BPTDuswtHIS9smV8c4kddqNi2xwzAICOc/MvRGv54Q1/SwcALr0ULIuYu3qJZG+0gKgrFUvp3LcWh7edtlYAYNo2m/eU8sqHWzgQrKJ/92Su/NEAemV2nGGQNkqd383odw1rHc3w4cNZvXo1sdihIasul4vhw4c3Oe38/Hxuu+02Zs6cyZgxY5qc3tHInAAhhBBCHBeXXkbok39S8eFfceuluIKbKfjPfah7VqJ2wEmeTaFwKBJoqdGZ2/eX83+vreZAsHpM/ba9pTz84kr2F7ePXd/jzeWXX04gEMDprN7B2uVyEQgEuPzyy5uc9gsvvEA0GuXRRx9l6tSpTJ06lddee63J6danc4frQgghhGh2ihHBCO7DrAhSOvthjIoSsAz0oj1o3UdCJ+8N+J67PBezvACr24k4w0UY+zah9RmDoTbfKkk2MO+LXXWOm5bNys0HuXhc72ZdjlUcW0pKCrNmzWLWrFmsX7+e4cOHc/nll5OS0vTlaO+//37uv//+ZsjlscXHt1QIIYQQzSbqzSJ12t0E33gQo6x6idGEEefiHHVR3KyZ77FClCx4FqOsgKSzrqHkm4WY5UVkZvXCCPRqtuvopsXB4vqXIM0vqkRVFUxTgoDWlpKS0myTgNuKDAcSQgghxHGzKkuwYtGax3pBLqoRP5v2RVQ/yRf+HNXrp+zT/2CWF5E04TpijdxF/khcmsppw+ofOz9qQKbMjxKNJkGAEEIIIY6Lp2o/wTmPg2XgGzYeRyCNWP42Qp+8gMuOo3HqDg3F4ax5qLi82EozTwyw4axRXetsxNavWxJDejXPbsgiPslwICGEEEIcF92bRuCUi7BiVbhOnIpvVDll7z9J4LQZxFQvtPHoFEUBdyxI1JWKbStoZhWqGSXmar6bZrcZonT+k5ihEgKnXkzluo8p/eA5Mq/sQjjQs9muA5AecPPbH5/CrgMV5BdV0jUzgT45ifjdchsnGk8+PUIIIYQ4LqbixjFsIpptYqheDK+XpGn3EXP4W3Xd/PooCrhKdlD0zp9IueBWlMwBRFbNRi/IJTDx1mYLBGKan6RzfoxZvB/6jiO132hiu75B92Ue++RGSPI5+dGpPeNuHwzRciQIEEIIIcRxM3+wAk5U9bd5DwCAahvE9m3GjkUonvdnXF36E9u3BcXhRI2WQTMFAbYNemp/lOTeWGiYCV1xDMvG+G4jLCHaO5kTIIQQQohOw0RDHXIugVMuAtsitm8LKCqp0+4mmtirWa9l29TaHM2UAEA0gyeffJJJkyYxefJk/vWvf7XYdaQnQAghhBCNpiitt3NuQymWiRU9bFlN28bWw1R3VbTQjl4i7kQiEYqKikhPT8fj8TRLml999RVffvkl8+bNwzAMJk2axPjx4+nTp0+zpH846QkQQgghRKOomLgKNuDSSwFwxYpxF21Coe2WrXRgYKx7n8
q1H6E4nHgHngbYFM/7M+7SuptuCXG8DMPgiSee4JxzzuHKK6/knHPO4YknnsAwmr5b9imnnMJLL72EpmkEg0FM08Tn8zVDruuSngAhhBCihamqgmW1s+byJlIUUPesomjBX3H3GErShOso+eA59AO7SJt6J3r2CW3SQ2Ci4e5/KlUbl5B8wa3Yqb3QUnLQC3IxfWmtnyHR6Tz11FPMnj2baPTQPhmzZ89GURTuvPPOJqfvdDp56qmn+Oc//8n5559PVlZWk9Osj/QECCGEEC0kalis2Rnkn+9vZtnGg1RGmt5SeLycRgXu4CYcmLir8nFX5DZLujHDZoejH9tG/pxg5skUzfoD+oGdOBLTUZKy23SIUNTfndQr/oCeNgAdF47h5+Mbfz26M6ntMiU6hUgkwttvv00kEqlz/K233qpzvLHuuOMOli9fTn5+Pm+88UazpPlD0hMgRCvRrDCqESbmSgXANg08epCIU1qmhOiMFAVWbSnkhXc3APD5mn2cd2oPrpjQr1XzoBTtoGjen0kcM53itR+jJaaReOGviKqNH2JgAx9/k8cbH28Dqne1vXfiT0hd/gxpF/2CsKdllsk8HhFHUs1qRabixnS6j36CEA1QVFSEqtbfhq6qKkVFRXTr1vhdo3fs2EEsFmPw4MF4vV7OO+88tmzZ0uj0jkZ6AoRoBZoVJvbNXMrfexJ3NIiqWIQ2LKHolftxl+9u6+wJIVqAYcH7y3fXOvbRV3uoiLZeb4Btg5U1hISR51G+fDa2HiX5/FubFAAAlFbGeOuT7TWPY4bFp7kaWko2ZV+8VjNHQIjOJj09Hcuqf86Lbdukp6c3Kf28vDzuv/9+YrEYsViMjz/+mJNOOqlJaR6JBAGizRwoDbNicwHrc0sIlnXebeYVBdSCLVR+sxC9MJey959C2bGEwnefwY6FKXl3Jh6zvK2zKYRoZg4FemT6ax1LS/LidrTuT68jHCS8dQWoGrYeIbZ3PardtEDEsqtveA5nqi4cviSiueuxD25DUWQVHtH5eDweZsyYUWc1oCMdP17jx4/nrLPO4uKLL2bGjBmMGjWKyZMnNynNI5HhQKJN5AWrePCfKzDM6h+RsSfkcMMFg9A64Y+GbYOdMYCEEedQufZj9MJcSj/6JwCK5iLlwp8T1RLbxSY7QojmdfH4vmzLK6O4PILXrXHztOG4tFZuf4uGcCQkkX7ZA4S3LEcvzsfVS8dyNP4WINXvYtKYXry3bDcADlXhnNE9Cfiux7NvI3aXE+oECUJ0FnfccQeKovDWW2+hqiqWZTFjxgxuv/32Zkn/9ttvb7a0jkaxO9i3NBgMdboVFg6XkRHo9FuCK4rC25/vZP7S2ku1PXzzGHJSvG2Uq5bntsOUzXsM/eDummNp036Nnjm03a2x3ZLi4TNen+Ysd0ZGoFHndfb6E9rn5yscMwmWR0jyu0n0as3+fT9WmRUFXGaIqOpHs6JgmxiOpi85GDUscg+GKK+M0i3TT06KF9sGVbGx7JZv0GmPf+uWFo9lhuYvd2Pr0B9qiX0CWpMMBxJtwMbtqrurokPtfL0A31MVC2vfevSDtVflKF/yBq5IsI1yJYRoDV6Xg27pCQQ8Rw4ANLMKV6wUAHesBIetH/d1FEVB01Q8RjGKYuOwddyxEmwbomr1sCRDdTdLAADg1lQGdE3k5AEZZCd7a8rWGgGAEO2Bx+OhW7duHTIAAAkCRBuwbRg9KJMEr7Pm2NTxfUlP9GADhm2327Gkh8cp1Vk8dpOeooBWsJHi9/8C2CiaC3evEQDVcwQWPIVbL2uR/Aoh2j/NjqGvmU9o8d/xhvMpnfsY7FrO8bSLBEMx3lm6m6feXMvK9fugaCf2ts8ofecx3LHilsu8EKLDatM5AY899hglJSU8+uijbZkN0UCKAqGoiUNV8DrVI7ZofT/hzFY1Sioi7DpYCUCv7ERSEqpv/DOTPDz0X6eRV1RJgkejT9ck1m4rYv7SXZSFoow/sRtjhm
aT7HPWf5E24NLLMLYvw9lvHIYrCWfRJmw9hpEzHPsoLV+2DUpSDs7MHhjF+aTNuAc7MQfXV29TsfYjEkach+70H/F8IcSRRU0L3bCO2srenq5pA6GogdfpQPvuLt9UXXj6nkTl2o8oePleHAnJaNn9iTYw7dKKCE++sYa8ghAAq7bALyem0WPlKySefhlGA+qXsG6SeyBEeVWMLmkJ5KR5cRxHY4wnfBDTmYCu+XFHi7AVBzFXSp3XKQpUxSws2ybg0Tr98LQjcUcKsB1uYs4kXHopiqkT9WS0dbZEnGmzIGD58uXMmTOHs846q62yII5D1LD4dPU+Zn+2A59H48aLhjGkRxIKtX8kVNtAyf0KRVHICwzjwRe/IRIzAfC6NR748SlkJ1d3myX5nCT1SEZRYPn6Azw359uadN78eBsrNhzg7qtOxFfP0KHW5iRG1fJZhDcvx5u/A9+w8QTnPQm2ReaVvyfs73HU86OuNJIm3YESDRFN7IltQ8qEK/EMHEMsuRcWbV9GITqabfkVPD/nW0pDUSaN68XE0Uf/HrbUNRtaR5WHdd7+dAdL1u2nW6afmy8eTpfvxtHjTUJ1J2AaMdRAOraz4UN28ouqagKA732zz6ZPUgZaWjcM1XnUTsvKqMFf5qxn0+5DPQY/nTKUccOyGrRggacqn+DsR/H0HoF/9EWUvP8UittH4NxbiLmSa15n2zZfbw/y4nubMEyLK84dwNih2Tgd7bPnt6W4I4WUvvsEjuQsks66jrLF/8IMlZB84Z1E3U1bXlKI49Emw4FKS0uZOXMmt9xyS1tcXjTC9n1lvP7xNnTDoiwUY+Zr31BYHq3zOsU2MEvyKVv3KW8t3loTAACEowbvLtlZ5zelImLwnwWb6qS150AFuQfbxwQoHRcJJ05C9SQQ3r6K4Dv/B5ZBwqjz0H0Na72JutKIBHrWtBxqCclEkvtKACBEI5RUxvi/V74mWB7BtGze/WIXm3JLWvyaj//gmpv3lDboXEVRWLr+AF+s3Y9tw96DIf48azURw8Kp6FR9NRuwSZvyc4yiPZg7vmzwcKCAz4nzBysOdc/y4+o2iJIFf8EdO/r7sn1/ea0AAODf72+iJBRr0PUtpw9ndh+qNnxOwYu/Qi/cg7vbEKwfBDL7SyI8+9Y6KsM60ZjJv9/fxO6C9lHHtybL6cPVdRDRXWsp+Ncvieaux91tMKbWeRfGEO1Tm/QE/Pa3v+XOO+8kPz//uM9NS+v8wyaaa9Z6cwquP1DrsWVDKGIwtO8P8xpAP2USFTHYtqKqTjpb9pTi9rpI8h/aubE8r5TKSP1rVofCRvt5PzKGYE+4luCCvwGguDyknjYFZ0rjd8ZsN2VrZVLuttGZ6s/9JYXEjNob9uw5WN0a3lLv8/6SQvQfXDP3YAUTx/Q65rmWZbNma2GtY0VlEQwLktNTiZ11JXYsiju7N+60bLRAGlpiw8ph2zZ3XD6SZ95cSzRmMrxvKmNO7E16Qk/0k8/HqgqSkhXANg1iRXvx9ToBRTs01LJ4bd3fYsO0iOhWA9/LAJ6zr2HfztUAOBKSSR49EWdi7d3QN+XVnftUFoo16e/V1t+pxgkQO30GVZuWgGmgaC5Sx16MMzW7QWd3zDI3XbyWuyW1ehDw5ptvkpOTw5gxY5g9e/Zxn9/Zl7hrr8t/dUmr3aLj0lRSElx18uq0wkSWvIwjuJ8T+0zns/VFtZ4/aVAmekSnMHyohcmpQkrATUlF3Z6FZH/da7QFRVFwFm0i+OE/ao7ZsQhFH/4L7xnXoTsTjzvN9vq3bmlS7uZJqzE6U/2Z6NVI8DqpDB9aRWdAjySAFvt81XvN7kkNup6iwLgTurBtb2nNsV45ibg15bvzA+AIQGEFuHIgSvX/GyAjI8CwHik89rOxRGImaYluHIpCScSB11IoeOOPePuPxoqFieVtJuPqhwm7D/VgZqfWbYH2ujUS3FqDyuaOFlH2/pMAqB4/ZmUpwcWv4Dn1MnTtUO
CZFnCjqkqtz2BGsrfRf6+OWpe4YiWEFv8DTAPV48eKhChY+Hf8439ca/hUfTpqmZuqvS4R2tJaeu5sqwcB77//PoWFhUydOpWysjKqqqp45JFHuO+++1o7K+I49MoKcMelI5j92Q6S/C4uPbs/qX5XnUlxuurFO3gcXhQu9PRmw94Kisqqb+6zUn38aHT3OhvIeJ0Obpw6jD/95+tax08cmEGPjPbRcqmaYSI7vgHTIGHU+fgGjyU4+zEiud/iP7W8UUGAEKLx/B6Ne687mbc+2c7B4iouOr03/bsmtdtr2jacNDCdaGwgi7/eS//uyVx0em+czbQ0sm3bJHqdJHprL6YQ8WWTeuHtFL9bfZOePuM3RDwZtcb6984OMOGkbnzydR4AmkPh1hknkORr4MRnu/pfYMx0vEPOovyTF8G2+OGEgsxkD/ddfzJvfLyNqG5y6YT+9EhvnuVKOxwbEkacS8LoqYRWvI1ZISs4dTShUIgPPviA3NxcevbsycSJE/H7m++epTXmzrbpZmGzZ8/mq6++Oq4IpzO1ZNWnvUf5pl29TGZDf7bCukleYSUo0DU9AZ+z/vHvySkJrNlykC83HKC4PMppQ7MZ2COZhHYwKfh7TrMKpXArVuZADNWLO7QXTINYcu9GrUrS3v/WLUXK3TxpNUZnrD9twLJtNFXBtlvn8/XDax4PRVHQLat6ZaAjnatAJGbi1Bw0ZM7s0crs0ssIffpPorurF17wDjwN79gr0bXanyHDsskvDhMKx8hK8ZEacB3XLubuWDGm04+huHDqZaA4avUCHO77AVVNnZTYkesSV6wUy+HEcCTgNEIolnHMXgDo2GVuivbWE7Bq1SruvPNObNsmEong8XhQFIWZM2dy8sknNzl/paWl3HTTTUyaNInNmzd3np4A0bEd7yIOXqeD/l2O3Uru1FR6Zfrpk90fRVEwTeuY57Q23eFDyRlZ86Mf9Xev/k/nuqcSokNRAIdy/DfjbXVN27bRlCMHAKWhKr7ZeIA5y/I4b1QG/fvm0K9LUs1yosedVzOKVVFM2iX3gh6j4ss5KGaszq+/pip0T/cB37XMH6VsLr0MNVxMNKkXtq3gKd+N5UnCUFwA6M6j947IBkXUuuE/UrAk2qdQKMSdd95JOByuORaJRAC48847WbBgQZN7BJoyd/Z4tOl3cfr06bJHgKjFsux2GQB8r+36zYQQHZFmRVCU6orDYcdQqX8RBIBQJIK1fTkjzbVcOz6b8cYX+Mt2sPmweQTHK+rJJOnie9BT+6NnDSFw4S+IudOOfeIROPUyQp+8QOEbD+E8uBFXcDOFbzxExUfP4TrGKkQdXTvdw1K0sg8++KDOsObv2bbNBx980KT0D58729KkJ0AIIYRoAU6zkuiqOXh6j8TOGoixaTGq24faZyxWPT+/+w5WkB4L4/p2LsMSPsMMV6BljWD2p9sYcPXJuLTGtdtF1YTvWvYVYmrTWihthxtXZk+iu9cRfOfx6jtj28KZ2QtL8zQp7faqrEpna14pBSVhumX46dslEb9Hbp/iVW5ubk3L/w9FIhH27NnTpPRbc+6sfIqFEEKIFuCIllG1cQmV6xbj7T+a8NYVaKk5JHc/gagzuc7rdxVEKFb7MyKQillRjNLrZBbu9rA7P0jUsBodBDQnQ/XgHHkhnuC+6sUSbBt3j6F4Tr6YGJ0vCCiqiPLHl1bVWr2ud5dEfn7pSBK9cgsVj3r27InH46k3EPB4PPTo0bRNC//1r3/V/P/7ubMttXhO29coQgghRCcUTehC+ozfgG0R3roC1Rsg5aJf1hsAAIzolcCw8s8xK8tQ+o7B3r2Kad0LOG1oFj532yySoCqHhj14w/twmpUowV1Edq2rOR7N2wwFO2qGPXUWigJfrN1fZ/nqXfvL2bA72Ea5Em1t4sSJKEcYG6YoChMnTmzlHDWeBAFCCCFEC1CtGNF9W2oeW5FKrNIDR7xZTk5KItJjDKHTbuLVilMxx1zPbi
uH88f0wtEGA9I1swo2LsQdLcRdup3oli9xVuZTsuAvYBn4Bp5C4MTzwDIpWfDsMXcm7mhips2XP9go83tfby7A4ZBbqHjk9/uZOXMmXq8Xj6e698vj8eD1epk5c2azLhPa0nNnpS9LCCGEaAGucBEFS96oHgI0/hqC85+m5MPnSbviQSJacp3Xe10OrB6D2Lm/Am9CId8q2fTvnUROiu+IExFbkqNsLyVfvI624XN8/U+h/Kt3ieRtJvXs64js2YAZCeEdfDqKx4+r+3CirtROtVqaU1XITvNRWBqu81zXDH+b/E1E+3DyySezYMECPvjgA/bs2UOPHj2afZ+A1iBBgBBCCNECIgk5pE27GyWQRsyTQdr036A43cQOW6PfYeuYyqENvvxuJ326JDKgezJep4JlUetmU1EOrVJ2+P8Pp2JQEQPNoZJsBjG0ALrqwWFHccbKiRy2W/DRGCm9STz9MsqXvEH5irk4EtPx9j6B4MK/48rsTuD0Kwkn9MAxrAsxpYEbi3UwF47rzbc7ag/9cagKpwzJ7nR7bjREzLTQDRuf29Hg/YI6K7/fz4wZM9o6G00iQUAnZ9k2UcPG61JbpIXGHS0EG2LeTNyVezHcqRiOON0BUgghDmPbCnrGoOqbYxv0pB6oe1ehFubi7D4CR8V+wgd24+k5HBuFQjuVhStyWfTVHhK8Tq780UBOGpBes0eAwzZQcr/CkdUP2+HG2rMGpdfJwKGgoqwywtzPd/D5uoMk+d1cPSaFUYmb8PY7mci3H1K+7hNSp9+D5fKj6wZRZxJJZhBL0Yi5Un5QAgXU6rkIjsR0ksdOxwpXgEMjYfBYorvW4ByeVb3OfSe9H+6akcDV5w9i8aq95BdV0qdrEhed0Yfu6V6s9ruadbOzsdmSV86L72+ksCTMqUOymTGhH2l+V1tnTTSBDGjrxIoqojw1+1vufPJzXvpgK2VhvVnTd9lRKj55kdL5M9H2f0Pha7+HfWuPOGFGCCHizeGt4y69jLLPXqXs81exdn/FwTf/iMPSKX3/GUIr5zNvyU4WfpmLadmUV8Z47p1v2ZpXVnO+Uy+jfOmblMx9nIqP/kbpp//BESmted6y4eUPt/Hp2oNYNpRURHnmwwNsC/kpnfMoFV++gxWuwK4sZuv+EPe9vIlfP7uUhasLKd+yEie1fyO0kp2Uf/4azpy+JJ16EcEP/wmqSsaFt1Hy2SwULDi4GbWRG5l1BOWVMWZ9uIWe2YlMOb0PSQkuPlm1B1WNr9unvKIwf3rlawqKw9g2fLnhAM+8tRbd7KTRX5yIr09xHDFteOHdjazbVkTMsPj0mzw+XpVHc/bfxRQ3gTOvwQyVUPzuk7i7D0HpMlTGSQohRD0izlRSp/8G2zSI5e/A03UAJV+8jlFWSGzEdD79Jq/OOZ+v3VczATXiSiNt6i8xywqI7d1EyqTbiPq71ry2uCLKN1sK66Sx5qCjugUfSJ1yBwW+fjwyazMHiiOEwjovfXqA3KQT0XHWOs9K6k7i2EtIOu9naKldUDSNkk9epWD249iWiTM1B7IHH3FYjKIoFIdifLu7mG37y4noZqPfu7aS7HeTnuzhy/X5vLtkJ6u3FjJ6SDaGEUfdAMD2faV1hnvlHqigoJ75EqLjkCCgk6qMGGzZU3ulhpWbDmI0YQyjO3wAd9V+VMXGU7YTlxnCDpdhG9WtR2Z5IaoRPUYqQgjRNhx2DHeseny3K1aC0wi1eh7scDm2oeMIpGJWllYfM3UUI4pWz2ozCR5nTcOKUy+lbNmbKA4nqsdP2eev4gofrHmtw6FQX6O816lgm9X1tFFaQFVlGPMHvwX5wao65+kOH+rwyUTd6URT+pNy9vWADbZF4ikXQa9T0R0JRyzr9vxy7vnrMmbOWsMfX1rF/81aQ1lV8/ZItzSPpnLnFSdy0qBMUhM9XHHuAEb1S2/rbLU6l1b/ErWyQlLHJn+9TirB42Bwr9rjO08dmo2zkd22LqKEPn+F4t
mPoWz7lMLX/wcluJPwxs/x9hlJxlX/gxWpwgrmynAgIUS7oyg27FxOyexH8VbmUfHRc8RWv4tG692UeowSiuc/hcMbQEvJwQyHSPvRj/H1Own31g+ZflafH+QZzhzZ5VBLu6KhJaSQOv03pF32AM60rqBUT+1TMck0D3D+qd1rpeFQFU5Ii5AwaCygUP75q3SPbiM10V3rdTk+o973wrQVFAWcxTso+ehfoKjg0ChfMRf7wKYjLncaMSyem/MthnmoxXzHvjK+3dnx1tfPTHRz68XD+OPNY5g4ujteV9vs2dCW+ndPwu2sXe5Th2aRmdT5NohrD6699lomT57M1KlTmTp1KmvXrm2R68jE4E7KoSj85MKhvLV4G+t3FXP6iBzOPrFbo1dviOEmMOEGgq//ntLF/8bb/xSs9H54UnuBbRNxJpF22e+IuRJlOJAQot2xbQWtyyCw51Lwyv0oTjeJ4y4npjhbbVJr1JlC6kV3gW1jpvYh45LuGGVFeE6/Gmw4kwApiT4+WrmHlICb80/tSY+MQy3tuubHNeZKdNWNbYPv3J8RpXpiprNkJ0VvPsz5p/+EbpP68fHaQrJT3JzTT6F/so7d5QpSc/oTWrMIR1Y/bpvh4KUFW6io0rnkrN7075OEcdhwICc6Ok4UBRxWDDtWBbZN2sW/RHH5CM55DCtcgWKZ2ErdW4lQWKeorO6Oqpt2lzB+RBdMs2MNp1EAp0OJ29+3rCQPv/3JKXy0ai+788s5Y2QXThqQWW/PUzwwDIN58+bx2muvUVhYSEZGBldeeSUXXXQRmta0W2vbttm9ezeffPJJk9M6FgkCOrE0v4ubLhpK1LDwOh1NqrwULMzgHqxwdfd5dN8WEiKlRH1dal4TcaU2Oc9CCNFiHC7UhBTMimIUpwfcCa26rKVtg57ar+b/Vd6uKL5u2LaNqoDHhlMGpHHq4AywqXesvaG4a4IWnUMrs1j+bLz9RxNe8k+GaS5Gdh1M4tgZKAkp6Jofy1JQup9EIHsQMS1A7yz47+tOxrJt3JpaewJzNEjVsln4x1yKFS4nsm0F7pMuJuPaPxJ2VQ+FSb/qIWKuFCzqbxX3e52kJXoIltcOBAb1TO5wAYCo/rzmpHi5fuJAbOzqUWHxGQ9hGAZ33HEH69atIxKp/nyHQiGeeOIJPvroI5566qkm3bzv3LkTgJ/85CeUlpZy2WWXcc011zRL3n9IhgN1cgrVYxqb2nqh2Dr5US9lE35N4IancCQkY5cX1jv0xxUrxWFVfzE8RufaQVII0TGpKhi7VmEE95F28V0oDo2qle/gVBo/HOjw6q+hoyDtH9w82baNuyofZftnOBUdbd/XOIM7jrvO1p0BfEPPrE7TiKHoYUhII6oGsCzlu2uptfYo0FQFv1mGasVQlOr6WlFAsXRiB3YQfOshiuf8L0bpQbD0mgAAIOJKP2IAANW/OzdPG47mOPTG9OqSyAl94288fWdiWTa2Fb8BAMC8efNqBQDfi0QirFu3jnfffbdJ6ZeXlzNmzBieffZZXnzxRWbNmsXSpUublOaRSE+AOKaIbvLe8n0sWL4fy4auGSF+ccndeJPq7mLpMiqo+Ph53F364+kziuA7j5N60V1Ek/ocIXUhhGh5lgXOvmNI7z6UaEIXUi7+NbbDRcx2HvvkeiiKhbNwMyRmYzvcKEU7MDMHYiruY598GFUB48A2Sj/+F55da4jsXIN/9GScyd1rDc85Gt2wKC0OEt5bSGJWPzTVJrZ/K+Elr+AZd/URJ++69FLKFz6Nt+/JuLoMIPjun0mbfjfRQA+Sz7qW4LtPApB0+hWEncnHVS6AfjkB/njLWPIKK/G4HPTI8uN1xt94etG5vPbaa3UCgO9FIhFeffVVpk2b1uj0R40axahRo2oeX3LJJXz22WeMGzeu0WkeiQQB4pg27y3jvWW7ax7vKwzx3LtbuPuqUWg/6EsyNB++weMo/fDvVKyYiyunP7b3hxvQCCFE69M1f8
3GVlFvdpPScsdKCb73DI6UbLTkbMJbviTzqgcJJ3SreY2qKsfcVdayq4MTz641RHZ8g5aShWfkZKLHCADceimG5idmK/zn/Q3M/WIn2WleRvW7nKljuuJY+iKevidhaN4jznkwnX58g8ZR9unL1Wn2HIblTsQdyqNowV9xdR2IWV5IcN5MUqbfS7QRQz7TAm7SAscXGDUn07YpKI0QjhrkpPkkCBFNVlhYdxne43n+WFatWoWu64wZMwao7i1sqbkBEgSIo3I4FL7efLDO8e15pZRWxkj/QeVu4cCT1q16BQnbwpnTD1PztlZ2hRCiVUScqaRN+zWFs/6AfmAnKRfcSiShes3+yqjBup1BNueWMrJ/OkN6puD+YYvJdxRsyFtLZMdqtJRsjJID6FuX4BhwFqZSfyDg0kso/+CvJJ1yIUaogtWbHdx2bgZ9yOOVnRo7i7IYcub1GJoX2z7yqF9L0XBl9jqUbk5/TM2HrWgknXE5jl4noxhhzIM7MJyBI6ZzLO5oISE1kZIqmxS/k0SjmIgno8WHlFi2zYcr83hz8Tagevffu64cRYpPdrkVjZeRkUEodOTlhTMyMpqUfkVFBU899RSzZs1C13XmzJnDH/7whyaleSQSBIijsiybLhn+Ose9bg1PPS0qHquC4vlP4crui3fgaZR9+jIZfUdhpgxojewKIUSrcJqVVH77SXWDh6oSWr2QxKx+xNypvP9lLguW5wLwxZp9/PjCIZw5PLv+m15FQU1IxT96Mp6Rk6sDgKQsjHpW3Pme5fCReMpkSj9/Db04n9tHTUfbugS7vIArzr2LnZUxdEf6MVc9cutlBOc/ibvnMNzdh1K+5HUye59AONAbZcDZmLYCziSU3lnYduOWgXFHCtiZe5CnPtlGYWmE7DQft4910KtbiFhy70YFAlHDYm9hiC83F+JxqfTJSSLRW/f9KiyL1gQAAPsKK9mws5jTh1X3AlVGTQpKwyT6XKQFJDAQDXPllVfyxBNP1DskyOPxcNVVVzUp/QkTJrB27VouvvhiLMviqquuqjU8qDlJECCOyrbhpIEZvLd0F1URo+b41RMHEfA668wJiKgBUi+6C8vpw3AlkpnTDz2had3uQgjR1g4f2qMoCoqpY1aWkDrlF6gJyZR99A8UUycUNfhgxZ5a576/bDdjhmbhVBXcoTxMdxK6FsBTlY+leYil9EFJ6sHOEp1i14lkeRPIVJQjD+NxuHE4ffiHnkHp1wtxrJ6NDURPuJiXV0S4alLdhpv6RLQk0qbdjeXyYzr9ZHYfQsxfveLb4Tf9jQ0AAGK6wYtfhiksrb5hOhCs4s0NifwivRJsi+NdnySim7yyaBtL1+2vOZbid3PPdSeT8YO9DyKxujsUl1fpKAocLIvy2MurKKmI4lAV/t8lIxjZNy1ulwAVDXfRRRfx0Ucf1Zkc7PF4OOGEE5gyZUqTr/GLX/yCX/ziF01O51gkCBDHlJno4fc/PZXNuSWUV8YY0COF3ln+I1aW33eJA4QDPVsrm0II0SI0Kwy5q9G6DEWxYliFuzC7jsJ/zi3oDh82CkkX30NU8eEyLTKSPRwsDtec3y3Tj/ZdAFD0+oN4+40m4aQLCM75E1pSJoFJd7B8V5Rn31oHVAcc910/mj5Z1TfzDsXCPGxYjydWQtG8P5M8/nJUhxPL4URLzUFNCHDNBUPISnbR0M3hI4fNYWiJ+rrKk82ewq21ju0uiBJJG4nruwDAsuFAaZhQOEaK30NGkvuIAdDO/IpaAQBASSjK+1/mcv3EAbXOy0710rdrEjv2lQHV7+vwPmkoisJHK/dQUlG9w71p2Tw/dz2P3jqWgLv1bot0y8aybLwuR/WqO1T3crgcatyuv98RaJrGU089xbvvvsurr75as0/A1VdfzYUXXtjia/s3p46TU9GqFGw8lfuIeTOwHG66qUXkDE7G0Jq+rrZu2YTCOn6v86g7GGt2FPO7TXGcxGqtiX24759TFHCYUQy17S
ahCSE6Hy1STNm3n6DlbSJ2YCfuLv3xZfbDtgxc4RLsQAZK+UEcCZlEdRc3XTyc4vIIBSVhqiIG407IARtMdyLegWPxdu1DxeevYOtRks68gvywxp6DRST5XZSFYliWzZuLt/KbK0fi04NUbVqKa+i5mIoTU3VjaD4yL7+f4Pt/waoqJ23SLcTytxHb9zXpvfsR3bQEV9ehFDiy8HucaG14R5nggnNGZbNw5aEb93NHppOgmeholIV1XvlwK6s2Vc89UxU4/7ReTBrTE6dDJRTRCXiry6AoClv21L/s9NK1+7n0rL74DtvN162p/L8ZJ7B5TwnllTqDeqXQPd2HolT3SBwuHDXQdQta4ecjopus3h5k7uc7iMRMJo/tRf/uKby/fDebdgXp1SWRaeP70TsrAYX6/3Y2UFal43Y68LnUuF6ysy1omsa0adOatApQeyBBgKhDUcBZtJmC2X8iccwMXN0HUzT7MXyDxuI69XIMtfHbhBeUR/nL2+vYc7CCPl0SuWXa8DqTiwGcRjmRL9/EN+wscPkILXuDhDOuIeZOq/U6VzRI5Rf/wT/uMuxIJeGNn+M55VL0JkxiE0KIwxneDLy9hoOp4xo8luiBnSixEKGv38OT1Yvwnk0YQyay4UAlX23azapNB0kOuJk8rjfrthVy3sndAdC1RBIGnEThOzPxDzsTZeQUXv+6ivmrvsSpqUwa15uvNhwgryBETDfxlO2kfOV8fP1OxNr5JVYshmvAWMJfzUYZdhbJZ12FYpkULXgOK1xBYNR5RHaspnzFO6heP3tG/D/mr6vipouHku5vm8YRZ8EmzvFsJG3cAL7ZZzG6u4MTjJWYG3fCkInMWnQoAIDqXoH3l+8m0e9izdZCNueW0K9bEjdNHUZ6wE3yEcqR5HehOeoOLUryOTl1UOahAzaYps3E03qyfmew5vDpI7qQktDy8wIUBT76Oo/Zn+6oOWZa8Mi/V2J8t4nahp3FbNz1FQ/8+BR6ZdYd2hWKGLz16Q4+X7OPxAQXt04fzsBuyTKUSRw32SxM1GHboPgz0FK7UL7sTYpefxDb0PH0PRnT0fgfEtOGlxZsYs/BCgB27i/njcXbqW/vSMUyMStLCM5+jOI3/wej9CCKXfeVim1ilORT/Mb/UDznfzErS1HsQ+NAnXoFnordKIpdvRlOxR5cMdnATAjRcFq0FKMiSNmKd6lYvYhI7npsI4Z/7BWgOYkV7WVDgcKBkkjNDW1pRZR3Pt1O7y5JbNtXCoC7Kh+9MBdnSjahbz9l074wc78qxLRsIjGT2Z9s55Sh1XOoZpzoh3AFkbzNlC6dTckn/8GOVqJYBmZlKcVv/ZHiD/5B+ablpP3oJ6CoVKz+kPIV74CiEhp5Ff9ZWsT2vFLe+WznseYItxg7uRuByr1M7Bnm/mtHcd4QD97CjTj7nMSB0hgrNtZdfQ5g7uc76ZmTCMD2vDLmfLYDGxjcK6XWBmTfu/Ts/rjqOX4kA7slce91JzP1zD7cMm04l07o1+AN35qiOBRj3hc7ax6nBNwUlFTVBADfs21Y+GUu9XUErN1RxOdr9gFQXhlj5qw1lFbGWjTfonOSIEDUK+ZJJ/nMK2see/udBFkDmzRBLKqbbN9bWuvYltxidKPuzX3MlULiuMuwjRhWtIqkM64g5q277FbMm0nSmVdhRauwjRiJYy8l5koGQNPLCS99mcJZD+I8uBFXyQ6K3nyIio+ek0BACNFgtuLA23MYitODWVmKf9h4SMzELMmjYtUHJJ44kdKYk9355bXOq4wYOJ0OCkq+mx+guaja9S3J42YAUKrXbXlOS1C5f2ICQ9wHqdi4lMDIc6sbNzQX3hPOJepOJ3HspdimjlkRJGHAqST0PxlP7xE1aajdhvLJ/gBFZdU3hptzS9DNtgkDYs4kEs7+L6weo4nqoKcOJHnafUQ9mVRGjrxbczhq4DpsBbpNuSXohkV2spd7rxtN9+/mS/g8Gj++cDAj+qYdKal6aa
pC/y6JXDyuF6cMzCDgaZ2BEZVhHeOwv0VqooeCkqp6X5t7oAKjnskdO/fX/pxFdZPyKgkCxPGTIEDUy1W6k+D8p0BRUX2JhLetRF//IQ6r8RWNz+XgjFFdax0bP6obbmfdj6EnVkTwnf9DS8nG1XUQxe89g7tyf53XuSv3U/zeM7i6DUZLyaZ47v/hiVV38doOF1pyNtgWwbn/R9Fbj2AbMbSUbGy1cbuECiHij+30EPr2M8DGN3AMFas/xC7ajVG4h5Txl1P6xRt0DVgM7V37RjQnLYGSigg9s6uHJ0ZdaSSffhlFC/8OikpXv1Hr9S5Npa+5k275iwkte4PA4NMo/2o+3r4ngqJStujveGMFFM89VDdWrllEybK3iexcXZOOtfdbJiZs5OS+1dc968SuuLXqBpxQ1OCb7UE+Xr2P3IJQq/QQ6FoAi+obetu2iTqqb+BTAu56W/UBMpK9lH43cRfgrFFda34remf5+e/rRvP0L8/isZ+N48zhOUfch+FYWnsETXLATYL30O/P/qJKemYn1vvakwZm1jufY+SA2g1i6Uke0pMaP0xXxC+ZEyDqp7lRXR6SJ9+BmpxNybtPoPoC2GrTdlucMrY3SQkuVm0u4LSh2YwbnoOCjREqBRw4rDAoTnRnEklnXoEjqx+26sTK+xbDnVwnPcOdTPKEa1C7jUAxY5gF29Gd1RWqqXpwjrwQX2kBVVu/BBtcXfrjO+1yoopsYCaEaJiYmkDCSZPwn+bFSuyCu8dQ7OTuqOkDQA+Rcs4NJHXLwllkc92kwXy+eh/ZaQl0y/QTqorRO/vQHCXL6UV1eUmadBspKT34tXqAd9dV4fc5mTo6g16JVZjp52CGSnAkppNy9rXECveQPvlWLMVBzJVM4hmX48jsh+1woZbuoXzFPFBUUqf8AhQonvdn3EVbOaXvCfTvk8PYYTnYdvWQzFc+3MKKDYeG4Pz3DaPpm902c6jS/C4uHt+Ptw5by/97V503kH2FIXpkBxg7PIexw2rvs+ByKHTNTqSwsKJDTYoNeDR+OmUoT72xBqju8dAcKhnJnpplVAH8XidnjupS7ypJg7olcduME1iwfDc9sgNccFov2QlZNIpid7CZJMFg6JjbsHdkGRkBCgsr2jobAHiMUqLOJGxbwW2U1mrNaQpFUbCpHupo2zaugvVUfbuYwFnXUbXmQxyBNOh/JjhcNX/rw9forpveodacH/7fVbKD4OzHsI3vejAUlbSpv0TPGtKkoU3NoT39rVuTlLt50mqMzl5/Qst9vg6vW35YH9V6zqEQipgUlYZxaiqZyR4cPxhs7jHKiDoTsW0Fl16K4UrEVtTvbvhsXGW7wZdCVEvGbVdB8R70tP7YigPbrn19VVUIKBVEivYTS+kPCriKt4M/nagrtaaeBSgoi3DPX5fVyssZI7pw44WDMdtouFDUsPh6ayFvfLyN8soYXdITuPJHAxjUIxmnQ8WyqVWGw3XUusSyYW9hJWu2F1IZ1hk1IIPMVB9b95SwfW8pPXISGdY7jTR/3eFitcqsKNWfvU7+nYbm/1s3tg79oX379tUsEdq1a9djn9BAixcv5plnniEcDjNu3Djuv//+Zkv7cNIT0Ekd6cb4eES05JpWiKiW3FxZq6nMbarzhtNDJHc9kZfuwdYjpJz/MyzFUetH9mg3LoeX7fD/u41yit97GtuI4Rs2HtXjJ7TqPYrfe5qMax4m7Dy+MaRCiPh1eN3yw/qo1nOmjc+p0iMj4YhpRbQkHJaOpTqJOZNRbQNssFEBhWhi75rXRhUfpA367kJ1r29ZNs70bMrM6iE2ChBN6V+TscNz6tRUNIdSa0x6SqKnTVvS3ZrKuKFZnNg/nYhu4XM7cH23ys/35exst7iqAj0zE+iV5UdRDpVzzOAsxg3NbnigbtsdqhekM9m4cSOPPPIIu3btwul0ous6vXv35r777mPIkCFNSnvv3r387ne/48033yQtLY3rr7+ezz77jPHjxz
dT7g+ROQGdkNMox7H9M1xmJe7KfWj7vkatdw2etmfbYCd3x9trOLYewRFIR+s6GKsZPpoRRyIpF92J/8Tz8Zx6GdrIKSSOmUbq1LuIuCQAEEIcmaYcWmXMgXGUVzYibSuCuW4+rpIdaIqJsn0JWv5alEbU05oVpfzrD3BHg7hiJTh2fI5mRep9bUqCi2vPH1zzONnvYtzwnDbvHbJt8DgdJPucNQFAPLBtu85739Z/C3FsGzdu5KabbmLz5s1Eo1FCoRDRaJTNmzdz0003sXHjxialv2jRIiZNmkR2djZOp5OZM2cyYsSIY5/YCNIT0Ak5qoopXPxv3L3XoOfvQPUlkjy1H1EtqdmuoSjgKs8FpxfDnYJWshMzqRuG48itX/WnY2NtX0rV9q8JnDqV0NcLqFz2Op4zbkCn6ZN3o4GeOE/uUpOWY9gkdNUprSdCiCNyxUqIrl2AZ8QFEC4ltm8TjsHnYCrNs9a+Fi6iZOV8Ql8vIGH4BEKrP8CRlEnKjD7HXU9rsTKCn72Gw58KDgdGyQHSuwzE8NQ/UXTcsCz6d0+iokonJ82HvxV3yBWiM3jkkUeIROoPtCORCH/84x95+eWXG51+bm4uTqeTW265hfz8fM466yx+8YtfNDq9o5FvfycUTepJ8jk3UPrRPwFIv/wBws0YAAC4zBClC/4Kto1vyDiKls8mfcZvMNIGH/vkw9i2gtZjBFkz7iaaPoi0nsPB6SXaDAHA94zD0jIVZ+frWxZCNCtVr6Ry4xIiu9dhhkpxZvYk0H8sprN5goBIQjfSpt5FcM6fCK3+ANUbIHXqL9FVDw5br66nGpqWJ5OsGb8i/5XfA5B+yb1EPFlHfL2qKGQne8lOlsURhDhe+/btY9euXUd9zc6dO9m3b1+j5wiYpsmqVat4+eWX8fl8/OxnP2POnDlMnz69UekdTfz0u8URV2U+5UvfQvUlVm8as2IuTqN5J09FVT8pU36BURGkfPlsAqdchJHat3FpudJIGHQqpq0STe5HNKH5JtcIIcTxivq7k3LuTzBKDmDrEZLP/QkxZ3Kzpe+wDcyygprHth6FSAX2lk+xt31+XMOPXNEgwU/+g+L0oLi8lH32Kq5oUbPlVQhxSGFhIU7n0YN0p9NJYWFho6+Rnp7OmDFjSE1NxePxcO6557Ju3bpGp3c0EgR0RqqGK6cvaZf9jtSL7kRNSMRWmrfTx2HrxPauB8sARSW89Su0aGmzXkMIIY6HSy8jWpALgCd8AKdZ2ah03BV7KPnwHzjTu6G4fZQs/Bveqrzmy2f4AKWL/40jMZ3k0y/BtkyK33saFZPKbz9B00MNT0zVcCZlknbJvaRd+t9oKZmgSie/EC0hIyMDXT/yJncAuq6TkVF3c9OGmjBhAkuWLKG8vBzTNPniiy8YOnRoo9M7GqkpOqGoJxPfuT8jggslOxUtcwBGM41l/Z5qRokc2EXi6Zfj6TOK4nkzIVoJcb5fSVXMZF+wEqdDpUuqD1cjN7ARQhwfDYPI13Mp3f0tKefdSHD+0/hPvgB16CSs41wO2HYH8J9wForqwJXVi9iBncT2bkIblI1hN/1nM+rLIXXy7ahp3VBMnRRVA1Wl/JsPSZ1+L5Hj6HWIOZNIv+BmikPV4xy9428k1kz1vaKAw4phKC4cdgxLcWLTtksrC9GWunbtSu/evdm8efMRX9OnT58mLRc6YsQIbrzxRq666ip0XWfcuHHMmDGj0ekdjewT0M40Zi1cRQGnWUlMTcBhhkFRMdXmvemvj9MIYTlcmIoLj1lBVAs0esJtR13v+XDlYZ0nZq1mz8HqVrwzRnbh6nMHHDEQ6Axlbgwpd/Ok1Ridvf50hw9SPPuPWJWlaGldSb7wLqLuxq0E5lEilC94htj+bbh7DcN/1k+Jqr5my6uiKLijhUQ3f0FozUfYho5tGqRNuQM1pSsRd3VLojtSgKX50DX/EdNqie+UooAzuJXozm/wnjSF6IZP0NK6YeSMaDeBQD
zWJa1RZpvq3zPNoeB3O+vdo6G1tad9Ar5fHai+ycEej4fnn3++ycuEthZppuzgFAWcBRuoWPg0XqMYY+18rI0f4bBi9b7eaVTgCe1FVRXcVfm4GzB2tLA8yqdr9/PRN/vYur+cikj1eFVd82Mq1ZuZRByNDwA6i+37ymoCAIAv1uxnf3FVG+ZIiDhj6tX/ADsWrR6u+J3CiihfrD/A0g0HCVZEa53miR7EV7ETlxHCW7kXT+UeduZX8IX/fDYM+3+Uebtj7V+PtyoPVbFwR4twV+U3KouH9g2zUUKFlH81H9WXRMr4K1Gcbko++ifG7m9wl+/GEzlIyZzHiCx/Dadx7CFCGrWHKWh2/b8DDWLb2JEQoW8+oHjW7yhf9hZWVRkK5rHPFR1WWZXOvxZs5q6nvuDuZ5fx6dr96Gb7XGK8rQwZMoTnn3+ewYMH43a78fv9uN1uBg8e3KECAJDhQB2ebYPiTkAv2E3hS/dgGzFSzr8ZS627s6+qgrVzBcVfzCL1vJso/uwV3N0G4T3rxiMuxxkMxfjDP1dQ9d2Nf5LfxdQz+zJuWDZOtX20BrUXMb1uRakbUnkK0Ro0DCKbl6AF0kmadjfF859Cz12DOvg8iip0HnxhBZXf12MJLn7/01NJ8jlx22HsolwOLnweX7+TUYA9KSfz0MKKml6TAd268es+AWLzn8Z/8mTK1iwCbJKm3Ve9mdcxuCv3galjJPVAK9iEndSFmCsFI60Paef9BKOskPJvPiRz+q8wyosIfvB3FFVDcbqxqsowSgtQzNhRf7GdRojYN3PxDDmDWKAnrrJdRLZ+iWvkFHTt+JZuBrBRULoMw9NrOJHd69BSc9D6jCYqtw2dlg289ekOlq7bD0A4avDv9zeRmuhheK+Uts1cOzNkyBBefvnlFtsxuLVIT0AnYAWy8fYehW3EUH2JaF0GY1E3CLAscPQ5FVdOf4oX/AVFUfGPveyo6/HvPlBeEwAAlIVilIWiFJSEW6QsHVmfrol4XIfe964ZfrqmH/+PrxDi+BlouEZeSOaMXxIJ9CRl+r2oA87EshV2H6ioCQAAyipj5BZUDy2IKl6U9J4kj7mYqi1fEs79lg0VybWGTW3NK2e/ko2r+xBKP/onRskBks+/tUEBgJMYlStmE3z7Ucy171I0509Y+9ajqqAU76Xk09co/3ohRkk+sbIilJwheHuNwNYjWFVlaKldSDz/NmLu1CNew7Zt7D1fE1qziOK3H0XLW0Vw9qOEvlmIvX8dinL8DTaqYmNu+4LI7nV4B56GUXyAyOr30Jp54zTRfpSFdZZ9u7/O8S/W7sMRR5u4HY+uXbsycuTIDhkAgPQEdHiqqmDvXEXVluUkjplOxdfvU/HZS/jOuRmduvMClFgFRnH1l9yMVGCFilDS0o44lMftrK9HQcFVz/F4l5Xk4fc/PZUd+8twaQ76dU3C55L3SYjWomt+XGkBKKwg6k6vOe6p53vo1qqPKVjYoWL0YHW9aBs6fmfdHjyPBnr+9urXmAZm8T7UhGws++g3Rzou/GdeS+z131Hx5Rx8A09D6XUKlgUYUVxdB5B41nWE136EFQ6h6VXECnbXnG9WFEO4FI4yWVhRFJQeo/D2W094+0qK33saAO+gsShdhjVqTLdlK3i6DiFl4k0ovU7B2/8UVH+a9AR0Ypqq4HY6iMRqD/lK9rvbxbwA0fwktOvgLMtG7TqM9Ol3o55wIWnT78V/2oz6AwDFxizMRfX6ybz+MTw9TyC6ex0OM1pPytV6ZwcY1ufQxLoT+qfTNd1PemLLTzzuaGwbMpM8jBmcxUn900nyNd+GZ0KIxuuVFeCEfoeCglOGZNEjq3qirWaGsStLiORtJfW8n5I8ZhqDXflkpx7aTOuSCf3IVkswq8rIuPohEkadR3jrVziMY/eIqpgY+zdjVpahaC7CO9egVlTPJ9Azh5Aw4UbM0gN4+5+Mu+/JlMz7P8yKIK6cfngHnIKtRwjOfhRv7Ojzt3
QtkYQTJ9Y65h95HrrW+AmQEV8OZq8x6LaG3mUU0cSejU5LtH8Bj5NLz+lf65iqKpwxokunXlAgnsnqQO1MQ2bAu+woMcWNioVixTDVhq/LqWLg1ENEnck49TJshwtDPfrOkVHDIq+okqhukpbkISPgobmnA8gqD/FDyt08aTVGZ68/4cjvc9SwyC+uQlEgJ9WH67DhDU69As2OEHWl4jJDKLZF0EokP1iF162RmezB6QC3XkZES0Yzq1AsHd157J3YHVYUa9PHWNFKfCecS9miv+M/eTJ65lBsGzyVeyl87Q8oqkratLuxy/Kp+PZTAqMmoiVnEN71LVpKNlpKDoY7pd5VgjIyApTv+Jbg7EexYxEUhxPb1FHdPlJn3EPU36NJ72l7FY91SUuXOWZYbN5bymer95Ga6OHMkV3onpFQPWGgDbWn1YE6E+nX62Dclfuo+OJVEs/5KcaBbRhFe3GMmIx5jBv571loRL/rVm7IDxhAghplYPRb1C5DUKwQVt5arG4nHtfW9kII0ZbcmkqvzPqX2dSdAXSqbwoijup60Qf0zTl0o2DbENGSATAcPuqZdlUvU3XjGHIuqmUSdnjxT7wN3ZFQMwTTdKfiG3o6Vd9+StGbD6H6EvEPOwPV6eLgqw+SeNpUVF8ShbMexDdkHK5Tr8D4QcOP/d1KPrYewztoLIFxl1Hx2SuEd36DHalECRD3q7eJhnFpKif0TmVUv3Rs265uNJDPTqclQUAHpBfkEpz1W6xwiIQTz0dr4S+oFiml6OMXcWb3xY6EMEMlpF3eB9Pd+B3xhBCiPdI0FdO0mzQGWlHAYVShRUuI+buBCa5IkEhCV2Jq7VZVXUvAf+p0wlu/wo5WYVWV40jugp3SHWdGN8q/fAd4B1QNz4DTiNXT86soCnrWUDKu+B0kpBBWE/GNv47EMRdT5e0qN3HiuJmyLGhckDkBHYzuzyEw+kKscAgUlYSRE6my3S1ax0cTckidcgexvE3oRXtJu/hXNRvZtAeHr3zRiEUwhBCCqGGxcmsRj/zna+Yt201p5bHX2FfV6ppXUaonGAMo2LiLt6F/8w5Fs/6AK3819pbFFLzyAM6D6+vUUU4jRGjZG9jRQ3uKlC7+N3a4nMSxl9Yc8/Y7CTu97xHzYtsKtstPxeJ/4QkfgPIDhL75EKcRX8NlhBANJz0BHYyzeDtFX8zCN3gs5Y5U3ly6n8XrN9KvWzKXTuhHRgtM2HVFiihb8jqKywOmSfnyt/BPuJGYK7nZr3W8HLaBum8NanpvYu7U6h9ZfzpRX05bZ00I0YGs3Rnkb7O/BWDz7hK27CnhzstG4DhCy4IrVoy+ZQmuIRNQQoWYJflYvU7DaYQo/XwW/sGnYlsWhXP/DICWnI2SlF1nWI4jWkp4ywoUp5u0ab+matNSwpuWopQfIPjhP0HVcPgSCW9dgTO9G44h5x15R3jLxCjOJ/jGg9iGjqv7ENyWbO4lhKifBAEdjO3PIuX8m1G6n8iStQXMXrgNgFWbDnKgqJL7bzi51oS3ZrkmCo6EZJLOuwX0CJVrPsRuJ03uroq9FLz3DM6MnvhPuoDgB8+jJWWSfMn9RNX6x/8KIcThHA6VT1fl1Tq2cVcxJaEY6YH6b7iVcCnlK+bh2v0tetEenNl9CXQbTlRLIuX8m6lYMgtf35FUbVsFQNKZVxLzZtQZmhP1dyN12t0oDo1Ycm88o3Nwdx2ImtoF1ZdI8nn/hRrIpPS9mTgSM7DUI/9sx3yZJJ15JcF5MwFIHHsJkXbQWCOEaJ9kOFAHE3MmYfYci6V5WbahsNZzeYUhyir1I5zZhGu60/Cd9/+IJnQlltIX7/gfN3hScUuLBboQOG06emEuJQv/huJ0k3z+zyQAEEI0mG3bDOhZe0fUxAQXPvdRbriT+5A84Rpi+duw9RgpP7qRmJaIZlYR2b4SLSmdqm2rUH2JoGoUv/8srpLt9Vwb9LT+RJN6V/9fS8DsOYZwQg9SL/sdsbRBRF
ypJE35FVaP0fVuBPk9d+W+6ut0G4yWlEXx3CfwxIKNf2OEEJ2a9AR0QN/P2D9pUCbb9pbWHM9M9RJoobXpv993wLZBVxq+JGlLsxQXzvRuNY8d3kTwyNJfQoiGsyyb8SO7sGVPCVtyS0hMcHHHpSOPutmfqyyX4OevVS/dWVZI4eL/UDrkEtLSkvEYOs6UHLTkLJJOuRDVl0Tww3+AVn+vwg+HCH0/KTniSKw5FtWO3fBiuFNInnANarcRKGYUq2Anuqt9NNg0lGbHMBQXAE70o+5oL4RoGgkCOijbhjFDswmWRfjk6730ykniJxcOwaM1vXNHUZTqiW6WjqVo2DZoGBjt8OPirsil8L1nUFxefP1PoXLDZ5QteJqkC39J1CG9AUKIhklJcPHLy0dSGorh82gkuB1HX1bTnUDC4HEc7HEOVvF+PFX5vP5FHv176Uw/7XwsRSO554nYDjc6DtKufKjmpl5TTAy78buJK1gotoWlVNfJll694aPh8KH2HY9h2eAEpWdm05cGVaAqaqIoCqpi4XRoqN9NhFawj9ozcbzcFXsIr1+Me/QMVCNM5YrZ+MZcRsyVWu/rY7rJgdIwxeVRfB6NLmkJuBztY6iqEB1Bm9zVPfPMMyxYsACA8ePHc/fdd7dFNpqdy6pCd/iwbXDZYWJKw9bu/56Nze59JURjJjaQ7AHHYS1HLiLE8KBgo1lhAh4fV0/ozsWn98TldKKYEXYcqMC2LNxuJ163Rk4ACkKgAMl+Fy67Og3NDqPaFjE1oSZdgMLyKCs2HqAqojN2QCLpAQdVlovU8F7cKVlE3Wn1Z76NmN50Ek44G++QMzADOTiSMnBm9iamtf3mJkKI5uO0wujf7YdyeJ11JFHToiwUIzHB1aDGEc0Ko6ou0hPdeO0KorYXh22i2BaqrRM0PER1yPFb6IqHIpKJDLmUdz7dyebcCJmBDLYfCLHjQIRzT+6G1+lAUTRcoX2YB3ZjZ/XGpwRRNCehdZ/iPnEKscOGVSqKQnlEJxo1SQm4jjghWcFCy1+LFa1C7XUaTr2Mog/m4xlxPrbqBJSaRRsaEgAYlk3Jd4GP/weBT2mVzser9rLoqz04NZXTR3Yl2e/i5EGZZJetxzZj0OOU4woEwrpJRZVe/Xt0+IZtdozwtx9Tuf4zzMoy9OA+zLICvP1ORumWVmvJ1uJQjK15pRSXVwc/X67PJ68gRJ+uidw6/QRSE1wNzo8Q8azVg4Bly5axZMkS5syZg6Io3HjjjSxatIgf/ehHrZ2VZuWKlRD69EX8o6egOJxULH0d/9k/JdbAm+aIbjJ3yS4+/GoPDlXl/DE9ORis5LJx2aRnpOOqOkD5xy+QdM5PMcsOEtm2Et9p04lu+AR/IB16jOKVxblEcFEZ1vl6cwFup4PLzh3AohW5FJaGuWRCH840lpE6fBxmcA9V21eTOGY6JYtfJPHsn1CoZvDQi19RUVU9r+DDrxQuO6c/sxatZWjPRG6eBP7mX3yoSXTNj+vUy4l+12XsGHoeuurEtqU1SIjOwqmXEf7iP/iGT0D1JVH2yb9JPPdGop7Mel9fWB7lyTfWsr8oRHqSh59cNJRQlU7/rkkkfTdk0m2UVW/apWp49GIq136EK6MHnvRulC7+N0ljp2NWlmGWB6ks3ENl+kjKCeDf/xHbB1zDX+ZsJBw1GNY3jdOG5fDhij0AdM1IwKWpaOgYa+ZT8s0HJI+bTnTLcipWLyL5zMsJ71iF4vLiOHEapq2iKPDt7mL+OvtbwlGDEwdlcN3EQSR66w6F8YQPUjD/abAtkifECO38mkjuemIFuSQMPJVw3lZ8Z92I3oCd5EsqdZ6ft54tuSUEfE5uv3Qk/bsEsO3qJVP/Onsd2/LKgOqdZD/4MpfThuVQVhFhctVirH0byLiqK5EG7ki8v7iKJ2atobg8QrcMP7dfOqJmRTtdceE+5RLMyjIiO1cDkHj65VhdRtQKAAorovzPP78iFD40/+3Sc/rz3t
Jd7NxXzvNz1/OrK0ahNfe29kJ0Qq0+MTgjI4N77rkHl8uF0+mkb9++7N+/v7Wz0QJUwCY4+1GCbz2MbejHtWj9zvwKPlixB9sGw7SYv2QX3bISeXNpfnWDtqJiVVVQ9PrvKX73SVDV6ms6nJR+/C/2rF/HR2sKSU308PXmAgCiuskrCzdx2vAcTMvm9Y93kO8fTOGbDxNc9C9sI0po7SLMiiAoCvsKQzUBAFSPk62KGABsyC1n6dbKdrkOv3HYmFFTcUkAIESno4CqEpz7fxS98SB2LAxK/T9fumHy5ifb2V8UAqCoLMKsD7fy1caDPPqfrwlFDVyxYsreexJ1z1e4ircTfON/8HbtR8nCvxHevhIrFqbwnZkEP/gHlm1hWgq+5X8nZ9UzlGeN4um31hOOVteN63cEcTkdeFwOkvwufnxON5yKhYETT7/RKJqL0iVvUbF6EVpSJhg6akIq7qETMO3qMhRXxnj6zbU1aX6zuZC12+uf0Bv1ZpB8zg0AlH7ybyK561G9fhL6j6Z02Rz8J07CcBw7AFAUhSXr9rMltwSAiiqdp99cQyhavaRoXlFlTQBwuBUb8vF63ejp/Uk84wr0hOxjXguqexyem7ue4vJIdfqFId75fEetDlvVCKMXH7of0At2o5qRWnleufFgrQAAYOHy3Yw9oQsAW/eUkl9chRDi2Fq9J6B///41/9+9ezcLFizgtddea/D5aWntdZx3AOeYizmwex02kHL6dBK69qx5tqi0iv1FlaQmeuiWWXfiasWWwjrHTNNi+74KXB4XgdReMPZiit7/GwDJp0zG06U7Mdd4QivnY1ombpdGVaR25WjZYFqHqtmQK52M78aPJvQ/meAn/yFtwrUEuvYmVa9b4TsO667dllfGNZNabtJtRkb8TeiNxzKDlLuttN/681gCeE6bQnjrCmzLJHnsdPxde5FYT6tEeSjGzn2167K8whDD+6WzatNB8ovD9OifitFrGCUfPA+At/cIXKldQFEJrf2YlPFXEVxY/ZyV1psyT18Stn+JHYsQCvQiZuyolX6ooorfXZj8/9u788Co6nPh499zZp/JZJnsCZusYgREREVBFiWiQRZFBb2oxdtXrS2320ut5XbxtS16e0Wteu29Vmtr+7oU0UoFrQhqQUGQRZEdExISkpB9JrOcOefcPyKBQIQEJwszz+cfMoeZ3znPZObJPPPbcNTsI6WsmNTBt2BxezFShhIquJymrasBSDp/Ak3b15A+9RskHff3oepADVq07Q6tpVVNZGYOoT2abSxNH72K7q9vafe88QR2bwDTwO5ykpzRsdfZnuMWloCWQsCg5XW6s50CAFqGGHmdFuzle0m+7B7svo71dlfWBjhU5W9zbF9ZAy63Ha/HgR5upvr9V9Ebqki7Yh6RygMEdm/AM+QiMi+Y0vqYksMnb37W1KzhOm4lp6jZ8++1rhTPsZ1KosbdlXpspufevXu56667WLRoEQMGDOjw42pq/BhG7xvs7QxXcWTZw9hzB6PYHFQtX0rGvF8QcuUSCOv89q/b2FNaT5LLxuI7LiYrpe24mvwMD4pybAyn025BUWDqhdlEwxr+6n0cWfnfuIZeglZZTOUrS8i4cTF1q/4LxWoj16Vx4TlppHrbfgOUkmRH01q+2bFZVTKb9+McNBq97jB16/5KxjV3c2TFk6jpfUn3nsONU4bw1zV7MU24YEgG1fXHvlG5YmQW1dVds/tkZqa3y9rurRIxZpC4Y9XWmeit+fN0nFoNNa8swZbRF0tKFtUrnoDkLELu/JPum5npZerFfXnpnb2txy4tyGH7viNAy8o7dQGw9x0BG/7W0v7AC6j94CVUl5f0qXdQveJJnP3PR/fXYWssx7vp75gpOejJuaSXryMvYyDlR47lxosGenCv/QWK3YF7/kPUBlQs/lrYvZqmrauxJKVihEPUr19O2oSbOPLW79FVO1r6MEwTkl1WctM9VNQEWtscPaz9fGvXGvCvfb
alAFAtYOg0bl5F2pW3ESzdTe17/x/P1HuJdGAVt0mj89lx4FiPw7n903BZVaqrm0j7irGfLoeV4b4Q+sYdVL32GN6r7/nKibvHUxWYeGEf1mw+th/D1LH9CAc1Qs0tuzO7L70J56AxGHkX4Bh8GY6Bo9FzCto8D2POzeLjnZVt2h6Qm9xaYKiqgtdpjdscI/kzdu2JHioCNm/ezMKFC7n//vspKirqiUuIOc2eRlrhN1EyBoKiYlbtJepoSYxV9cHWb1z8QY3iw41kpWS2eXyfdBc/uX0sb20owWG3cu4AH2o0woWDfBiGSdSVju+aeyD3PDxaM2ZDBSGHD+/lN6HY3RieLOZnlFGmJ/HtOSP5YFs5vmQnl4/MZc3mMiaMymPq2Hz6R/fBiG9gCVRhBOoxc4bju/Zb6J4srKrCtIv7MHZ4Fno0ipMQ7+9sZMQgH1NGZnLeCetoCyFEd4jYUkmb+q8oaX0xLHbcw3YRdX71B88JI3PxOG18+FkFA/NTCEV0SiubuHBYJn0zPTgitRz52yM4+o/Amp5P3ernyb75J+g2D4bDS/pV38Ca2Q8zGsEIh0iedDulWioH63TG+Wr5/sgRrN1WScnhRqaOzqLPzj9hzxtEpOIA/vUv47zsVqJWF7bUXKy+PJIKxqM6PDRs+BuWpDTMUACj8QiKbzAmFpxWle/NvYD3t5a3tHlxP4bkJbcbmxINEa0pbylYbvgx0cN7qXvnObTaKrzj5mCgdqgAABgx0Me354xi7SdlDO2XyuUj8lrH0melOLjpyiG8vHpvm8d885qBDMh1UOP1EW2oQomGoSPzcE2YNWEgueketuyp4rIReVw4JKPNeP+w3dc6CVi32lD6jWvz/wAFA3xMHJ3Pe1sOAZCZ6uLykXn85e1dAMy6YiAZyQ5ZGEKIDlDME99hXayiooLZs2ezdOlSxo0b1+nH9+ZvshRFaU1Yx/9c64/w7//zUet4z8XfGMvAbO8JjzVxNBTjzO5HuKEGbG7CliQM46vab+k1UBSwRhpBUYnavDgiNWj2VKx2O7puoOsGqtoyX0HXzdY2jvait7ShnJRoj55PUVq+WdF1s937xEoifruRiDGDxB2rts5Eb86fp3N8T+nxP5/o6POsKC3DGf2hKAcr/VhUhfwMN05by0o2zsYvMFw+DIsda+Mhoqn9W+cXqaqCYRzLky3/KihKy3HTNFFVBVVVsPgriexZj31EIWb1AUwtRLTPaExTRVFMXFodRnMDpisVi6KgGwZKsI5IyoCTVtU52mb0hKFBJ3KEqkDXCHvyUYniDpQScmYTtbjP6Lm1WtV2c3zUMCmp9LP7YB12q8rwfAd9s1MxFBuOYCWYOmF3XqfOdfT3croYT8UwTaoaQtQHNIoPNbBs7T7SvA7mTBnMyIHprb/jeCT5M3btiR4oAh588EGWLVtGv37HVhOYO3cu8+bN69Djz9Y/YofrgxRXNJKV5mZAtpcTFy6wGiEiG15CO1JKtLYc38wfEE4ddNp2bUYzwfefR7FY8Vw4jZrlvyFt6p1oOSO//vrQ3SgRE1sixgwSd6zaOhNna/7sjPaeZ0VRqA9ECISiZKQ42ixNGQsWxfhykq+JgoF5Bmvnq0rLHC4ABROTji9w0NXvqVMVXT0pM9PL4apGQhEDm1WJ+e+1N5L8Gbv2RA8MB1q8eDGLFy/u7tP2uJxUFzmp7e8b4NTqiNhTcBVcga3scwJ6FLztL313oqjFhfu8CdS8/gjNO9dhTc9HSc3rlQlbCCG6y7HeUoVPi2t58q/bCWs6A3K9fHvOqJiuJX90lZ+Wj++dLwDsWj3RfR9iHzoBpbkWvfYQRv/Orb/flXrz3xOLouBx9I7nSYizTfyXzb2cI1RF7SsPYCndhH54L01b/kHKxUXQWNGhx5umgprkQ7F+uc26Lw/jy+XhWrqwjw4f6sVZXAghYsARKCN0aC8WRcde/TnOaD0NzRpPLWspAACKK5r48LPDKL1ovWM1VE
/DumU0vfUENa8uwb/tHaya//QPFEKIr0GKgB6m25JwDhlL7ZtPUbf2L7iHjoXMIWhpgzv0eIcZouHdZ7Ekp+O75lsE921GqdyFqoK9/gC2w9uxKRqWA+taxnAKIUQcshNEr9hN05Z/oH7xEXrNQYIfv0YkFCQU0dvct7iiEbWLNpM6k3bDyQNIu+oOImW7MMPNpE27p81uwkII0RV6bIlQ0UK3uLDnDCbA2wA48oYQsSdjmqeuz1zhakKOdHTTJG3y7ehWJxFnBpnzcom6M1D1CKE9H+Hf+g6uIWMI7vmY1KsWoAzKks20hBBxx6qHqP3sfbSqEtxDGojUV2N6fGTb/Qzvn8bOLzfFApg4Oh9dP/OJqe2JGga7yxrZtKuSATnJjB6S0e6Ov+1xNJVx5L2/YEnJRG+qo+mfL+GaMB/N1v4KQUIIEQtSBPQwZ7iKqrd+h3vkFBQUjrz5NFnzf03Qmf3Vj/GXUvXSL/BdfTdadQmNu9bhm/PvmKZCKKlv6/1cF80kFAjgd2XgvfAaLIMuQ5cCQAgRh0IWL2njb6Rm9fNUpY1idbOX/eUaVx9UuHNGAZ/srqbkcCOXnp/LsD6x/3D9eUk9j760FYD3OMSn52Zyz8wRWDqQck2bA9egMbgvuQGz8TBaxV4Ma/vr9AshRKxIEdDDQo4sMuf+jKgrA4DkCyYTcJ56UrDu8uEePp7aN58AILXwm0SsbWe6WwyNssO1vHB4NLvK/IwblsL11VV4M3KhE6tOCCFEb2eP1KEoJmY0gmPSnfzXG/WUHakF4L9f/5x7bxjJ1DH5qGofdD3286OsVpV3Pi5tc+yTXdXUXxki3Xv6NfvDjkycE24nbFohPRWLbxC60rFeBCGEOFMyJ6CHmSiEkvoRtbiJWty4+g4/7dJwpqKiOo6tB63aXZhK219lRLHw+7VVbC9uJBI1eG9HHf/Y0YzSReNghRCiJ9iNZvxrnqN53YsYIT/lZZWUHbebL8CHn1WgqmprAXD8pOBYzA82DJN+2Z42x7J9LjxKqMNtaOax7+SkABBCdAcpArqQooCzqQRnuBorEZz1+3AYHVvxwRk8jKO5HFUxcTcV42rYj6KAw1+KPVCBf/NKUq9agHvEZOpXP4dDa2jz+EDIYPfBtsc+3n0ErQu+BesIp78MR6gaFR1n/T6sevPpHySEEKcRUd0kjZlG895N1Lz9LMl2gySXDZtVxeVo+WA9cnAmhmEQ1HS27K9h2QcH2Hqglq0HatiyswxLzR4sZgRHuAZnY3GnCwPDMJk8pi95GW765XiZVziMMedmsbU0TFMo2gVR9342PYCzfh8qOo5gJY7AoZ6+JCHECWQ4UAwpivnlesoKqmJgjwaoXfEYqsODa+Ao6jauIKPoXhxp+afcZdFOmKb3/0yk+iApk24lWFVM4ydvkzZlPjXvv4hnxCSyb/sVIUcmzv5jcF9wNSFbWps2PE4LwweksbP42GS4SwpysKlKt6/57DAD1L/1FKZu4BkxieoPXiR91g8h6/zuvRAhRK+nKArhqIHVAuppekUdhp+wmoRqd6OoFkzdIKl+Dz+YNxelthSrHuKzQDqTfeVYw1ZWbGlm2Zp9rY+fMWEgk/sEqF/2H7gvnkWgZDtGoBbfzQ8QsnRu3kCOU+MnE032kcfSZbtbj194bhb3zCzA0ouWJO1qigJK1R6q33iMlIm3UrflLSwuL8kzFhFW2t8vRwjR/aQnIEYUxcRa8Sm2ys+wE4YdqzC1IOnXfRftSCmNG1eQNGoKTZ+uBf3U3wxFcOCdfAeYOnVvPoUlOQN7/mDq3nkO1ZWEc9TVNDtzMRQrmsVD2JVzUhsWRWHB9AIuOS8bj8vG1Zf2Y8qFfXpk05ew4iH12u+gN9XQ8MGLJF04DT2jY0ugCiESR0gzeHfrIe7/3Yc8+sp2KuqCX3lfR3MF9ct+iTtUTsOa57Fl5JM+/ds0797IMEc1+TUb8H70NFPVj6j/++NoBz9j4462+698+G
kF/zzkJHr+dJo3voZWeQDfjB90ugAAUPQQ9vKtrPu8rs3xT3ZVUdMY7nR7ZzPTBCN7OJ4Rk2l4788YQT8pV98jBYAQvYz0BMSIRQ8R2r+J5s//iSN/GOGynaT78gj761vvEy7bTcr4G9GS+xzdw6tdCgZ6zUGMYADF5kAxDbTKEgD0pjrM+grISD3tNaUn2fk/MwoIRw1cNgtmN1cAqqnBl3MVokdKsOcMJHxoN6EDW0k9/0p05+knzAkhEsfnJbX8aeUuAGobQzx0eDO/vGscHvvJO8KaVgeqy0vlCz8l5eIiLKnZGDkF5N3+K4KWVDzjbiJcugv/J2/hHHwRDLyUyLtb2rSRluykv0/BuuPT1mPh0h1Yzs3p9Lj8iCMDzxW3kb2uqs1xu1XFYUu8HW0t4QZCJS3Pq6mFiFYdQO2X3mt2QRZCSE9AzERVF0njbkR1uAmX7SRp9NWoWYMJf7GF5IuuwTd1AUYkiNF4BFvN3lO2pRoRIof24Bkxmcx/+TWGFsKWnkf2N/4T1+AxaJUHsJhah65LAZxWtdsLAIsZhX0foHyxHrsZQgk3Y8/uT/ZtSzANHYL13Xo9QojeTVUVPjtQ0+ZYYyBCbWP7k2s1eyqe8yeCEaXho9dRU3KIqi6ceYMxFAvawe3ojdWoTg+hL7ZirS/h7tkjWj+Qu51WLj4vmxQzgCXSRNrcB0iZeCvh0s9Ro1/dA3EqIcXDhFF5ZKY6W2P61xkFpHoSa6KvogDNdSiqhazbH8Z7URHh0p2oemL1iAjR2ylmd386/JpqavwYRu+7ZKsRJLL5dQJbVmHL6INWU076rB+Arx9KQznYXVicXiKHdqHmDSdiS223ncxML9XVTVj1IJg6UWsSTr0R0zQJW1OwRhtBdRBVe/ca0s5IDbV/fRDdX4en4AoCn/8TxeYg4+afYdrdhE/obj8adyJJxJhB4o5VW2eit+bPozbsruJ3yz9rve1xWvnVPZfhdZzcae0KlFK/+llSLplB49Z3safn4SqYSFK/YdRU16HsX4feVItrxBSa3v8z7hGT0TLPpS4Qpa4pjC/ZQSSiE44anJOsodlSsJgaqh5Es369fQSCEZ3K+iBJLhsZXjtdvSxzb3xPKYqJI9pEyJKM1QiiGDqaNSmm5+iNcXe1RIwZYh/3mebQeCNFQIyoGKilH6MoKmpeAaFP3sA5fAJhT9sJwKqqnPL64+kN7gwe5shLP8eMhACFzHk/J+Tt3+594ynujkrEmEHijlVbZ6K35s+jmiNR/rGpjDfXF5Pjc3PnjAL6Z3ranctkMyOo5duoeet/yJjxbzRtXolp6GRf/0PqghZUoihGFF11YjOCaGr8jkeX91TiSMSYQYqAriJzAmLEQMXsOxYAExXb2BtaNn4BHOEj6FYXUYsHR7CKkCPjtHsBnO0UTPSGw5ha5MsjJlr1QVRvHxkTKoRol9tuZeblA5g6ti92i4r1FKuZaYode9YQbLmDqX7tEdLGz8GaPxzV4cJT+wXN7j5YzDDWcDVhR2b3BiKEEGcBmRMQQyYq5pdPafRoARBtoP5v/0lk03JsldupeuHHOBqLe/Aqu4dDq6Vu1dMoNgfps36AJSmN+nf/gCNU3dOXJoTozUxw2yxYO7SxoYKiWki7/AYaNq0kvH8TTVtXU/mXn+Oq20144zLq31iKI9rY5ZcthBBnG+kJ6GIRazLJl82h9u+/JbDtHVxDL0F3Z/T0ZXW5sD2d9NmLQLEQSemPb/aPMAN1hFzZp1wZSQghOkJRgJoviBzajXfsDLwjIzR89BoAnuGX07h5FcH9n+CbvpBIjMeii7OfokBTKEpEM0jz2GOyc7QQZxspArqYiYJiP7YUpuJwYyrx3wFjmhBOGfjlDQi5clDcOT2yT4EQIv6YJujZBWTe8v8IuzKx+I+0/p9id2IG6o79LJ3e4jiKAp8V1/P08u0EQlEuG5HL3CuHkOSUj0QisUhm7GKOaAP17zyLa9g40qbdQ/Ona7
EGKnv6snqEFABCiFjSFRtBZzb2SD31a/6EZ3QhaZNuwb99DSmXzMQ19BLq33kWh97x4UCaYVIXiKD14gnU4uup9Ud47OWtBEItG3eu/7SC7ScsTytEIpCyt4uFrSmkzVqEYXMTtSaRNb8fYVdWT1+WEELEjZDNR/pNPyVqT8abmoQj/1yCnnxc4/PwaM0nLUn8VY40hXl6+accKG+kX7aXb90wkqzk3r0cs+i8pmaNqG60OXawsonx5+f06tWzhIg16QnoBmFXNprVi4lC0JUrq+MIIUSMhVw5RC1urE4PAU8/DCxoVi9hV3aHHm8AL7+7jwPlLb0GByub+NOqnejymTDuZKY4yUhpu2P9qMEZUgCIhCM9AUIIIbpMc0SnrilMmteB2957vwAJajr9crxk+dz8c+shGgMR9h6sJ6zpMb1ui6lhD9cQcuVgC9eBqRNxxP9iEb2Jy27hh7eO4Z2PD1JWFWDapf0Ykv/1NogT4mwkRYAQQoh2NQajbN/fMlZ65KB0kl2d+5NRXhfkP/68mQZ/hJQkO//31jHkpbXdtEvBxNF0ECMpCzVQjeFIIWJLiVkMHVHjD/P4K9sorfTjsFu4ftJg/vbBAS49PyemBYCigOXw51T//Ql8M/4N/+aVmFGN5Gu/S1h1x+w84vSykh38y9ShKIqCfsLQICEShRQBQggh2vXKmr2s214BwOUjc7mzaHiHl/jVTZPn39xJg79lw8AGf4Q/rdzFD+ddgOW49RitejP+TSswQgG0qmLSr/8RdGMRoKoK728tp7TSD0A4ovP6+/u5c0YBg3Jjex2mCaQPwN7nXGqW/wYsVjLm3E/E4palk3tAy/AfeeJF4pI5AUIIIU4SNU32H2povX3gUCN6J8ZMa7pJebW/zbGyqia0EwbZR60e3COmEC79HFtWf4yk7l04QVUViivarh7UHIrSNzOp0z0fHWNiGvrRHzF1vQvOIYQQpydFgBBCiJPYVIW5Vw1FVUBV4OarhnRwF98WLpvKNeMGtDl2zbgBuGxt/+zY9Gb8m/5O+qwfojqSUJsOx+LyOywaNZhyUd82x0YOziAtyR7zcykKUFuCVrGPjBvuw9H/fPwfLcOmN8f8XEIIcToyHEgIIcRJTBNGnuPjoW+NByDda+/UXh+mCZNG5+NLcbJ93xFGDs5g5MD0k9qIqG6SCu9Bs3hwpZ9D2NL9Y+PP65fG9+eN5qPPDnNOXjIXDctqM2QpVkwT9KzzyLj1QcLOLJIm5YChE5H5AEKIHiBFgBBCiK+U7j3zb8TddguXnpvF5QU5p5x8GVE9YIJm8Zzxub4Om0Xh/P5pjBrowzDMLt3YUFds6I4sMCFiS+26EwkhxGlIESCEEKJLnS2rr+iyKYAQIoHInAAhhBBCCCESjBQBQgghhBBCJBgpAoQQQgghhEgwUgQIIYQQQgiRYKQIEEIIIYQQIsFIESCEEEIIIUSCkSJACCGEEEKIBHPW7ROgdmLb+rNVIsTYnkSMOxFjBok7Uc/fXRIlzuMlYsyQmHEnYsyQuHF3JcU0u3JvRCGEEEIIIURvI8OBhBBCCCGESDBSBAghhBBCCJFgpAgQQgghhBAiwUgRIIQQQgghRIKRIkAIIYQQQogEI0WAEEIIIYQQCUaKACGEEEIIIRKMFAFCCCGEEEIkGCkChBBCCCGESDBSBPQwv9/P9OnTKSsrA2D9+vVcd911FBYWsnTp0h6+uq7xxBNPUFRURFFREQ8//DCQGHE/9thjXHvttRQVFfHcc88BiRE3wEMPPcR9990HJEbM8+fPp6ioiJkzZzJz5ky2bduWEHH3hETLoZI/JX/Ge8ySP7uRKXrM1q1bzenTp5sFBQVmaWmpGQwGzYkTJ5oHDx40NU0zFyxYYK5du7anLzOm1q1bZ958881mOBw2I5GIedttt5lvvPFG3Me9YcMGc+7cuaamaWYwGDQnT55s7ty5M+7jNk3TXL9+vXnJJZeYP/rRjxLiNW4Yhjl+/H
hT07TWY4kQd09ItBwq+VPyZ7zHLPmze0lPQA96+eWX+dnPfkZWVhYA27dvp3///vTt2xer1cp1113HqlWrevgqYyszM5P77rsPu92OzWZj0KBBFBcXx33cF198MX/84x+xWq3U1NSg6zqNjY1xH3d9fT1Lly7l7rvvBhLjNX7gwAEAFixYwIwZM3jhhRcSIu6ekGg5VPKn5M94j1nyZ/eSIqAH/fKXv+Siiy5qvV1VVUVmZmbr7aysLCorK3vi0rrMkCFDuOCCCwAoLi5m5cqVKIoS93ED2Gw2Hn/8cYqKihg3blxC/L5/+tOf8r3vfY/k5GQgMV7jjY2NjBs3jieffJI//OEPvPjii5SXl8d93D0h0XKo5E/Jn/Ees+TP7iVFQC9iGAaKorTeNk2zze14snfvXhYsWMCiRYvo27dvwsS9cOFCPvzwQyoqKiguLo7ruF955RVyc3MZN25c67FEeI2PHj2ahx9+GK/Xi8/nY86cOTz++ONxH3dvkAivL5D8KfmzRbzFDJI/u5u1py9AHJOTk0N1dXXr7erq6tZu7niyefNmFi5cyP33309RUREbN26M+7j3799PJBJh+PDhuFwuCgsLWbVqFRaLpfU+8Rb3m2++SXV1NTNnzqShoYHm5mYOHToU1zEDbNq0CU3TWv94m6ZJfn5+3L/Ge4NEyKGSPyV/HhVvMYPkz+4mPQG9yKhRo/jiiy8oKSlB13VWrFjBFVdc0dOXFVMVFRXce++9/OY3v6GoqAhIjLjLyspYvHgxkUiESCTC6tWrmTt3blzH/dxzz7FixQpef/11Fi5cyJQpU3jmmWfiOmaApqYmHn74YcLhMH6/n+XLl/P9738/7uPuDeI9l0j+lPwZzzGD5M/uJj0BvYjD4WDJkiV85zvfIRwOM3HiRKZNm9bTlxVTv//97wmHwyxZsqT12Ny5c+M+7okTJ7J9+3ZmzZqFxWKhsLCQoqIifD5fXMd9okR4jU+ePJlt27Yxa9YsDMPglltuYfTo0XEfd28Q768vyZ+SP+P9dy35s3sppmmaPX0RQgghhBBCiO4jw4GEEEIIIYRIMFIECCGEEEIIkWCkCBBCCCGEECLBSBEghBBCCCFEgpEiQAghhBBCiAQjRYAQQgghhBAJRooAEdcWLFhAbW3t177Phg0bmD59+mnPN2zYsHbbWr16NQ8++CAA8+fPZ9WqVZSVlTF69OjTtimEED1B8qcQ8U02CxNxbd26dTG5z9d15ZVXcuWVV3b5eYQQIlYkfwoR36QnQMStH//4xwDcfvvtbNy4kfnz53PdddcxY8YMXnvttZPuU1FRwZo1a5g7dy7XX389kyZN4tFHH+30eR999FFmz57NzJkzWbNmDQCvvvoqd911V0ziEkKIrib5U4j4Jz0BIm79+te/5tVXX+X555/npptuYtGiRRQWFlJZWcmNN95I//7929wnLS2NRYsWsWTJEgYMGEBlZSWTJ0/mtttu69R5+/TpwwMPPMCePXuYP38+K1eu7KIIhRCia0j+FCL+SREg4t7+/fsJh8MUFhYCkJ2dTWFhIR988EGbMaWKovD000+zdu1aVqxYwf79+zFNk2Aw2KnzzZs3D4ChQ4cyaNAgtmzZErtghBCiG0n+FCJ+yXAgEfcURUFRlDbHTNMkGo22Odbc3Mzs2bPZsWMH5513HosWLcJqtWKaZqfOp6rH3laGYWC1Sq0thDg7Sf4UIn5JESDimsViIT8/H6vVyttvvw1AZWUlb731FpdddlnrfaLRKCUlJfj9fr773e8yZcoUNmzYQCQSwTCMTp1z+fLlAOzYsYODBw8yatSo2AYlhBDdQPKnEPFNSmwR16ZNm8Ydd9zBU089xYMPPshvf/tbdF3n3nvv5dJLL229z/z583nssceYNGkS11xzDXa7naFDhzJ48GBKSkqw2+0dPmdpaSmzZs1CURQeeeQRUlNTuyg6IYToOpI/hYhvitnZvjohhBBCCCHEWU16AoTohG
eeeYY33nij3f+78847mTFjRjdfkRBCnB0kfwrRu0hPgBBCCCGEEAlGJgYLIYQQQgiRYKQIEEIIIYQQIsFIESCEEEIIIUSCkSJACCGEEEKIBCNFgBBCCCGEEAnmfwEf1V+EKeAwDAAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.relplot(\n", " data=pandas_tips,\n", " x=\"total_bill\", y=\"tip\", col=\"time\", col_order=[\"Lunch\", \"Dinner\"],\n", " hue=\"smoker\", style=\"smoker\", size=\"size\",\n", ")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAA4f0lEQVR4nO2de3wU5fX/P3tPdjebkBsESEADAVTuogaUi5KgxBCIAoGKWqWKl9JiLVDqr/bnCyva/kqlSu23+rWtV6AFEYsIilokFAUF1IKBcAmRkCy5bXazt+zO74/NDnuZ2Uv2Mrs75/16+ZLdnZnnnJnJc57nPM85R8IwDAOCIAhCdEiFFoAgCIIQBjIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQhEiRCy1AuLS3m+B0Bg5dyMnRorXVGCeJhEMsegLi0VUsegLi0VVoPaVSCfr103D+lnQGwOlkghoA93FiQCx6AuLRVSx6AuLRNVH1JBcQQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYiUmBoAo9GI22+/HY2NjQCA2tpaVFZWory8HOvXr49l0wRBEEQQYmYAjh49ikWLFuHs2bMAAIvFgjVr1mDjxo3YuXMnvvnmG3z66aexap4gCCLxkAAGsx0NehMMlh5AIqw4MTMAmzdvxpNPPon8/HwAwLFjxzBkyBAUFhZCLpejsrISu3btilXzBEEQiYUEON7QiVUba/HrVw5i1Yv7cbyhU1AjEDMD8PTTT+Paa69lP7e0tCAvL4/9nJ+fj+bm5lg1TxAEkVAYuu3YsPkIrHYHAMBqd2DD5iMwdNsFkylu9QCcTickksumjmEYr8+hsvJPtWhpN3P+tqh8BBbPGgkAyMvL6JugSYZY9ATEo6tY9ATEo2teXgYuntKznb8bq92BbrsDxUXZgsgVNwMwYMAA6PV69rNer2fdQ+Hw3EOTAxZX0Ou7kJeXAb2+q09yJhNi0RMQj65i0RMQj65uPdUqOVQKmZcRUClkUCtkMb0PUqkEOTla7t9i1qoPY8eOxZkzZ3Du3Dk4HA689957mDp1aryaJwiCEBRduhzLF4yDSiED4Or8ly8YB51aIZhMcZsBqFQqrFu3Dj/+8Y9htVoxbdo03HrrrfFqniAIQlgYYFRRJp59eDI6TDZkaZSuzl/AapExNwB79+5l/11aWop333031k0SBEEkJgygS1dAl65gPwsJRQITBEGIFDIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQRDgkWFWvSIhbMjiCIIikp7eql7uwizuj56iiTMHz+vQFmgEQBEGESCJW9YqEpJsBBKoINmfKUMy96co4S0QQhFjoMNo4
q3p1mGyXM3wmEUlnAIJVBCMIgogVWRkqzqpeWRqlgFL1HXIBEQRBhEgiVvWKhKSbARAEQQhGAlb1igQyAARBEOGQYFW9IoFcQARBECKFDABBEIRIIQNAEAQhUsgAEARBiBQyAARBECKFDABBEIRIIQNAEAQhUsgAEARBiBQyAARBECKFDABBEIRIEcQAbN++HRUVFaioqMCzzz4rhAgEQRCiJ+4GwGw24+mnn8Zrr72G7du349ChQ6itrY23GARBEKIn7gbA4XDA6XTCbDajp6cHPT09UKlU8RaDIAhC9MQ9G6hWq8VPfvIT3HbbbUhPT8ekSZMwYcKEeItBEAQheiQMw8Q1memJEyewevVqvPLKK8jIyMDjjz+OMWPGYOnSpfEUgyAIQvTEfQbw2WefobS0FDk5OQCA6upqvPnmmyEbgNZWY9CSkHl5GdDruyKWNdERi56AeHQVi56AeHQVWk+pVIKcHC33b3GWBSNHjkRtbS26u7vBMAz27t2L0aNHx1sMgiAI0RP3GcCNN96I//73v6iuroZCocDo0aPxwAMPxFsMgiAI0SNIScgHHniAOn2CEAsSwNBtR4fRhqwMFXTp8qQuo5hKUE1ggiBihwQ43tCJDZuPwGp3QKWQYfmCcRhVlCm0ZAQoFQRBEDHE0G1nO38AsNod2LD5CAzddoElI4AknAGs/FMtWtrNnL/NmTIUc2+6Ms4SEQTBR4fRxnb+bqx2BzpMNoEkIjxJOgPw3EOTg24DJQgiMcjKUEGlkHkZAZVChiyNUkCpCDfkAiIIImbo0uVYvmAcVAoZALBrADq1QmDJCCAJZwAEQSQRDDCqKBPPPjwZHSYbsjRKV+dPk/iEgAwAQRCxhQF06Qro0hXsZyIxIBcQQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQYgFCWAw29GgN8Fg6QEkQguUpKTQfaRIYIIQA4Hy8lNkbuik2H2kGQBBiADKyx8dUu0+Jt0MIFA9gJLCTKz+wcQ4S0QQiU+gvPxsjh4iKKl2H5POAFA9AKJPCFWXNkHq4VJe/uiQavcx6QwAQYSNUH7bBPIXu/Py+8pCqZnDI9Xuo4RhmKQSu7XVGHQGkJeXAb2+K04SCYdY9AQi09VgtmPVxlq/UduzD0+O6bS9L+3G9Jm6ZyMJkpc/ad/fMO+j0HpKpRLk5Gg5f0u6GQCtARDhIpTfNuH8xZSXPzqk0H1MOgNAawBEuAjlt001fzGRetA2UCLlEaouLdXDJRKdpJsBEETYCFWXlurhEgkOGQBCHAjlt00hfzGRepALiCAIQqQIYgD27t2L6upq3HbbbVi7dq0QIhBEapBCicmI+BN3F9D58+fx5JNPYsuWLcjJycE999yDTz/9FNOmTYu3KASR3CRQoBmRnMR9BrBnzx7Mnj0bAwYMgEKhwPr16zF27Nh4i0EQwhGlUXuqJSYj4k/cZwDnzp2DQqHAsmXL0NTUhOnTp+OnP/1pyOcHCgRbVD4Ci2eNBOCKvhMDYtETiL2uTieDpksmtBnMyNaloyBXA6k0uj4Vp5PBga+bsP6tL9lR+4pFE1A6uoBtK1Q9L57ScwaaddsdKC7KjqrcsUIs72+i6hl3A+BwOHDo0CG89tprUKvVeOihh7Bt2zZUV1eHdH6wQDC9vkvw0Ot4IRY9gTjoGid3isFsZzt/wNVhr3/rSwzo50oPEY6eapWcM9BMrZAlxXshlvdXaD0TKhVEbm4uSktLkZ3tGqHMnDkTx44dC9kABJoBzJkyFHNvujJqshLigc+dEu18QdFMD5FqicmI+BN3AzBjxgysWrUKBoMBGo0G+/btwy233BJvMQjCi3jl7YlqeggKNCMiJO4GYOzYsVi6dCkWL14Mu92OKVOm4I477gj5fMoFRPSZALn545W3J+qjdgo0IyKA0kEnMWLRE4iCrsF8/PHcUhkgnXBeXgb0l7oSoohM
rBHL+yu0ngm1BkAQQhDUxx9Pd0qAUbvTydDefiJuJJ0BoEVgoi+E5ONPAHdK0yVTXBajCQKgXECESHD7+D1JxNz8bQYzr6EiiGiTdDMAWgQm+kKybJnM1qVTERkibiSdASCIPpEkWyYLcjVJYaiI1IAMACEeEsDHHwypVJIUhopIDcgAEESiEQ9DJQGMlh4YzD2wWHuQm5mWsttNCX5CNgCdnZ2QyWTQarn3kxJhECAgiUhQYvnMPK5tYyRQShHb90EC1F/oQqPeiLf31NF2UxET1ACcPn0aP//5z3H8+HFIJBKMHz8ezz33HAYOHBgP+VIPyuGefMTymXFc+9H5Y3H10CzAGQ3h/TF021F/wYBtn5yi7aYiJ6gB+MUvfoH58+fjjjvuAMMw2LRpE375y1/i1VdfjYd8fgSKAygpzMTqH0yMs0ThEa+kYylLKCPxaI3We69jtDoCP7MI2uN6H17YchRr7p2Ewlz15esEayMMGTqMNjgZhnO76aUuq7BrDuHcSyFm0ik2ew9qAMxmM2pqatjPS5YswebNm2MqVF+51GkRWoSgxCvpWEoSykg8WqN1j+tUTSvmf2ZqRUTt8b0Px8+2IVOtYA1MNNNYZGWoIJVIOLebnjrfCavVIcyMNBw9hJhJp+DsPWgg2JVXXokvv/yS/VxXV4fBgwfHVKi+cuPoAqFFCEqyBCQlIqFUwIpWlSzf6/A9s0jb43sfnE6wwV/B2ghXBl26HMUDdagpK2HbVilkWDxrBD784pxgVcXC0UOIamipWIEtqAG4cOEClixZgurqaixYsADV1dU4ceIEKisrUVlZGQ8ZQ+bd/WeFFiEo7oAkzz88dp83EZBAs6dwjgm3rb2HGrDQp7N0P7NI29Oly/Ho/LFe115YVoJ9RxrZQUGwNsKWgQGKB2Zg4og8/PKHk/Dj+eOwYOZw7PjsNC51WASLPA5Hj2g951jJlywEdQE9/vjj8ZAjZJI+EjhJApISkVBSNkcrrbPndS51WLCz9gzmTR+GYYMzkatTsc8s4vYY4OqhWVhz7yQcP9sGpxPYc/Ac7rp1VMht9EkGBtCq5HAywNOvfpEQkcfh6BGv9N1CtxlreGcA9fX1AACNRsP533XXXYfrrrsuboKmFL37vItyNWwmSiI4ocyeojXD8r1Ol8mOwnwtrhyg9XpmUWnPCRTmqnHDVf0xbkQuVt010cuvHKyNSGQQbEYqAb5vMaJBb4LB0gNIwpNFCLlTcfbOWw/gwQcfxJ///GdcddVVKCgogOdhZrMZBw4ciJuQnlA9gMuIRU/AQ9cAufRZQjkmFKRAq8GGVoMFObo05OiU3Fszo9UeAjzTYG1EIkMU5Q+1Pd7FVIQhS7zl7mObQv+dBqoHwGsAOjo6ALh2/bz22mtgGAYSiQR2ux133XUXPvjgg5gJHAgyAJcRi56AALoKtOND8Gcah22OBrMdqzbW+rlSUnUrtNDPtE8FYX72s59h//79kEgkKC0tZb+XyWSYNWtW9KUMEaoHQMQDUcZrxMno0VboxIHXALzyyisAXIFgzzzzTNwEIohEQIydVLyMXioupiYrQbeBUudPiJGEjdeQuFwonoun0SJe2xxTcTE1WUm6bKBJvw2USAoSsoBMjF00cRuZ926Ffv6x6bjYaqSt0AKSdAaAIOJCAsZrxNpFE1ejxwCD8rVQShj2MxF/ks4A0CJwgiJ0kqxot++x3U+TpoDRbAckEkGTf8V8XSIBjR4RW5LOAJALKAEROklWtNvnuN7CshI2Qleo5F9xcdEkQdU0InoEXQQmiGDELEmWx4Ln9y1G3gXPaLfPdb1Ne+pw07jBsUn+FaKetHhKRJukmwEEcgGlKWXY+Ni0OEtExMQ1EcaoPtrt810PkhhsBQ1n9kIuGiLKJJ0BIBdQ4hGWayJEX33ABU+1wusa2ZlpUXWN8OkDJvoul7AXdslFQ0QRQQ3As88+i/b2dqxbty7kc5K9IlgqEvLukSiM6i8ZrPi+
tRvnm4348Itz6DLZsXzBODy+eAJ+9+aXUdm9wqWPew0g2rtixBhwRiQOghmAAwcOYNu2bZg+fXpY54l6BuAzes6J9X0IdWeNh2vCaLFDpZSj02hDU7sFGWoFtCrX6NlztJublYabJxbhUmc3WgxpsFh7kKW93AbXKLwgRw2b3YlzF7sglUhQPX0Ytn5yih0xB3SNhLNLyMfVkqFWwmp3oKh/BnIz07jP8Uwcl5mGnAyexHE+pERUrNA7wIg+I4gB6OjowPr167Fs2TKcOHEirHNFOwPgGD2vWDQBJYMyYvPHFu7OGgbQqRX4/lI3Nmz+gj2npqwEg/O0KB6YwY52c7PSMHvyFdhz8BzKrh+CJ//nP35t+I7CC3LUuGPGcDy/6Suva1dMvgJ/23kclwxWyKUS3nq5Ye8ScrtaQin5KAWO1rfhpa1fs8csqx6NscXZQY1AQgachYPQO8CIiBBkF9CvfvUrrFixAjqdLqrXHVnUL6rXSyS4fMXr3/oyZuXo+rKzhuuct/fUof6CAYZuOzvavXliEburZtOeOu42PEbhv156PZYvHI+/bP/G79r52WpXLdvGTvz6lYNY9eJ+HG/o9NpJE8kuoVDObTXY2M7ffcxLW79GqyGEFAo+ej7/2PSk6jxTsUyimIj7DGDLli0oKChAaWkptm7dGtVrv7v/LH5UPRaAKwVrKnHxlJ7TV9xtd6C4KDsh2uM7x8kw6LY7cPUVuVixaALONnV67aoJ1EZe7/eH/nuR81irzYGashL8q/YM+92GzUfw/GPTMShf22ddwrkPJy9e4Dym3WjFyCtyAl7fTV7wQxKSSN/LVPs75SNR9Yy7Adi5cyf0ej2qqqrQ2dmJ7u5u/OY3v8GaNWsivvacKUOh13cJnn87FqhVck5fsVohi4mufWmP7xypRAK1QobWViNKBmUgR6fCp182YuiAjJDbGJCj4Ty2IFeDN3efwKUOC/u91e7AxVYjm2YgbF08fNoatQIFOWo0tXbznpvN48fvp1WF/WyS7d2N5L1MNl37itB69qkgTDzYunUrPv/887B2AYm2IEyirwFIAKOlB2cudmHjP45xrgGw58mAIyfbsOWjOpRdP4R1AwVqIydHi8+OfO8nT2F/LX7+x88CFxcJRxeOY5dVj8bmD+vQ1Nod9TUAX5Lu3Y1gDSDpdO0jQutJBiBV8ClHd8XgfmhtNcatPd6FSY9OIEOjwOzJV2BgnhbadAV06XJo07wXZVuNNjzx5wNeu4GkUmBcSR7ydSrONvLyMqC/1OUvD0LsgELUha9a1doHS2Gy2PnPDbV8ZBDi8u7GMG9SOMFpKft36oPQevapIlg8qK6uRnV1dVjnBNoFlK1T4XcPT4mGaImJTxCQVBrFZPAhtMf3R+25EGjtcODvO4+zo3Ctyr9zaTVY2A72UocFmz+qAwAMLdAhP0MVtjwhRcfy6eLTGbZ2WTl92iaLHUW5Gv774ARytErkaJXs54QkFrt2KDgtaUm6SOBA3Di6QGgRREm4wUw5PJG7OTqePfbB6GsHxNEZPrZ4QvLvyw+AKEtdEryklAEg+kiELgG+oC1NmgIX2rqRnqaA2eaAxdqD3Mw05GQqserua1HX0A4nA+z7qhELZpZcdptwycMh7yWDFekqOXRaBRx2p/fxkuCBWVyBaY0tXVi5ZCJe3v6Nl88/afblB6HDJPLIYwpa8yKlDMCJhnahRUg+ouAS4AraWjCzBL9/60tU3nQlrDbXnn3P3zwXTB+ZPxbXXJEFOPjlycnW8sr7o6pr8M+PT3p12JAw2LDpaMBFWd/ANM/F6PvnXA2TxY7hhf0wsF9aynQSmjQF5wxHkyaOzp+C1rxJqXTQqRwI1idCqB8blUAej2CmtQ/egJ/UjEdzWzfuum0UDCY72/kDwE3jBvsFTb245SgMRntAeZoumXh//8v2b3DTuMFex59vNgUNzPINTPM8/pV3v4XV5mTTWPTl3iYiVlsPFpaVeKWUXlhW
Aqu9R2DJYg8FrfmTUjOAd/efpYpgbkIc7UQtGRlHKoiashI4GZ9gL57gL3d7fPK0dZkxQJfG68Lw7IDdwWe+x7QaLK5F2l43gNFsx6Pzx+J7vZHzmoX9tdyunyQeSWrVSuw5eA5VU4td94wB9hw8hwnDUzSFigeUeM+fpDMAok4GFwahLvZFMxmZb5tOBpBKJJzX52uPT57sjHRAwkDCcz3PjtcdfOYJu8js03kX5Kjx6Pxx2PZJvd81B+Wow09VneAdiS5djrtuHZW8uYciICUS70WZpDMAot4GGgahjnZCSkYW4sKZ0Wy/PLIE8NV3zZgxsRA1ZSWsG2jfV4340dxr8Jd3vmHbe3T+WOg0CsDJL0//bDVOnW/H2aZO3D/napeLpvf3B+aNxj96t5J6rgG4/9jdawAqpQxGS49X593U2o0XthzBo/PH4oUtR0PqFJN6JCniojJJn3gvBiSdAQgEb6peERLyaCdYhxDI3YHLhiE7Mw3tJhu2/7uePW5hWQn2fdWI++ZcgycKr4O+3Yz2Lgv+8/UFNieQ0wm8+cEJr1q7vqmYbT0O1H59ga0BoJBJsWLRBDS2dMFmd6J/dhpW3TXRW34JsHZZKZrbzGhsMeKND06gy2THQ3eMQYZGAWvH5fvS1NqNfhnKkDtFwUeSke5kEeu+fREbPz5SygDQIvBlwhrtBOgQ+Nwdv310Cs63mNjfaspGYNsnp7yO27SnDmvunYQcrRINvccCwIJbSrD+rS+9OlAvF0rveoK+w4Jjpy6xswe3UdlZewbr3/oSVVOLsf3f9Zh8TX9/+RlAIZXgjx6yA8Cf/nkM86YPw9t7vmO/Uylk0KYpQu4Uue7tsuoxcACubRV9DQKTAN+3GHHxkom/Yw/RINMWRx7Eavx4SCkD8K8D52gR2E1fRzs+o0uj2c7p7jCYe3z8/QzncRab61h32cYMjQL9s9VBXSiGbjvqLxg4jUrV1GJs/qgOUin65Kop7K/1cg+F7QZw39tHJqNRb0JjiwlvfHAcXSZ7n3MAhbqwHKpBTqaFaUI4UsoAKOQptas1csId7fR2Qq/vOo6bxg2GVAqMHJKNUUOyoO+0oGLyFcjPVoNhGNh7HFgwcziydWnI1KqgkEmguf0q7PjsNJuZU6WQocfhxNFTegwb3A+PLR4PQIKLrSYvF0puVhpmThqCHgcDg6UHOrUc3TYHBuVpUDWtGHsPNQAAbp5YBEiAoQUZGDUkCxNG5MNg8q885obPVTMoRx25G4AB7D0M/rj5qNf1X9r6NdY+WOq128g3oI1rlB7qwjKfUfM1yHFdmPbVUy2HwRTiTIQCswQlpQwAERmGbjte33XcL0Pnw3eOgcPhRKfRhtffP47Km66ExebA5g9PssfUlJUgTSnDHdOH4Z+fnEKXyY6lVdfgvX31mHR1AZ565SB77OJZI3BPxSj87V/HkaFRoGLyFV5uHt/sm/dUjIK9x4k3P/iOPebB6tFY/9aX7DFcWUd1ajmWVY/2y9J5ecE5MjeAZ04jN+x20wwl54heKZf61S4eVZQZ8sIyn1GzWHuEWZjm2FXlG+jHNxNxOpmk3U6bKqTUkFmdRvYsEjqMNs4qXRv/cQxpSgXe7q3iZTDZ2V087mPe3lOHTpMdnb1F2pcvHIetH5/E6GH5ftd784PvYLb2oGpqMX54+9VegWLuoC3PwC6Dyc52/u7v/uxzjLvymNHSwwZotRps2P2fs6iaWowFM0tcrqMP62AweQT+RBDQ5c5p5Il7uynfiL7+goEzEMndsftey3dh2b3+4BnItXzBOOTyyBLrhWlfPbkC/fiCrZoumTjvkZgDs+JNSvWYbQar0CIkNVkZKkil3IFaZlsPG3DF5+93B1/Zepw439zlKqLCE/hltTux+aM61JSVBA3sksskIQV/qZRSnG7qwp/+ebkegXvR2LNgDDsqjjCgKydDyTnDyNEp0eARicx1jzy/6zDZUJSnCW3Rnm9tBxBki6PfzCVI
oJ8nbQZz8m6nTRGSzgBQIFjs0KXLMWpoNqeLIV0pZ0eYfMFd7uCr3AwVFL3Xcf/GFbilUshwxcDMgIFduVlpKOzPXTnMN/irfz8NWzQe8F80dh/nHhVHHNDlBMYWZ2Ptg6V+dQD4XDVcAWpZGiXbsT//2HRcbDUGXpuIJC12lOHTM5Qtstm6dArMEpikcwGt/FMt7lu3l/O/dW8cFlq85EMCGK09uNBhwemLRmRqlXhs0Xg/F4Na5fKz7/uqETqNAotnjfQ6pqasBJkaBYoH6qBTK1CYp8aj88fi3IUOVxCWx7FLq65BepoM86YPg9Fs87vWsurR2HekEQAwc9IQ/PW9b/3y1zx05xj2GHf7Ep7Rp7T3LfcaFSNwQFfI9NYBKBmocy389u7+4XPVFA/U+X3nlgcMMChfi6JcDbsdNix6DUOfz+8Dvnru+6rR73l76ehBQa6G8x5xHUvEBkErgvWF+9fu5o0EnjNlKObedKXgFXjiRcR6SoD6C11o1Bu9FmGXLxiHwnwN2rqsXi4Gl3+9Bw6HA9p0Jax2B0yWHqQpZZDLpdAoZd7Vv2TA0VNt2PxhHburqKSoH/Kz03Cp3QJNmgLdVjuUSjnkUglM5t6KWxoFDCZXumeHk8Ezf/uCTdfszl9TUpSF3EwVLnVakaaSQ5cuh5MBVr24329EyVfNy2Dp4Tw+ajtnuCplIXD1rKR8d3317H1+wWYivFXekqpHCo7QzzRhS0L2BVGXhPQhJ0eLM43tfd5CZzDb8Z//tnjttQei1wl6ln70vPb/feAG3tKPfjLydNLzpg/DDVfle8vYhzrGibYLRSzvLiAeXYXWM2FLQvaFQLmAZFIJ/rJyRpwlEggJcODrJjaiti+dV4fRxrvAGo2FOL5tkhf0JrR2WEKSVZcux0N3jOFc2B05JMtbxj4EvxXma/B/7r8eFlsPcjNUsRmB0l53IkFJOgNAi8AuDN12r3QKXguYagV/hyMF2o12mCx22BxODC/K4lyIS1PJcbrZiHSVnDPIyg+fKl0ZagXy+nEv8qmUMmzYdMTlmjHbA3eKDHDlQB3mTR/m2kHDADtrz6DLZOdeLAw1+I1n9K/rTakQNRJwliEKyOiGRNIZAMoG6oJvAdNosffm5OfocCTA8YYOtLSZWZ//qCFZeGDeaPzPtstbGZdWXYNzTZ3409ZveIOsvODo5NznLF841qsy19Kqa/DOpy6X0+HvWvD2nrqgnaJWJcPQAh3Wv/UlMjQKzJw0BIX9tYBEwq4JhEu8Ujonc+ropIWMbsgk3S6gQIgpGyhf4JBKIecNrmk12HC+2eQVeDV6WD7+8VGdV7DU1o9PQqtWsee7g6z4AnS4Ojn3Of2z1Vhz7yQ2OGz3wbM4eb4TKoUMzt4dM0EDgBigdHQBfvvoFNx16yhs++QUfvv6Yax6cT+ON3T2qRpXVHYAJVA7xGWo8lfopJQBqDvfKbQIcUOXLseKRRP8ttCZLNzJ2zpMrgLpfkFcElc65M0f1WHzh3XY/JErBYO7BKP7fCfD8HZafJ2ck2HQ2mnFH7ccQYfRig2bjrCd//1zrsbeww1+MvIhlUrgdDLsWoD7nL7+YYcaeRsp8WqHuAwZ3dBJOhdQIEoKo+y/TWR6R8UD+nkveBrMPazf3b11Uip1FQPXpCtwtqkrpMAdm93p9VkqkfB2WoGCntRpcnSZ7NhZe4YtFiOVSGC29uBSh8VPxkAunWgWYolXcRAqQhJ/BK/XkETQNtAkhlNPj4yevkndHl88AXanE82t3dh98BxuGjcYmnQZMjPS8Kd/XN5l45uMjXcNoHehzWzrgb7Tihc9KmrVlJUgNzMNQwfq0HDR6NUBLp41Agq5FDv2nfaTkc9Xm5eXgfrz7Xj2tUOuHEC9bp99XzVi1V0T++ZP59qnH4u/hjDaEcu7C8RQ1wRbAxD6maZUHAAFgl2GV08J0NrlvQffnXJ5eGEmMjRKtHaa8Ye3LmdwXFY9BgzD+AViuYOs
AFeuJc+0xp6GJk0pg8XmQKZGhTSVDC1tLiOz6q6J0KkVuNDuKu4CBqzr54e3X40Nm474jdS4Fkjz8jKgb+3C0fo2v9w7AfPvJ9luELG8u0CMdY2XcQ8BoZ9pSsUBBOKzr5uoIAwAMIDJo5BLblYaZk++wmukXVNWwpZGbGrtxm/++sXljtfpsZWSZzRVmK/Bhs1HUDW12JVvZ5or06YvbveMNk2O7Z96F14/39wVlkvHYLL7ZZp8aevX/DtqEmwkSMQRqvwVEim1CNxt6RFahITBc/Hx5olFfimZ395T50qt0AvfIhnfjgqjxYEMjcIr+2OgxU6u3DijPBLGcZ3jS7iLewm1GySCtNMEESsEmQG88MILeP/99wEA06ZNw8qVK6Ny3aSrBxDIPRHMdSFxjaC/b+mCSimDWiWHxdYDbboSOrUcYBgsXzgO55uNIaVTdgd/ndEboU1TwGSxQ5OmgM3egzX3TkKbwQqVUoptn5xCe5cVHV0W/GDWKGTrlFi7rBQ2mwNX/3ASWjssSFPJoG83Y2CO5vLUuzdK97ePToHB3AOLtQeZGiUeXzzBr0CKV86cXv3teiMUCllYi3tGs51deAaAvYcacKnD4pUO2u0mcOclkstkoQW+hfksQ5qJuGsCt5qgSVPAauuBVq1MeLcVkbzEvcesra3FZ599hm3btkEikWDp0qXYs2cPysrKQjo/ZSKBgxT3Dthh8AReqZQy7Nh3GvNnlmCLxyLu6nuuDZia2H3+me872WpeC8tK8MW3Tbh50hC8vP0btp1H7hwDg8mG5zd5LuqORJpSiv/d8V8vebjgqlv720en+CWe49Kv9usLqCkr8Utcx+nflQDtRhu2/7veK4XEnoPnXAaD4x66fy+/fkjgwLdwnmWo5R4DyHPXraPIbUXEhLgvAp88eRImkwnjxo0DADz11FMYMmQI7rnnnpDOD7QIXFKYidU/mCj4oksoGMx2rNpYy7kAKpVKUPtNM1s8ZO+hBnSZ7GyHwXfuvOnD4HQy2P7veq8c+K4yfSPw0tbLO31+VHUNTBY7TJYedmG2y2Rnz1MpZFi+cJzfIm1N2QjO5HHzpg/D23u+8/vOM2FbIJ09ffh8x1VNLcbeww2YOWkIhg3ORK6OP3cP3zXW3DsJhblqGLr529j+73ruZHN9eJa6dAUa9Cb8+pWDfuf9eun1KMrVBNV5+7/rUzJyOBn+TqOB0Hom1CLw8OHD2X+fPXsW77//Pt56662oXLvufCfy8jIAgP1/onLxlJ7TLWN1OnGxpZvtZD2Tn3XbHSguyuY918kwl33yHu6dptZuKGQSVE0tRv8cNZpbu9FltuFv/zruL5jk8vUsNodfO8Gqgfl+55Y5kM6exwQ6DhLgUocFb+/5Dr95eLLXOb7wXUMulyIvNyNgG1yyByKYXjaGu4DOgBwt8vK0QXXmukepQqL/nUaLRNVTMKf5yZMn8eCDD2LlypUYOnRoVK6ZrVNBr+8S3OKGglol53bLMBJs/Id3tOumPXWYN30Y1AoZ9Pou/nMlrmhZlcK/WpZUKnHNDKYVs//nuob7PJVChjSlv889WDUw3+/cMgfS2fOYQMd5yuZ7Tqj3V6MMfA/dlcp8ZQ9EML2UUu5yjUopE/TeuOUJVZZkIhn+TqOB0HoGmgEIsgvo8OHDuPfee/Gzn/0M8+bNi9p1bxxdELVrxRq+ilF8qRwK+2td7g4JIJUAD90xxuvcmrIS6DQK7DvSiKVV1/hVy9r26SksnjUC+75qxMLeyl6+VbZqykqw93ADO+t4b189llZd43XM4HwNHpjr/d3iWSPRL0Ppdy13dTBWZ7Uca+6dhJqyEiyYWYKCHDVnBSiue+MpWyhVo/jur/s8rt8XlpVg35FGTtkjacszTfWvl16PZx+e7OfTDyQPVckiYkXc1wCampowb948rF+/HqWlpWGfn1KRwBzBKgZzgCpVagW7UJihUWD25CtQkKtBmlIGhVwKm92JuoYOHD5x
ERNHDkD/bDW6La5dKRs2He0twO6ESilFfj81urqtGJibgW6LHTqtCjIZ0GWyQ5uuQLelB+o0ORjGCYlEik6jDdp0BV5//7/Qd1rYjJyD8tSw9zDoNFqhVSthMNmQrpIjU63wrg7Gscj56PyxuHpoFncQl8+9SU9XoulSkFq5Idxfrkhmz11AMpkMunS5t+x9fJZhL9pKAJtTgoutRtcuIHsPtGmKlE0bkTR/pxEitJ4JFQm8du1a/POf/0RR0eU96DU1NVi0aFFI56fKIjAvAXaUBFq43PxRHX5+10TkZ6Xj8HctcDovR9zOnnwFCvO1rjz8m4/gUoeFPT83Kw133TrKq+AKb7BUbydntNihUshdZRa1oUXXhroAzEdSP9MwEIuegHh0FVrPhFoEfuKJJ/DEE0/E5NqXOi3BD0p0AlS14guESk+ToaZsBKw2Byx2Bz78ooFNtMYVAfyv2jOsEZg5aQg27fnOa7/867uOc+fXYQCdWtFbb+CLsKJro5nIjSCI6JBkkVMigSeMnSvLYUGOGtp0Jd7e/Y1fJ88XAezesqlSyFA8SAelwjsh28KyEhgtdu50DH0scEIZGgki8SADkGhwlFZ0R6Xq1HI8Mn+sV9bNpXOuwXOvH0aGRoGqicW92wadWDZ3NCw2B2ckbPHgTKy++1rk6FwFdH73hndpyU176rD2Qe71mUsGq1eOoZsnFgESwGjpCeirDpoWOZmStiWTrAQRgJQyAEmXC6i343SnRsjNTIO+3eyVGoFNxTwoA8fPdaLdYL7cqTPA+ZYudkHYcxT/o7nXQCmXckbC1jd2sjOAn9aM53TNmCx25Gj9R+fpvdsVfdvc/mkQV1Cggu3JlLQtmWQliCAkXTrolNkFJAHqL3ShUW/0Sm2weNZI7PisnvXRuyNqJ47IwxN/PuDax++RVXPlXRPR0GwMOTp3xaIJePndr9nr80X28rl0jDYHDp9ogdXuDOu8QPehtcvmWrhmLs9SfK+VKM800sXsYCSKnvFALLoKrWdCLQJHSqCi8ADwv6tvjqM0fcfQbUf9BYNXJ2q1O/DmBye80ji4o1JbDRZY7Q589V0z7p9zNV5591tY7Q7oO7rRP1sdcnRuY0uX1y6gD784h4fuGOO3C4jPnaNVyTA4T4suM3/pyZA7Qp78Nzt7F6kTcYGYFrOJVCLpDEAgkqkkZIfRxptWwS9Lp1KGHF0aCnLUKB09ENs+OYWashL0z1ZDJpcCjGsxuKm12+s8rujcHod3r95lsuPKggxu1wwXDFA8MAOtXbaIF3W5FpQ37alj898k4gIxLWYTqURK1QMYWdRPaBFCJitDxaZV8IQrS2f/7HTkZCqxrHoMNu2pg93hBAPg+U1H8Mxfv8Dv3/wSd8wYjoIcNXve4lkjkKlR+ETs+n+3fME4aNPk0KUrUJSrcY1igzkFGSAnQxk4+jUE+EbT0t7UCYkY/Ro06pcgkoiUmgEkE7p0OYoH6vzSGz+6YCzaOi1YMLMEYIB/1Z5hM4EyvTOGqonFfts7/7L9G6y5dxLau6wYkKNGT48DOrWSHdm7I0s9v4uoVF6gRd0Q4RtNTxyRj5wMZWIuqkZB75hBu5OIMEkpA5BUJSF7XSn9s9MxvKifaxeQTgWj2Y7/98aXfod3mGzI0vZW+fKowuXGanfg2zNt2PxhnVeaYQAevunLbgqdWgFDtx0NLaa+dxYRlt3j2xqasJ2/m0QsN0i7k4g+kHQGIGUKwgAAA2hVcmhVHo+BJ9tmlkbJdpjnW4wBM0cG9UcnSmeRyKPpJKOvAXqEuEk6AxBoF9CcKUOjMwOIdCodQjlHvt+DBUyNKspEYb4GA3LUXjt33Hv8A+3gcWO09OB8ixFV04oBuLZeRr2zCPUeJuJoOgmh3UlEX0g6AxBzIh0dBzs/2O/BRsW9s4YxV/TD849N98ocOWH4xOAjaAlwuqmLs+BM1DqLRJlhiAjanUT0BQoE8yHSQJ9g50czkKgvegYqJxlq
CcS+thHJDEPoYJp40Wc9k9Do0jOND6IJBJNJJfjLyhkRXT/SqXSw84WeqvO1zxaciUJnIbSOooTWU4g+kHQGINaLwJFOpYOdL/RUna/9QTnqqHUWQusoWmg9hQiTlAoEiwaRBvr0pRRhPAOJ4tG+0DoSBBEatAbARaTl/cIoRRjJVD0Sf3E02o9nG0L7UeOFWPQExKOr0Hqm1BpAXIh0Kh3sfKGn6vFoX2gdCYIICrmACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRIogBmDHjh2YPXs2ysvL8cYbbwghAkEQhOiJeyqI5uZmrF+/Hlu3boVSqURNTQ2uv/56DBs2LN6iEARBiJq4zwBqa2txww03ICsrC2q1GrNmzcKuXbviLQZBEIToibsBaGlpQV5eHvs5Pz8fzc3N8RaDIAhC9MTdBeR0OiGRSNjPDMN4fQ4GX1pTX/LyMsKWLRkRi56AeHQVi56AeHRNVD3jbgAGDBiAQ4cOsZ/1ej3y8/NDPj8u9QCSBLHoCYhHV7HoCYhHV6H1DFQPIO4uoMmTJ+PAgQNoa2uD2WzG7t27MXXq1HiLQRAEIXriPgPo378/VqxYgbvvvht2ux133nknxowZE28xCIIgRI8gFcEqKytRWVkpRNMEQRBELxQJTBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUQbaBJgvv7DuNuTddGbPz171xGKt/MBHr3jiMkUX9vI59Z99pfPZ1E24cXeB3jXVvHAYATBw1AB/85yzaDVZUThnKHu/ms6+b8LuHp+Dxjftx4+gCnGhox8iifgCA3V+cBwAU9dfiUqcFuZlpqDvfiTlThmLuTVfi8Y37kZuZBgAYWdQPJxra2X+7r/vw7z9F+aRC9lrlkwrxrwPnAADFg3TssTeOLsBnXzcBAPvvbksPyicVYsf+s36yu/V16+nW9fDxi6ysDc1GFPXXsjLNvelKr/v4zr7TAMDq7JbZ9xj3eb56rv7BRL9ruHHL5/7ds213O77vgSee8nk+23VvHGb1dL8Xq38w0e9aXOd7fn+iod3vPK5ruGX2lMn3euHgK6/vNbiu6Sl3sGOD6RPOb9H42w52n4OdH0n70YIMQADe3X82oocU7Py6853s/+vOd3od++7+s7zX8DyP63hf2gxW9nvPczw/txmsXu21Gazsd57neP7bYnN4tef5b7dOvt9z/dv3/259+dr1lct97zzvo68svnK5j3Gfx3Vv+K7hls9TXk99ffF9Jp7yeT5b32twXc8ts+/5vt8HwlNvdxue98PzmHDgun++77TvNT3lDnZsMH3C+S0af9uREGn70YJcQARBECKFDABBEIRISToXkFQaWuroUI8LRH6/9IiuE+x89+/5/dIBeMvs/s73e9/fguF5/Wie05frhtM+0Hc9+WTjOiaUY4PJ53tssOcV6jPnen/43plA75IvXHr7tteXd9/3nGCffWUJdmyo7YbyWzT+toG+90eRth8OgdqRMAwTOLcyQRAEkZKQC4ggCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkpJQB2LFjB2bPno3y8nK88cYbQosTdYxGI26//XY0NjYCAGpra1FZWYny8nKsX79eYOmixwsvvICKigpUVFTgueeeA5C6uj7//POYPXs2Kioq8OqrrwJIXV0B4Nlnn8Xq1asBpKaeS5YsQUVFBaqqqlBVVYWjR48mtp5MinDx4kVmxowZTHt7O2MymZjKykrm5MmTQosVNY4cOcLcfvvtzNVXX82cP3+eMZvNzLRp05iGhgbGbrcz9913H/PJJ58ILWbE7N+/n1m4cCFjtVoZm83G3H333cyOHTtSUteDBw8yNTU1jN1uZ8xmMzNjxgzm+PHjKakrwzBMbW0tc/311zOr
Vq1KyffX6XQyN954I2O329nvEl3PlJkB1NbW4oYbbkBWVhbUajVmzZqFXbt2CS1W1Ni8eTOefPJJ5OfnAwCOHTuGIUOGoLCwEHK5HJWVlSmhb15eHlavXg2lUgmFQoHi4mKcPXs2JXW97rrr8Pe//x1yuRytra1wOBwwGAwpqWtHRwfWr1+PZcuWAUjN9/f0aVeRnvvuuw9z5szB66+/nvB6powBaGlpQV5eHvs5Pz8fzc3NAkoUXZ5++mlce+217OdU1Xf48OEYN24cAODs2bN4//33IZFIUlJXAFAoFNiwYQMqKipQWlqass/1V7/6FVasWAGdTgcgNd9fg8GA0tJSvPjii/jrX/+Kt99+GxcuXEhoPVPGADidTkgkl9OeMgzj9TnVSHV9T548ifvuuw8rV65EYWFhSuu6fPlyHDhwAE1NTTh79mzK6bplyxYUFBSgtLSU/S4V39/x48fjueeeQ0ZGBrKzs3HnnXdiw4YNCa1n0tUD4GPAgAE4dOgQ+1mv17PuklRkwIAB0Ov17OdU0vfw4cNYvnw51qxZg4qKCnz++ecpqWt9fT1sNhtGjRqF9PR0lJeXY9euXZDJZOwxqaDrzp07odfrUVVVhc7OTnR3d+P7779POT0PHToEu93OGjqGYTBo0KCEfndTZgYwefJkHDhwAG1tbTCbzdi9ezemTp0qtFgxY+zYsThz5gzOnTsHh8OB9957LyX0bWpqwiOPPILf/e53qKioAJC6ujY2NuKJJ56AzWaDzWbDRx99hJqampTT9dVXX8V7772H7du3Y/ny5bj55pvx8ssvp5yeXV1deO6552C1WmE0GrFt2zY89thjCa1nyswA+vfvjxUrVuDuu++G3W7HnXfeiTFjxggtVsxQqVRYt24dfvzjH8NqtWLatGm49dZbhRYrYl555RVYrVasW7eO/a6mpiYldZ02bRqOHTuGuXPnQiaToby8HBUVFcjOzk45XX1Jxfd3xowZOHr0KObOnQun04nFixdj/PjxCa0nVQQjCIIQKSnjAiIIgiDCgwwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQIiS++67D21tbREfc/DgQdx+++1B2xsxYgTntT766COsXbsWgCuV8K5du9DY2Ijx48cHvSZBRErKBIIRRDjs378/KsdEyi233IJbbrkl5u0QBBc0AyBExy9+8QsAwD333IPPP/8cS5YsQWVlJebMmYN33nnH75impiZ8/PHHqKmpQXV1NaZPn44//OEPYbf7hz/8AfPmzUNVVRU+/vhjAMDWrVvx4IMPRkUvgggXmgEQouOZZ57B1q1b8be//Q0LFizAypUrUV5ejubmZsyfPx9DhgzxOqZfv35YuXIl1q1bh6FDh6K5uRkzZszA3XffHVa7gwcPxlNPPYW6ujosWbIE77//fow0JIjQIANAiJb6+npYrVaUl5cDcOWTKi8vx759+7x88BKJBC+99BI++eQTvPfee6ivrwfDMDCbzWG1t2jRIgBASUkJiouL8dVXX0VPGYLoA+QCIkSLRCLxy83OMAx6enq8vuvu7sa8efPw7bff4qqrrsLKlSshl8sRbhotqfTyn5vT6YRcTuMvQljIABCiRCaTYdCgQZDL5di9ezcAoLm5GR988AEmT57MHtPT04Nz587BaDTipz/9KW6++WYcPHgQNpsNTqczrDa3bdsGAPj222/R0NCAsWPHRlcpgggTGoIQouTWW2/Fvffei40bN2Lt2rX44x//CIfDgUceeQQ33HADe8ySJUvw/PPPY/r06bjtttugVCpRUlKCYcOG4dy5c1AqlSG3ef78ecydOxcSiQS///3vkZWVFSPtCCI0KB00QRCESKEZAEFEgZdffhk7duzg/O3+++/HnDlz4iwRQQSHZgAEQRAihRaBCYIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRMr/B0x5tiafPYALAAAAAElFTkSuQmCC", 
"text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.scatterplot(data=modin_tips, x=\"total_bill\", y=\"tip\")\n", "sns.rugplot(data=modin_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAA4f0lEQVR4nO2de3wU5fX/P3tPdjebkBsESEADAVTuogaUi5KgxBCIAoGKWqWKl9JiLVDqr/bnCyva/kqlSu23+rWtV6AFEYsIilokFAUF1IKBcAmRkCy5bXazt+zO74/NDnuZ2Uv2Mrs75/16+ZLdnZnnnJnJc57nPM85R8IwDAOCIAhCdEiFFoAgCIIQBjIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQhEiRCy1AuLS3m+B0Bg5dyMnRorXVGCeJhEMsegLi0VUsegLi0VVoPaVSCfr103D+lnQGwOlkghoA93FiQCx6AuLRVSx6AuLRNVH1JBcQQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYiUmBoAo9GI22+/HY2NjQCA2tpaVFZWory8HOvXr49l0wRBEEQQYmYAjh49ikWLFuHs2bMAAIvFgjVr1mDjxo3YuXMnvvnmG3z66aexap4gCCLxkAAGsx0NehMMlh5AIqw4MTMAmzdvxpNPPon8/HwAwLFjxzBkyBAUFhZCLpejsrISu3btilXzBEEQiYUEON7QiVUba/HrVw5i1Yv7cbyhU1AjEDMD8PTTT+Paa69lP7e0tCAvL4/9nJ+fj+bm5lg1TxAEkVAYuu3YsPkIrHYHAMBqd2DD5iMwdNsFkylu9QCcTickksumjmEYr8+hsvJPtWhpN3P+tqh8BBbPGgkAyMvL6JugSYZY9ATEo6tY9ATEo2teXgYuntKznb8bq92BbrsDxUXZgsgVNwMwYMAA6PV69rNer2fdQ+Hw3EOTAxZX0Ou7kJeXAb2+q09yJhNi0RMQj65i0RMQj65uPdUqOVQKmZcRUClkUCtkMb0PUqkEOTla7t9i1qoPY8eOxZkzZ3Du3Dk4HA689957mDp1aryaJwiCEBRduhzLF4yDSiED4Or8ly8YB51aIZhMcZsBqFQqrFu3Dj/+8Y9htVoxbdo03HrrrfFqniAIQlgYYFRRJp59eDI6TDZkaZSuzl/AapExNwB79+5l/11aWop333031k0SBEEkJgygS1dAl65gPwsJRQITBEGIFDIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQhEghA0AQBCFSyAAQBEGIFDIABEEQIoUMAEEQRDgkWFWvSIhbMjiCIIikp7eql7uwizuj56iiTMHz+vQFmgEQBEGESCJW9YqEpJsBBKoINmfKUMy96co4S0QQhFjoMNo4q3p1mGyXM3wmEUlnAIJVBCMIgogVWRkqzqpeWRqlgFL1HXIBEQRBhEgiVvW
KhKSbARAEQQhGAlb1igQyAARBEOGQYFW9IoFcQARBECKFDABBEIRIIQNAEAQhUsgAEARBiBQyAARBECKFDABBEIRIIQNAEAQhUsgAEARBiBQyAARBECKFDABBEIRIEcQAbN++HRUVFaioqMCzzz4rhAgEQRCiJ+4GwGw24+mnn8Zrr72G7du349ChQ6itrY23GARBEKIn7gbA4XDA6XTCbDajp6cHPT09UKlU8RaDIAhC9MQ9G6hWq8VPfvIT3HbbbUhPT8ekSZMwYcKEeItBEAQheiQMw8Q1memJEyewevVqvPLKK8jIyMDjjz+OMWPGYOnSpfEUgyAIQvTEfQbw2WefobS0FDk5OQCA6upqvPnmmyEbgNZWY9CSkHl5GdDruyKWNdERi56AeHQVi56AeHQVWk+pVIKcHC33b3GWBSNHjkRtbS26u7vBMAz27t2L0aNHx1sMgiAI0RP3GcCNN96I//73v6iuroZCocDo0aPxwAMPxFsMgiAI0SNIScgHHniAOn2CEAsSwNBtR4fRhqwMFXTp8qQuo5hKUE1ggiBihwQ43tCJDZuPwGp3QKWQYfmCcRhVlCm0ZAQoFQRBEDHE0G1nO38AsNod2LD5CAzddoElI4AknAGs/FMtWtrNnL/NmTIUc2+6Ms4SEQTBR4fRxnb+bqx2BzpMNoEkIjxJOgPw3EOTg24DJQgiMcjKUEGlkHkZAZVChiyNUkCpCDfkAiIIImbo0uVYvmAcVAoZALBrADq1QmDJCCAJZwAEQSQRDDCqKBPPPjwZHSYbsjRKV+dPk/iEgAwAQRCxhQF06Qro0hXsZyIxIBcQQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQYgFCWAw29GgN8Fg6QEkQguUpKTQfaRIYIIQA4Hy8lNkbuik2H2kGQBBiADKyx8dUu0+Jt0MIFA9gJLCTKz+wcQ4S0QQiU+gvPxsjh4iKKl2H5POAFA9AKJPCFWXNkHq4VJe/uiQavcx6QwAQYSNUH7bBPIXu/Py+8pCqZnDI9Xuo4RhmKQSu7XVGHQGkJeXAb2+K04SCYdY9AQi09VgtmPVxlq/UduzD0+O6bS9L+3G9Jm6ZyMJkpc/ad/fMO+j0HpKpRLk5Gg5f0u6GQCtARDhIpTfNuH8xZSXPzqk0H1MOgNAawBEuAjlt001fzGRetA2UCLlEaouLdXDJRKdpJsBEETYCFWXlurhEgkOGQBCHAjlt00hfzGRepALiCAIQqQIYgD27t2L6upq3HbbbVi7dq0QIhBEapBCicmI+BN3F9D58+fx5JNPYsuWLcjJycE999yDTz/9FNOmTYu3KASR3CRQoBmRnMR9BrBnzx7Mnj0bAwYMgEKhwPr16zF27Nh4i0EQwhGlUXuqJSYj4k/cZwDnzp2DQqHAsmXL0NTUhOnTp+OnP/1pyOcHCgRbVD4Ci2eNBOCKvhMDYtETiL2uTieDpksmtBnMyNaloyBXA6k0uj4Vp5PBga+bsP6tL9lR+4pFE1A6uoBtK1Q9L57ScwaaddsdKC7KjqrcsUIs72+i6hl3A+BwOHDo0CG89tprUKvVeOihh7Bt2zZUV1eHdH6wQDC9vkvw0Ot4IRY9gTjoGid3isFsZzt/wNVhr3/rSwzo50oPEY6eapWcM9BMrZAlxXshlvdXaD0TKhVEbm4uSktLkZ3tGqHMnDkTx44dC9kABJoBzJkyFHNvujJqshLigc+dEu18QdFMD5FqicmI+BN3AzBjxgysWrUKBoMBGo0G+/btwy233BJvMQjCi3jl7YlqeggKNCMiJO4GYOzYsVi6dCkWL14Mu92OKVOm4I477gj5fMoFRPSZALn545W3J+qjdgo0IyKA0kEnMWLRE4iCrsF8/PHcUhkgnXBeXgb0l7oSoohMrBHL+yu0ngm1BkAQQhDUxx9Pd0qAUbvTydDefiJuJJ0BoEVgoi+E5ONPAHd
K0yVTXBajCQKgXECESHD7+D1JxNz8bQYzr6EiiGiTdDMAWgQm+kKybJnM1qVTERkibiSdASCIPpEkWyYLcjVJYaiI1IAMACEeEsDHHwypVJIUhopIDcgAEESiEQ9DJQGMlh4YzD2wWHuQm5mWsttNCX5CNgCdnZ2QyWTQarn3kxJhECAgiUhQYvnMPK5tYyRQShHb90EC1F/oQqPeiLf31NF2UxET1ACcPn0aP//5z3H8+HFIJBKMHz8ezz33HAYOHBgP+VIPyuGefMTymXFc+9H5Y3H10CzAGQ3h/TF021F/wYBtn5yi7aYiJ6gB+MUvfoH58+fjjjvuAMMw2LRpE375y1/i1VdfjYd8fgSKAygpzMTqH0yMs0ThEa+kYylLKCPxaI3We69jtDoCP7MI2uN6H17YchRr7p2Ewlz15esEayMMGTqMNjgZhnO76aUuq7BrDuHcSyFm0ik2ew9qAMxmM2pqatjPS5YswebNm2MqVF+51GkRWoSgxCvpWEoSykg8WqN1j+tUTSvmf2ZqRUTt8b0Px8+2IVOtYA1MNNNYZGWoIJVIOLebnjrfCavVIcyMNBw9hJhJp+DsPWgg2JVXXokvv/yS/VxXV4fBgwfHVKi+cuPoAqFFCEqyBCQlIqFUwIpWlSzf6/A9s0jb43sfnE6wwV/B2ghXBl26HMUDdagpK2HbVilkWDxrBD784pxgVcXC0UOIamipWIEtqAG4cOEClixZgurqaixYsADV1dU4ceIEKisrUVlZGQ8ZQ+bd/WeFFiEo7oAkzz88dp83EZBAs6dwjgm3rb2HGrDQp7N0P7NI29Oly/Ho/LFe115YVoJ9RxrZQUGwNsKWgQGKB2Zg4og8/PKHk/Dj+eOwYOZw7PjsNC51WASLPA5Hj2g951jJlywEdQE9/vjj8ZAjZJI+EjhJApISkVBSNkcrrbPndS51WLCz9gzmTR+GYYMzkatTsc8s4vYY4OqhWVhz7yQcP9sGpxPYc/Ac7rp1VMht9EkGBtCq5HAywNOvfpEQkcfh6BGv9N1CtxlreGcA9fX1AACNRsP533XXXYfrrrsuboKmFL37vItyNWwmSiI4ocyeojXD8r1Ol8mOwnwtrhyg9XpmUWnPCRTmqnHDVf0xbkQuVt010cuvHKyNSGQQbEYqAb5vMaJBb4LB0gNIwpNFCLlTcfbOWw/gwQcfxJ///GdcddVVKCgogOdhZrMZBw4ciJuQnlA9gMuIRU/AQ9cAufRZQjkmFKRAq8GGVoMFObo05OiU3Fszo9UeAjzTYG1EIkMU5Q+1Pd7FVIQhS7zl7mObQv+dBqoHwGsAOjo6ALh2/bz22mtgGAYSiQR2ux133XUXPvjgg5gJHAgyAJcRi56AALoKtOND8Gcah22OBrMdqzbW+rlSUnUrtNDPtE8FYX72s59h//79kEgkKC0tZb+XyWSYNWtW9KUMEaoHQMQDUcZrxMno0VboxIHXALzyyisAXIFgzzzzTNwEIohEQIydVLyMXioupiYrQbeBUudPiJGEjdeQuFwonoun0SJe2xxTcTE1WUm6bKBJvw2USAoSsoBMjF00cRuZ926Ffv6x6bjYaqSt0AKSdAaAIOJCAsZrxNpFE1ejxwCD8rVQShj2MxF/ks4A0CJwgiJ0kqxot++x3U+TpoDRbAckEkGTf8V8XSIBjR4RW5LOAJALKAEROklWtNvnuN7CshI2Qleo5F9xcdEkQdU0InoEXQQmiGDELEmWx4Ln9y1G3gXPaLfPdb1Ne+pw07jBsUn+FaKetHhKRJukmwEEcgGlKWXY+Ni0OEtExMQ1EcaoPtrt810PkhhsBQ1n9kIuGiLKJJ0BIBdQ4hGWayJEX33ABU+1wusa2ZlpUXWN8OkDJvoul7AXdslFQ0QRQQ3As88+i/b2dqxbty7kc5K9IlgqEvLukSiM6i8ZrPi+tRvnm4348Itz6DLZsXzBODy+eAJ+9+aXUdm9wqWPew0g2rtixBhwRiQOghm
AAwcOYNu2bZg+fXpY54l6BuAzes6J9X0IdWeNh2vCaLFDpZSj02hDU7sFGWoFtCrX6NlztJublYabJxbhUmc3WgxpsFh7kKW93AbXKLwgRw2b3YlzF7sglUhQPX0Ytn5yih0xB3SNhLNLyMfVkqFWwmp3oKh/BnIz07jP8Uwcl5mGnAyexHE+pERUrNA7wIg+I4gB6OjowPr167Fs2TKcOHEirHNFOwPgGD2vWDQBJYMyYvPHFu7OGgbQqRX4/lI3Nmz+gj2npqwEg/O0KB6YwY52c7PSMHvyFdhz8BzKrh+CJ//nP35t+I7CC3LUuGPGcDy/6Suva1dMvgJ/23kclwxWyKUS3nq5Ye8ScrtaQin5KAWO1rfhpa1fs8csqx6NscXZQY1AQgachYPQO8CIiBBkF9CvfvUrrFixAjqdLqrXHVnUL6rXSyS4fMXr3/oyZuXo+rKzhuuct/fUof6CAYZuOzvavXliEburZtOeOu42PEbhv156PZYvHI+/bP/G79r52WpXLdvGTvz6lYNY9eJ+HG/o9NpJE8kuoVDObTXY2M7ffcxLW79GqyGEFAo+ej7/2PSk6jxTsUyimIj7DGDLli0oKChAaWkptm7dGtVrv7v/LH5UPRaAKwVrKnHxlJ7TV9xtd6C4KDsh2uM7x8kw6LY7cPUVuVixaALONnV67aoJ1EZe7/eH/nuR81irzYGashL8q/YM+92GzUfw/GPTMShf22ddwrkPJy9e4Dym3WjFyCtyAl7fTV7wQxKSSN/LVPs75SNR9Yy7Adi5cyf0ej2qqqrQ2dmJ7u5u/OY3v8GaNWsivvacKUOh13cJnn87FqhVck5fsVohi4mufWmP7xypRAK1QobWViNKBmUgR6fCp182YuiAjJDbGJCj4Ty2IFeDN3efwKUOC/u91e7AxVYjm2YgbF08fNoatQIFOWo0tXbznpvN48fvp1WF/WyS7d2N5L1MNl37itB69qkgTDzYunUrPv/887B2AYm2IEyirwFIAKOlB2cudmHjP45xrgGw58mAIyfbsOWjOpRdP4R1AwVqIydHi8+OfO8nT2F/LX7+x88CFxcJRxeOY5dVj8bmD+vQ1Nod9TUAX5Lu3Y1gDSDpdO0jQutJBiBV8ClHd8XgfmhtNcatPd6FSY9OIEOjwOzJV2BgnhbadAV06XJo07wXZVuNNjzx5wNeu4GkUmBcSR7ydSrONvLyMqC/1OUvD0LsgELUha9a1doHS2Gy2PnPDbV8ZBDi8u7GMG9SOMFpKft36oPQevapIlg8qK6uRnV1dVjnBNoFlK1T4XcPT4mGaImJTxCQVBrFZPAhtMf3R+25EGjtcODvO4+zo3Ctyr9zaTVY2A72UocFmz+qAwAMLdAhP0MVtjwhRcfy6eLTGbZ2WTl92iaLHUW5Gv774ARytErkaJXs54QkFrt2KDgtaUm6SOBA3Di6QGgRREm4wUw5PJG7OTqePfbB6GsHxNEZPrZ4QvLvyw+AKEtdEryklAEg+kiELgG+oC1NmgIX2rqRnqaA2eaAxdqD3Mw05GQqserua1HX0A4nA+z7qhELZpZcdptwycMh7yWDFekqOXRaBRx2p/fxkuCBWVyBaY0tXVi5ZCJe3v6Nl88/afblB6HDJPLIYwpa8yKlDMCJhnahRUg+ouAS4AraWjCzBL9/60tU3nQlrDbXnn3P3zwXTB+ZPxbXXJEFOPjlycnW8sr7o6pr8M+PT3p12JAw2LDpaMBFWd/ANM/F6PvnXA2TxY7hhf0wsF9aynQSmjQF5wxHkyaOzp+C1rxJqXTQqRwI1idCqB8blUAej2CmtQ/egJ/UjEdzWzfuum0UDCY72/kDwE3jBvsFTb245SgMRntAeZoumXh//8v2b3DTuMFex59vNgUNzPINTPM8/pV3v4XV5mTTWPTl3iYiVlsPFpaVeKWUXlhWAqu9R2DJYg8FrfmTUjOAd/efpYpgbkIc7UQtGRlHKoiashI4GZ9gL57gL3d
7fPK0dZkxQJfG68Lw7IDdwWe+x7QaLK5F2l43gNFsx6Pzx+J7vZHzmoX9tdyunyQeSWrVSuw5eA5VU4td94wB9hw8hwnDUzSFigeUeM+fpDMAok4GFwahLvZFMxmZb5tOBpBKJJzX52uPT57sjHRAwkDCcz3PjtcdfOYJu8js03kX5Kjx6Pxx2PZJvd81B+Wow09VneAdiS5djrtuHZW8uYciICUS70WZpDMAot4GGgahjnZCSkYW4sKZ0Wy/PLIE8NV3zZgxsRA1ZSWsG2jfV4340dxr8Jd3vmHbe3T+WOg0CsDJL0//bDVOnW/H2aZO3D/napeLpvf3B+aNxj96t5J6rgG4/9jdawAqpQxGS49X593U2o0XthzBo/PH4oUtR0PqFJN6JCniojJJn3gvBiSdAQgEb6peERLyaCdYhxDI3YHLhiE7Mw3tJhu2/7uePW5hWQn2fdWI++ZcgycKr4O+3Yz2Lgv+8/UFNieQ0wm8+cEJr1q7vqmYbT0O1H59ga0BoJBJsWLRBDS2dMFmd6J/dhpW3TXRW34JsHZZKZrbzGhsMeKND06gy2THQ3eMQYZGAWvH5fvS1NqNfhnKkDtFwUeSke5kEeu+fREbPz5SygDQIvBlwhrtBOgQ+Nwdv310Cs63mNjfaspGYNsnp7yO27SnDmvunYQcrRINvccCwIJbSrD+rS+9OlAvF0rveoK+w4Jjpy6xswe3UdlZewbr3/oSVVOLsf3f9Zh8TX9/+RlAIZXgjx6yA8Cf/nkM86YPw9t7vmO/Uylk0KYpQu4Uue7tsuoxcACubRV9DQKTAN+3GHHxkom/Yw/RINMWRx7Eavx4SCkD8K8D52gR2E1fRzs+o0uj2c7p7jCYe3z8/QzncRab61h32cYMjQL9s9VBXSiGbjvqLxg4jUrV1GJs/qgOUin65Kop7K/1cg+F7QZw39tHJqNRb0JjiwlvfHAcXSZ7n3MAhbqwHKpBTqaFaUI4UsoAKOQptas1csId7fR2Qq/vOo6bxg2GVAqMHJKNUUOyoO+0oGLyFcjPVoNhGNh7HFgwcziydWnI1KqgkEmguf0q7PjsNJuZU6WQocfhxNFTegwb3A+PLR4PQIKLrSYvF0puVhpmThqCHgcDg6UHOrUc3TYHBuVpUDWtGHsPNQAAbp5YBEiAoQUZGDUkCxNG5MNg8q885obPVTMoRx25G4AB7D0M/rj5qNf1X9r6NdY+WOq128g3oI1rlB7qwjKfUfM1yHFdmPbVUy2HwRTiTIQCswQlpQwAERmGbjte33XcL0Pnw3eOgcPhRKfRhtffP47Km66ExebA5g9PssfUlJUgTSnDHdOH4Z+fnEKXyY6lVdfgvX31mHR1AZ565SB77OJZI3BPxSj87V/HkaFRoGLyFV5uHt/sm/dUjIK9x4k3P/iOPebB6tFY/9aX7DFcWUd1ajmWVY/2y9J5ecE5MjeAZ04jN+x20wwl54heKZf61S4eVZQZ8sIyn1GzWHuEWZjm2FXlG+jHNxNxOpmk3U6bKqTUkFmdRvYsEjqMNs4qXRv/cQxpSgXe7q3iZTDZ2V087mPe3lOHTpMdnb1F2pcvHIetH5/E6GH5ftd784PvYLb2oGpqMX54+9VegWLuoC3PwC6Dyc52/u7v/uxzjLvymNHSwwZotRps2P2fs6iaWowFM0tcrqMP62AweQT+RBDQ5c5p5Il7uynfiL7+goEzEMndsftey3dh2b3+4BnItXzBOOTyyBLrhWlfPbkC/fiCrZoumTjvkZgDs+JNSvWYbQar0CIkNVkZKkil3IFaZlsPG3DF5+93B1/Zepw439zlKqLCE/hltTux+aM61JSVBA3sksskIQV/qZRSnG7qwp/+ebkegXvR2LNgDDsqjjCgKydDyTnDyNEp0eARicx1jzy/6zDZUJSnCW3Rnm9tBxBki6PfzCVIoJ8nbQZz8m6nTRGSzgBQIFjs0KXLMWpoNqeLIV0pZ0eYfMFd7uCr3AwVFL3
Xcf/GFbilUshwxcDMgIFduVlpKOzPXTnMN/irfz8NWzQe8F80dh/nHhVHHNDlBMYWZ2Ptg6V+dQD4XDVcAWpZGiXbsT//2HRcbDUGXpuIJC12lOHTM5Qtstm6dArMEpikcwGt/FMt7lu3l/O/dW8cFlq85EMCGK09uNBhwemLRmRqlXhs0Xg/F4Na5fKz7/uqETqNAotnjfQ6pqasBJkaBYoH6qBTK1CYp8aj88fi3IUOVxCWx7FLq65BepoM86YPg9Fs87vWsurR2HekEQAwc9IQ/PW9b/3y1zx05xj2GHf7Ep7Rp7T3LfcaFSNwQFfI9NYBKBmocy389u7+4XPVFA/U+X3nlgcMMChfi6JcDbsdNix6DUOfz+8Dvnru+6rR73l76ehBQa6G8x5xHUvEBkErgvWF+9fu5o0EnjNlKObedKXgFXjiRcR6SoD6C11o1Bu9FmGXLxiHwnwN2rqsXi4Gl3+9Bw6HA9p0Jax2B0yWHqQpZZDLpdAoZd7Vv2TA0VNt2PxhHburqKSoH/Kz03Cp3QJNmgLdVjuUSjnkUglM5t6KWxoFDCZXumeHk8Ezf/uCTdfszl9TUpSF3EwVLnVakaaSQ5cuh5MBVr24329EyVfNy2Dp4Tw+ajtnuCplIXD1rKR8d3317H1+wWYivFXekqpHCo7QzzRhS0L2BVGXhPQhJ0eLM43tfd5CZzDb8Z//tnjttQei1wl6ln70vPb/feAG3tKPfjLydNLzpg/DDVfle8vYhzrGibYLRSzvLiAeXYXWM2FLQvaFQLmAZFIJ/rJyRpwlEggJcODrJjaiti+dV4fRxrvAGo2FOL5tkhf0JrR2WEKSVZcux0N3jOFc2B05JMtbxj4EvxXma/B/7r8eFlsPcjNUsRmB0l53IkFJOgNAi8AuDN12r3QKXguYagV/hyMF2o12mCx22BxODC/K4lyIS1PJcbrZiHSVnDPIyg+fKl0ZagXy+nEv8qmUMmzYdMTlmjHbA3eKDHDlQB3mTR/m2kHDADtrz6DLZOdeLAw1+I1n9K/rTakQNRJwliEKyOiGRNIZAMoG6oJvAdNosffm5OfocCTA8YYOtLSZWZ//qCFZeGDeaPzPtstbGZdWXYNzTZ3409ZveIOsvODo5NznLF841qsy19Kqa/DOpy6X0+HvWvD2nrqgnaJWJcPQAh3Wv/UlMjQKzJw0BIX9tYBEwq4JhEu8Ujonc+ropIWMbsgk3S6gQIgpGyhf4JBKIecNrmk12HC+2eQVeDV6WD7+8VGdV7DU1o9PQqtWsee7g6z4AnS4Ojn3Of2z1Vhz7yQ2OGz3wbM4eb4TKoUMzt4dM0EDgBigdHQBfvvoFNx16yhs++QUfvv6Yax6cT+ON3T2qRpXVHYAJVA7xGWo8lfopJQBqDvfKbQIcUOXLseKRRP8ttCZLNzJ2zpMrgLpfkFcElc65M0f1WHzh3XY/JErBYO7BKP7fCfD8HZafJ2ck2HQ2mnFH7ccQYfRig2bjrCd//1zrsbeww1+MvIhlUrgdDLsWoD7nL7+YYcaeRsp8WqHuAwZ3dBJOhdQIEoKo+y/TWR6R8UD+nkveBrMPazf3b11Uip1FQPXpCtwtqkrpMAdm93p9VkqkfB2WoGCntRpcnSZ7NhZe4YtFiOVSGC29uBSh8VPxkAunWgWYolXcRAqQhJ/BK/XkETQNtAkhlNPj4yevkndHl88AXanE82t3dh98BxuGjcYmnQZMjPS8Kd/XN5l45uMjXcNoHehzWzrgb7Tihc9KmrVlJUgNzMNQwfq0HDR6NUBLp41Agq5FDv2nfaTkc9Xm5eXgfrz7Xj2tUOuHEC9bp99XzVi1V0T++ZP59qnH4u/hjDaEcu7C8RQ1wRbAxD6maZUHAAFgl2GV08J0NrlvQffnXJ5eGEmMjRKtHaa8Ye3LmdwXFY9BgzD+AViuYOsAFeuJc+0xp6GJk0pg8XmQKZGhTSVDC1tLiOz6q6J0KkVuNDuKu4CBqzr54e
3X40Nm474jdS4Fkjz8jKgb+3C0fo2v9w7AfPvJ9luELG8u0CMdY2XcQ8BoZ9pSsUBBOKzr5uoIAwAMIDJo5BLblYaZk++wmukXVNWwpZGbGrtxm/++sXljtfpsZWSZzRVmK/Bhs1HUDW12JVvZ5or06YvbveMNk2O7Z96F14/39wVlkvHYLL7ZZp8aevX/DtqEmwkSMQRqvwVEim1CNxt6RFahITBc/Hx5olFfimZ395T50qt0AvfIhnfjgqjxYEMjcIr+2OgxU6u3DijPBLGcZ3jS7iLewm1GySCtNMEESsEmQG88MILeP/99wEA06ZNw8qVK6Ny3aSrBxDIPRHMdSFxjaC/b+mCSimDWiWHxdYDbboSOrUcYBgsXzgO55uNIaVTdgd/ndEboU1TwGSxQ5OmgM3egzX3TkKbwQqVUoptn5xCe5cVHV0W/GDWKGTrlFi7rBQ2mwNX/3ASWjssSFPJoG83Y2CO5vLUuzdK97ePToHB3AOLtQeZGiUeXzzBr0CKV86cXv3teiMUCllYi3tGs51deAaAvYcacKnD4pUO2u0mcOclkstkoQW+hfksQ5qJuGsCt5qgSVPAauuBVq1MeLcVkbzEvcesra3FZ599hm3btkEikWDp0qXYs2cPysrKQjo/ZSKBgxT3Dthh8AReqZQy7Nh3GvNnlmCLxyLu6nuuDZia2H3+me872WpeC8tK8MW3Tbh50hC8vP0btp1H7hwDg8mG5zd5LuqORJpSiv/d8V8vebjgqlv720en+CWe49Kv9usLqCkr8Utcx+nflQDtRhu2/7veK4XEnoPnXAaD4x66fy+/fkjgwLdwnmWo5R4DyHPXraPIbUXEhLgvAp88eRImkwnjxo0DADz11FMYMmQI7rnnnpDOD7QIXFKYidU/mCj4oksoGMx2rNpYy7kAKpVKUPtNM1s8ZO+hBnSZ7GyHwXfuvOnD4HQy2P7veq8c+K4yfSPw0tbLO31+VHUNTBY7TJYedmG2y2Rnz1MpZFi+cJzfIm1N2QjO5HHzpg/D23u+8/vOM2FbIJ09ffh8x1VNLcbeww2YOWkIhg3ORK6OP3cP3zXW3DsJhblqGLr529j+73ruZHN9eJa6dAUa9Cb8+pWDfuf9eun1KMrVBNV5+7/rUzJyOBn+TqOB0Hom1CLw8OHD2X+fPXsW77//Pt56662oXLvufCfy8jIAgP1/onLxlJ7TLWN1OnGxpZvtZD2Tn3XbHSguyuY918kwl33yHu6dptZuKGQSVE0tRv8cNZpbu9FltuFv/zruL5jk8vUsNodfO8Gqgfl+55Y5kM6exwQ6DhLgUocFb+/5Dr95eLLXOb7wXUMulyIvNyNgG1yyByKYXjaGu4DOgBwt8vK0QXXmukepQqL/nUaLRNVTMKf5yZMn8eCDD2LlypUYOnRoVK6ZrVNBr+8S3OKGglol53bLMBJs/Id3tOumPXWYN30Y1AoZ9Pou/nMlrmhZlcK/WpZUKnHNDKYVs//nuob7PJVChjSlv889WDUw3+/cMgfS2fOYQMd5yuZ7Tqj3V6MMfA/dlcp8ZQ9EML2UUu5yjUopE/TeuOUJVZZkIhn+TqOB0HoGmgEIsgvo8OHDuPfee/Gzn/0M8+bNi9p1bxxdELVrxRq+ilF8qRwK+2td7g4JIJUAD90xxuvcmrIS6DQK7DvSiKVV1/hVy9r26SksnjUC+75qxMLeyl6+VbZqykqw93ADO+t4b189llZd43XM4HwNHpjr/d3iWSPRL0Ppdy13dTBWZ7Uca+6dhJqyEiyYWYKCHDVnBSiue+MpWyhVo/jur/s8rt8XlpVg35FGTtkjacszTfWvl16PZx+e7OfTDyQPVckiYkXc1wCampowb948rF+/HqWlpWGfn1KRwBzBKgZzgCpVagW7UJihUWD25CtQkKtBmlIGhVwKm92JuoYOHD5xERNHDkD/bDW6La5dKRs2He0twO6ESilFfj81urqtGJibgW6LHTqtCjIZ0GW
yQ5uuQLelB+o0ORjGCYlEik6jDdp0BV5//7/Qd1rYjJyD8tSw9zDoNFqhVSthMNmQrpIjU63wrg7Gscj56PyxuHpoFncQl8+9SU9XoulSkFq5Idxfrkhmz11AMpkMunS5t+x9fJZhL9pKAJtTgoutRtcuIHsPtGmKlE0bkTR/pxEitJ4JFQm8du1a/POf/0RR0eU96DU1NVi0aFFI56fKIjAvAXaUBFq43PxRHX5+10TkZ6Xj8HctcDovR9zOnnwFCvO1rjz8m4/gUoeFPT83Kw133TrKq+AKb7BUbydntNihUshdZRa1oUXXhroAzEdSP9MwEIuegHh0FVrPhFoEfuKJJ/DEE0/E5NqXOi3BD0p0AlS14guESk+ToaZsBKw2Byx2Bz78ooFNtMYVAfyv2jOsEZg5aQg27fnOa7/867uOc+fXYQCdWtFbb+CLsKJro5nIjSCI6JBkkVMigSeMnSvLYUGOGtp0Jd7e/Y1fJ88XAezesqlSyFA8SAelwjsh28KyEhgtdu50DH0scEIZGgki8SADkGhwlFZ0R6Xq1HI8Mn+sV9bNpXOuwXOvH0aGRoGqicW92wadWDZ3NCw2B2ckbPHgTKy++1rk6FwFdH73hndpyU176rD2Qe71mUsGq1eOoZsnFgESwGjpCeirDpoWOZmStiWTrAQRgJQyAEmXC6i343SnRsjNTIO+3eyVGoFNxTwoA8fPdaLdYL7cqTPA+ZYudkHYcxT/o7nXQCmXckbC1jd2sjOAn9aM53TNmCx25Gj9R+fpvdsVfdvc/mkQV1Cggu3JlLQtmWQliCAkXTrolNkFJAHqL3ShUW/0Sm2weNZI7PisnvXRuyNqJ47IwxN/PuDax++RVXPlXRPR0GwMOTp3xaIJePndr9nr80X28rl0jDYHDp9ogdXuDOu8QPehtcvmWrhmLs9SfK+VKM800sXsYCSKnvFALLoKrWdCLQJHSqCi8ADwv6tvjqM0fcfQbUf9BYNXJ2q1O/DmBye80ji4o1JbDRZY7Q589V0z7p9zNV5591tY7Q7oO7rRP1sdcnRuY0uX1y6gD784h4fuGOO3C4jPnaNVyTA4T4suM3/pyZA7Qp78Nzt7F6kTcYGYFrOJVCLpDEAgkqkkZIfRxptWwS9Lp1KGHF0aCnLUKB09ENs+OYWashL0z1ZDJpcCjGsxuKm12+s8rujcHod3r95lsuPKggxu1wwXDFA8MAOtXbaIF3W5FpQ37alj898k4gIxLWYTqURK1QMYWdRPaBFCJitDxaZV8IQrS2f/7HTkZCqxrHoMNu2pg93hBAPg+U1H8Mxfv8Dv3/wSd8wYjoIcNXve4lkjkKlR+ETs+n+3fME4aNPk0KUrUJSrcY1igzkFGSAnQxk4+jUE+EbT0t7UCYkY/Ro06pcgkoiUmgEkE7p0OYoH6vzSGz+6YCzaOi1YMLMEYIB/1Z5hM4EyvTOGqonFfts7/7L9G6y5dxLau6wYkKNGT48DOrWSHdm7I0s9v4uoVF6gRd0Q4RtNTxyRj5wMZWIuqkZB75hBu5OIMEkpA5BUJSF7XSn9s9MxvKifaxeQTgWj2Y7/98aXfod3mGzI0vZW+fKowuXGanfg2zNt2PxhnVeaYQAevunLbgqdWgFDtx0NLaa+dxYRlt3j2xqasJ2/m0QsN0i7k4g+kHQGIGUKwgAAA2hVcmhVHo+BJ9tmlkbJdpjnW4wBM0cG9UcnSmeRyKPpJKOvAXqEuEk6AxBoF9CcKUOjMwOIdCodQjlHvt+DBUyNKspEYb4GA3LUXjt33Hv8A+3gcWO09OB8ixFV04oBuLZeRr2zCPUeJuJoOgmh3UlEX0g6AxBzIh0dBzs/2O/BRsW9s4YxV/TD849N98ocOWH4xOAjaAlwuqmLs+BM1DqLRJlhiAjanUT0BQoE8yHSQJ9g50czkKgvegYqJxlqCcS+thHJDEPoYJp40Wc9k9Do0jOND6IJBJNJJfjLyhkRXT/SqXSw84WeqvO
1zxaciUJnIbSOooTWU4g+kHQGINaLwJFOpYOdL/RUna/9QTnqqHUWQusoWmg9hQiTlAoEiwaRBvr0pRRhPAOJ4tG+0DoSBBEatAbARaTl/cIoRRjJVD0Sf3E02o9nG0L7UeOFWPQExKOr0Hqm1BpAXIh0Kh3sfKGn6vFoX2gdCYIICrmACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRIogBmDHjh2YPXs2ysvL8cYbbwghAkEQhOiJeyqI5uZmrF+/Hlu3boVSqURNTQ2uv/56DBs2LN6iEARBiJq4zwBqa2txww03ICsrC2q1GrNmzcKuXbviLQZBEIToibsBaGlpQV5eHvs5Pz8fzc3N8RaDIAhC9MTdBeR0OiGRSNjPDMN4fQ4GX1pTX/LyMsKWLRkRi56AeHQVi56AeHRNVD3jbgAGDBiAQ4cOsZ/1ej3y8/NDPj8u9QCSBLHoCYhHV7HoCYhHV6H1DFQPIO4uoMmTJ+PAgQNoa2uD2WzG7t27MXXq1HiLQRAEIXriPgPo378/VqxYgbvvvht2ux133nknxowZE28xCIIgRI8gFcEqKytRWVkpRNMEQRBELxQJTBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQBAEIVLIABAEQYgUQbaBJgvv7DuNuTddGbPz171xGKt/MBHr3jiMkUX9vI59Z99pfPZ1E24cXeB3jXVvHAYATBw1AB/85yzaDVZUThnKHu/ms6+b8LuHp+Dxjftx4+gCnGhox8iifgCA3V+cBwAU9dfiUqcFuZlpqDvfiTlThmLuTVfi8Y37kZuZBgAYWdQPJxra2X+7r/vw7z9F+aRC9lrlkwrxrwPnAADFg3TssTeOLsBnXzcBAPvvbksPyicVYsf+s36yu/V16+nW9fDxi6ysDc1GFPXXsjLNvelKr/v4zr7TAMDq7JbZ9xj3eb56rv7BRL9ruHHL5/7ds213O77vgSee8nk+23VvHGb1dL8Xq38w0e9aXOd7fn+iod3vPK5ruGX2lMn3euHgK6/vNbiu6Sl3sGOD6RPOb9H42w52n4OdH0n70YIMQADe3X82oocU7Py6853s/+vOd3od++7+s7zX8DyP63hf2gxW9nvPczw/txmsXu21Gazsd57neP7bYnN4tef5b7dOvt9z/dv3/259+dr1lct97zzvo68svnK5j3Gfx3Vv+K7hls9TXk99ffF9Jp7yeT5b32twXc8ts+/5vt8HwlNvdxue98PzmHDgun++77TvNT3lDnZsMH3C+S0af9uREGn70YJcQARBECKFDABBEIRISToXkFQaWuroUI8LRH6/9IiuE+x89+/5/dIBeMvs/s73e9/fguF5/Wie05frhtM+0Hc9+WTjOiaUY4PJ53tssOcV6jPnen/43plA75IvXHr7tteXd9/3nGCffWUJdmyo7YbyWzT+toG+90eRth8OgdqRMAwTOLcyQRAEkZKQC4ggCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRAoZAIIgCJFCBoAgCEKkpJQB2LFjB2bPno3y8nK88cYbQosTdYxGI26//XY0NjYCAGpra1FZWYny8nKsX79eYOmixwsvvICKigpUVFTgueeeA5C6uj7//POYPXs2Kioq8OqrrwJIXV0B4Nlnn8Xq1asBpKaeS5YsQUVFBaqqqlBVVYWjR48mtp5MinDx4kVmxowZTHt7O2MymZjKykrm5MmTQosVNY4cOcLcfvvtzNVXX82cP3+eMZvNzLRp05iGhgbGbrcz9913H/PJJ58ILWbE7N+/n1m4cCFjtVoZm83G3H333cyOHTtSUteDBw8yNTU1jN1uZ8xmMzNjxgzm+PHjKakrwzBMbW0tc/311zOrVq1KyffX6XQyN954I2O329nvEl3PlJkB1NbW4oYbbkBWVhbUajVmzZqFXbt
2CS1W1Ni8eTOefPJJ5OfnAwCOHTuGIUOGoLCwEHK5HJWVlSmhb15eHlavXg2lUgmFQoHi4mKcPXs2JXW97rrr8Pe//x1yuRytra1wOBwwGAwpqWtHRwfWr1+PZcuWAUjN9/f0aVeRnvvuuw9z5szB66+/nvB6powBaGlpQV5eHvs5Pz8fzc3NAkoUXZ5++mlce+217OdU1Xf48OEYN24cAODs2bN4//33IZFIUlJXAFAoFNiwYQMqKipQWlqass/1V7/6FVasWAGdTgcgNd9fg8GA0tJSvPjii/jrX/+Kt99+GxcuXEhoPVPGADidTkgkl9OeMgzj9TnVSHV9T548ifvuuw8rV65EYWFhSuu6fPlyHDhwAE1NTTh79mzK6bplyxYUFBSgtLSU/S4V39/x48fjueeeQ0ZGBrKzs3HnnXdiw4YNCa1n0tUD4GPAgAE4dOgQ+1mv17PuklRkwIAB0Ov17OdU0vfw4cNYvnw51qxZg4qKCnz++ecpqWt9fT1sNhtGjRqF9PR0lJeXY9euXZDJZOwxqaDrzp07odfrUVVVhc7OTnR3d+P7779POT0PHToEu93OGjqGYTBo0KCEfndTZgYwefJkHDhwAG1tbTCbzdi9ezemTp0qtFgxY+zYsThz5gzOnTsHh8OB9957LyX0bWpqwiOPPILf/e53qKioAJC6ujY2NuKJJ56AzWaDzWbDRx99hJqampTT9dVXX8V7772H7du3Y/ny5bj55pvx8ssvp5yeXV1deO6552C1WmE0GrFt2zY89thjCa1nyswA+vfvjxUrVuDuu++G3W7HnXfeiTFjxggtVsxQqVRYt24dfvzjH8NqtWLatGm49dZbhRYrYl555RVYrVasW7eO/a6mpiYldZ02bRqOHTuGuXPnQiaToby8HBUVFcjOzk45XX1Jxfd3xowZOHr0KObOnQun04nFixdj/PjxCa0nVQQjCIIQKSnjAiIIgiDCgwwAQRCESCEDQBAEIVLIABAEQYgUMgAEQRAihQwAQRCESCEDQIiS++67D21tbREfc/DgQdx+++1B2xsxYgTntT766COsXbsWgCuV8K5du9DY2Ijx48cHvSZBRErKBIIRRDjs378/KsdEyi233IJbbrkl5u0QBBc0AyBExy9+8QsAwD333IPPP/8cS5YsQWVlJebMmYN33nnH75impiZ8/PHHqKmpQXV1NaZPn44//OEPYbf7hz/8AfPmzUNVVRU+/vhjAMDWrVvx4IMPRkUvgggXmgEQouOZZ57B1q1b8be//Q0LFizAypUrUV5ejubmZsyfPx9DhgzxOqZfv35YuXIl1q1bh6FDh6K5uRkzZszA3XffHVa7gwcPxlNPPYW6ujosWbIE77//fow0JIjQIANAiJb6+npYrVaUl5cDcOWTKi8vx759+7x88BKJBC+99BI++eQTvPfee6ivrwfDMDCbzWG1t2jRIgBASUkJiouL8dVXX0VPGYLoA+QCIkSLRCLxy83OMAx6enq8vuvu7sa8efPw7bff4qqrrsLKlSshl8sRbhotqfTyn5vT6YRcTuMvQljIABCiRCaTYdCgQZDL5di9ezcAoLm5GR988AEmT57MHtPT04Nz587BaDTipz/9KW6++WYcPHgQNpsNTqczrDa3bdsGAPj222/R0NCAsWPHRlcpgggTGoIQouTWW2/Fvffei40bN2Lt2rX44x//CIfDgUceeQQ33HADe8ySJUvw/PPPY/r06bjtttugVCpRUlKCYcOG4dy5c1AqlSG3ef78ecydOxcSiQS///3vkZWVFSPtCCI0KB00QRCESKEZAEFEgZdffhk7duzg/O3+++/HnDlz4iwRQQSHZgAEQRAihRaBCYIgRAoZAIIgCJFCBoAgCEKkkAEgCIIQKWQACIIgRMr/B0x5tiafPYALAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.scatterplot(data=pandas_tips, x=\"total_bill\", y=\"tip\")\n", "sns.rugplot(data=pandas_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABOIElEQVR4nO29eZhcZZ33/T1Lbd3VW7qrk86+kQQSSAIINAJh8Q1LEwIIDvoYx0FH8eIRRX0Q0Uvf10s0cDmDMuroCKLiwgMj+yDCsA0hkQAhCUmArJ210/tS1bWe5f3j1H3qnFPn1Na1ddfv8093V5865757ub/3b705VVVVEARBEDUHX+kBEARBEJWBBIAgCKJGIQEgCIKoUUgACIIgahQSAIIgiBqFBIAgCKJGIQEgCIKoUcRKDyBfhofHoShTv3ShtdWPwcFQpYdRMWj+NP9anX+x587zHFpa6m2/N+kEQFHUmhAAADUzTydo/jT/WqVccycXEEEQRI1CAkAQBFGjkAAQBEHUKCUVgFAohKuvvhrHjh0DAGzevBnr1q3D2rVrcd9995Xy0QRBEEQWSiYAO3bswCc/+Ul0d3cDAKLRKO666y784he/wHPPPYddu3bhtddeK9XjCYIgiCyUTAAeffRRfO9730N7ezsAYOfOnZg3bx7mzJkDURSxbt06PP/886V6PEEQxJShVF37SyYAd999N84++2z9676+PgQCAf3r9vZ29Pb2lurxBEEQU4Luk2P45i+3oGdwvOj3LlsdgKIo4DhO/1pVVdPXudLa6i/msKqaQKCh0kOoKDR/mn+twuYeiUl44IE3oQJYOK8Vfp+rqM8pmwDMmDED/f39+tf9/f26eygfBgdDNVEgEgg0oL8/WOlhVAyaP82/VudvnPtvnnsfPQPj+D+fXI1IKIpIKJr3/Xiec9w4ly0NdOXKlTh06BAOHz4MWZbx7LPP4qKLLirX4wmCICYVW9/vxaadPbiqcx6WzWspyTPKZgF4PB5s3LgRX/7ylxGLxbBmzRpcccUV5Xo8QRDEpGFgNILfPf8hFs5sxPoLFpTsOSUXgJdffln/vLOzE08//XSpH0kQBDFpkWUF//HMHqiqii9csxyiUDpHDVUCEwRBVBGPvrQP+4+NYsPapWhv9pX0WSQABEEQVcK+YyN45IUPcN7y6ehcMaPkzyMBIAiCqALC0QT+4+k9CLTUYcPapWV5JgkAQRBEhVFVFb//24cYDsbwjU+fBZ+nPPk5JAAEQRAVZvOuk9j6fh/WX7gAy+ZNK9tzSQAIgiAqSO9wGH94YS+WzmlG13nzyvpsEgCCIIgKIckKfvXUbogCh39edxp4Pv/2OBOBBIAgCKJCPPH6QXSfDOIfr1iGaY3esj+fBIAgCKIC7OkewvN/P4KLVs7E2cvy74tWDEgACIIgykwwHMcDz+7BjNY6fPKyUyo2DhIAgiCIMqKqKh567gOEIgl8Yd1yeNxCxcZCAkAQBFFGXn33OLbvH8ANaxZh3ozKnnlAAkAQBFEmjveH8MjL+7Fi4TR8
7CNzKj0cEgCCIIhykJBk/Orp3fC5BXyu6zTwBZyIWGxIAAiCIMrAo68cwLH+cdzcdRqa6t2VHg4AEgCCIIiSs2P/AF565xj+n7Pn4IxFrZUejg4JAEEQRIn5/d8+xOyAHzdcvLDSQzFBAkAQBFFiRkIxrD6lDS6xcimfdpAAEARBlBhVBaog5psGCQBBEEQJUVUVAMBVoQKQABAEQZSQ5PpPFgBBEEStoZAFQBAEUZswC6DMrf5zggSAIAiihFAMgCAIokahGABBEESNoscAUH0KQAJAEARRQigGQBAEUaOooBgAQRBETUIxAIIgiBqF6gAIgiBqFIoBEARB1ChUB0AQBFGjUAyAIAiiRiELgCAIokZJBYErPBAbKiIATz31FLq6utDV1YV77rmnEkMgCIIoD3oQuPoUoOwCEIlEcPfdd+Phhx/GU089hbfffhubN28u9zAIgiDKgpL8WIXrf/kFQJZlKIqCSCQCSZIgSRI8Hk+5h0EQBFEWqjkGIJb7gX6/H1/5yldw5ZVXwufz4SMf+QjOPPPMcg+DIAiiLFRzFlDZBeCDDz7AX/7yF7zyyitoaGjAN77xDTz44IP4/Oc/n9P7W1v9JR5h9RAINFR6CBWF5k/znwpEkz6g5sa6nOdUrrmXXQA2bdqEzs5OtLa2AgCuv/56/OlPf8pZAAYHQ1AUtZRDrAoCgQb09wcrPYyKQfOn+U+V+Q8OjQMAgqFoTnMq9tx5nnPcOJc9BrBs2TJs3rwZ4XAYqqri5Zdfxumnn17uYRAEQZQFPQZQ4XHYUXYL4IILLsCePXtw/fXXw+Vy4fTTT8cXvvCFcg+DIAiiLFAMwMIXvvAFWvQJgqgJqjkLiCqBCYIgSkg1WwAkAARBECWEzgMgCIKoUeg8AIIgCAee2dyN/377aKWHUTIoBkAQBOHAW+/34a0P+io9jJJRzTGAimQBEQRBMOIJGbKiZL9wklLNMQASAIIgKkosIUOJT93qfuYCqkZ3CwkAQRAVJZqQkUgoUFS1KnvmT5SUC6j65laNokQQRI2gqiricRmKqiISkyo9nJKg0olgBEEQ6SQkhR2YhVAkUdGxlIrUgTDVpwAkAARBVIxYQtY/D4WnpgDoMQASAIIgiBQmAZiiFkA1p4GSABAEUTFi8VoQgOpNAyUBIAiiYsQSqfz/4BR1ASlkARAEQaRjdAGNR6emAFAMgCAIwgajAExVC4BiAARBEDbEkwIg8BzGKQZQdkgACIKoGNFkELilwYPglBUA7WMVrv8kAARBVA7mAmpt9E5ZC6Cam8GRABAEUTGYC6i1yTv1LYDKDsMWEgCCICpGLCGD5zg0+z0YjyR0f/lUQgX1AiIIgkgjFlfgcfPw+1yQFRWRmJz9TZOM1JGQ1acAJAAEUcMEw3Fs2XWyYs+PJSS4XQIa6lwAgFAkXrGxlApFIQuAIIgqZNPOHvz62T0Va8MQSyjwuATU+5gATL2W0EwA+Co8FZ4EgCBqmIGxKIBUMLbcxOIyvC4BDb6pawHISR+QwFffclt9IyIIomwMj8UAAAm5MmfyxhIy3G4Bfl0Apl4mELMABLIACKL8qKqKhFS54KKiqpAqtMBmYyhpAUhSZcYXT8jwuAT4WQxgCraDkGVyARFExXh523F88cevYTgYq8jzX9t+Anf+aktFnp2NoWBlLYBoUgB8HhEcB4SmYEM4mSwAgqgc2/b2AwB6Bscr8vzhYBRDYzG9IrRaiCVk3eWSqJAFEIvL8Lh48BwHv881JS0ARSUBIIiK4XEJAMyHj5QTtu4zV0C1YLSIKiUA8YQMj1sEAE0ApmAMQE5aV+QCIogK4HFrAhCtUKYLQ1aqKw7A/P8AKhaj0NJAtWVoygoAuYAIonLoFkDFBaC6LIChscpaAIqq6kFgQBOAqdgPSFFV8BxHzeAIohJ43ZV1ATGqzQU0FExZAJUQgERCgQqYBGBKWgCyWpXuH4AEgKgBKh0DYJAFYIZZZG4mAHVaEHiqNYST
FbUq3T8ACQBRA1Q6BpAKAldZDCAYRUuDB0BlYgBMAJiFxhrCRSss1MVGIQEw8/LLL+P666/HlVdeiR/84AeVGAJRQ7AujGQBmBkei2F6iw9AZS0AowsImHrVwLJKLiCdo0eP4nvf+x5+8Ytf4Omnn8aePXvw2muvlXsYRA3BXAqV3llKVSYAQ8Eo2lvqAFSmECzNBTRVBUCuXgtALPcDX3zxRVx11VWYMWMGAOC+++6Dx+Mp9zCIGoIV4lQqC4gdCFIOF5CiaG0n2KLqRCQmIRKTK2sBxJkFoO1DG3xuAFNPABRFhSBUpwCU3QI4fPgwZFnGLbfcgvXr1+NPf/oTmpqayj0MooZgPvhYvLKthsvhAnrhraP4zgNvZr2O1QBMa/RCFPjKuoDcqSAwMPUEQFbUqjwMBqiABSDLMt5++208/PDDqKurw5e+9CU88cQTuP7663N6f2urv8QjrB4CgYZKD6GiFGv+dXXazlJWK/MzrUvubBsafKbnv39oCPM6GlDnddm+r5CxjkYSGBiNoqm5LqMVcGQwDABYOLcFHhcPl1ss+8/Gc3QUANAxvRGBQAM8dUlPQLJt8lT5+3e5BLhdQl7zKdfcyy4AbW1t6OzsxLRp0wAAH/vYx7Bz586cBWBwMKS3V53KBAIN6O8PVnoYFaOY8w+GtHTHUDhekZ9pOKz1uB8cCqHfry32kqzgW7/YhPUXLMDV589Pe0+h8x8ejQAADh0ZwrRGr+N1h44OAwB4WYHAcxgLRsv+sxkY0nozjQej6Oc0VwnHAT39IQCYMn//4eQZB7nOp9j/+zzPOW6cy+4CuuSSS7Bp0yaMjY1BlmW8/vrrWL58ebmHQdQQuQaBt+3tx+//9mHxn5/8aAwCK4oKWVHROxQu6rPYmbrBLE3VhsZi4DigucENl1ghF1Dc7ALieQ71XhfGyQVUNsouACtXrsTnP/95fOpTn8JVV12FmTNn4uMf/3i5h0HUEGzdzSYAu7uHSno+rrESmMUl+kejDlcXRiSmxTmCWU7WGgpG0ez3QOB5iKJQkSygqCUNFJia7SCquRCs7C4gALjhhhtwww03VOLRRA3CLIBITIKqqo49WRRF602T6ZqJYGwGxzKT+kciRX1GJBnozsUCmJYsAnNVKAgcT8gQeA6ikNqH+n1TzwKgLCCCqCB6Ja6S+WQuWdESNou+GGZoBz0SjBX1eboFkE0AgjG0JGMELpGvTB1AXE4LVPt9rqxjn2zIyhQoBBsdHUUoFCrlWAiiJBh7yzAfuR0suSBeot2wMQ2UjUmFuS3zREnFAJxdQKqqYngsarAAuIocCRlLyHobCIa/zoXxKXYqmKyoECZrDODgwYP4+Mc/js7OTpx77rn49Kc/jRMnTpRjbARRFIy9xSIZagF0AShywZheCGZwARltgf5RezeQoqrYe3Qk5+coqopoDhbAeFRCXFL0LKGKBYETzhbAVGoIN6ldQN/61rdw4403YseOHXj33Xdx+eWX49vf/nY5xkYQRcF4FGOmfkBsh16qimG7IDAA9I/YWwAfHB7Gxj9uw/H+3CzvWFzWhSWTBaAXgTELQBQqlgXEqoAZDT4XJFmpeN+mYjKpXUCRSAQ33XQTXC4X3G43NmzYgIGBgXKMjSCKgnGxjSecF7qUBVDcxdAYg0i9lvp8wCEQHI5qu/lYjuNh/n8gczUtawPNLACxUjEAw2EwjPpkP6Cx8cxZTJMJWVEmrwto4cKF2LZtm/713r17MXv27JIOiiCKiXGxzbS7L7UFYAxAmywAh1RQtitXkZs7JJLcNfMcl9EFxA6CmdZY2Swg7ThIswA0MAHIYMFMNjQXUHXm22RNAz1x4gQ2bNiApUuXQhRF7NmzB4FAAOvWrQMAPPPMMyUfJEFMBCVHAWDXxaUSuYCMFoDhdScLgO3Kc3WHMwugtcmTxQUUg8BzaKzXWlS4RK4iFkA8IesixDBaAE0eX9nHVAqq2QWUVQC+8Y1vlGMcBFEyjAto
LhZAsV1A1vtrY0odFO5UC6DvynMUABYAbm/2YXf3sOZ64NN3nuwgGFad6hKEimUBpVkAdQYX0LSpIQDVfCCMowAcOHAAixYtQn19ve33qX0DMVnI1QWklDwInO4Cam30om8kgkhMgs9j/nfM1wUUTgpAoKUO6B5GKCKhKbnLN2IsAgMqVwcQjacLADsTIDilYgDV2wrCUQDuvfde/OpXv8KNN96Ijo4Ocy51JIItW7aUZYAEMVEUFXC7eMQTmbNL5FKlgWYIAgdafOgbiaB/JIK5080dIPN1AbFWF+3N2s45GI47CEAUi2elWrCLyTTQUlVAOxG3CwJ7XeAw1YLA1ZsG6igA99xzD0ZGRrBo0SI8/PDD+h9HIpHApz/96XKOkSAmhKqq8LpFxBPxnCyAcriAGO3NPuwGMDAaTRcAZgHkqAAsa6g9echLyCYQrKgqhoMx/SxgQLMAVFUbn1imhUpRVcQlBW5LGijPc6jzihljGJONSekC+vrXv4433ngDHMehs7NTf10QBFx++eVlGRxBFANVBUSBgyhwucUAihwETp0IZugGyiyA5G7dLg6Q70Ht0bgEDkBbk5beaddULTgeh6yoplbRrmSGSjwh48W3j+LiVbPS3FHFJq4fCJ/+HL/PNeUsgEkXBH7wwQcBaIVgP/rRj8o2IIIoNqqqggMHj0tAPJ69DsBOJFRVxeP/cxDnr5iBjlb7uFg2JEMlMHPr1/tE+DwCBpLFYGPjcQgChwCMFkBu9w/HJHg9gu72sdtFDwWTNQAWCwAAdh0awmOvHMD7h4fxtU+symNm+cNqG6yFYIDWDmKqxQCq1QLImpxKi3/+DCf/yYjqQFEBjtMOHy80C2g8KuG/thzGtr39BY/D6AJiT+A5DoEmn94O4vZ/24Qv/+R1AMYgcG5EYzK8blFPpbSrBTAeBclgAsDYdXAo5+rjQmHHc9qdWub3Ti0LoJpdQNVZnTDJGR0nAagmVGhZGJ4sAqDXAdhcw4LHuVblWgYAwNINNPksjgPamn0YSBaDsSuMXUJzjQFEYhLqPCJEgUedx96PzqqAWwz598wFZKwk/uVTu4seDDeSsgBsBKDONaUKwWRFmXwuIIKYbKiqivGopKcSpl7XFtpsApCqBNYWp0hMgkvkIQq8/j6nRTEhyZAV1danrd/fJg2U4zi0NXmx6+CgaaE/OTSuxwCsy38sIQNq6iQtRiSuuYAALZ/e1gIIRuESeb3iFkhZACyN9JqPzsfTb3TjkZf34zOXL3Wcz0SwHghvJFMMQFFUHB8Yz6tZnMctYHpLXWEDLQKaC6g699okAMSUYceBQfz7k7vwL7d+1CQCLIPN4xYy7mqVpI+eXXPrff+D0xe24vZPrNQXLCcBefSVAzjSG8S3Pn1W2vfYUmVyATELAFogOC4ppkVPUVTHQrBfP7MH8YSMr/3DKtPrkZiku3/8dS7bfkBDY1oGkDHdU7RYAGcuCSAuKXj+zSM499R2LJ3bYjvniaALgJ0F4HMhnpBtC8VeeOsoHn1lf97P+95nP4J5M8p/yPzQWBSqCrhFEgCCmBDW9EUrIyHNbRIMx00CwA4b97gEhDIclagHgQ1ZQO8dHNRei2e2AIaDMYyGMrstJJs0UI7jEGjW/PHGnkCKoqbqACwK0DsUxsmhsNZN07CDjsRktDVpWUUiz9umnQ4Fo6YAMJCyACJRWf/62gsWYMvuk3hq0yHc8aniC0A87iwADXVaEHs8kkj7fu9wGPVeEf901ak5PScYjuN3z3+Io32higjAE68fhChwOG/59LI/OxdIAIhJw+h4ZgFgC7i1sZmqIhkD4DEwmlsQ2OpiiCYyxwAkWTH1HLK9v8EFpOguIOiLtjEVVFHVVHsGy22DkQRkRcW+4yNYsaBVf91YTczz9v19hsZiOHWeeUFPuYA0i8El8HC7BFx13jz8+b/34YPDw1g2r7giEM3gAqr3poLYxmA1oGVJtTR4ceaSQE7PkWQFD/9tL/qK
fPRmLhztC2Hzeydx+Tlz9d9xtVGddglBFEAqj9+88CnMBeTK5gJKBYEly/GN2SyAhJRBAGwqgVNBYE7P2zc2hTNaAOa3qXqB1weHR0yPicQl+JIxAI4DVIsFICsKRkKxtAZsugAkC8nY12tWzkST342nNh2yn9cEyOQCYv2AQjYng42Ox9FU70p73QlR4NHa5Cn62cu58Nir++HziLiqc17Zn50rJADElMHpRC/NAgDcbiFjFo+xHbS1CCtbDECWFVuXi9392ZgALQbgdglo8rstLiCjJWM80lLSheaDI8P665KsIJ5QUhYAx6UJ0mgoDlUFpjWYd9UsC4gFgcWkALhdArrOm4cPj47g/cPDKCYpF1D6EsTiGHaVzKOhOBrrna1AO9qbfegbDhcwysLZ0z2EXQeHcPX589OSEqoJEgBiyuB0pq9qsABySwNV0twnKQvAXkASspq249afr1cCpx8JyWKxgSaf2QJQUwfYG9dxVt3b2uhFd09QD9yyPkA+d8oFZB1O6iAY8wIqWrKAXIbe9WtWpayAYh7TyH4PdnUALEPJGsRWVVWzAPzp/Y0yEWipQ99w+SwARVXx2CsH0NrowWVnzSrbcwuBBICYMrAFPC0GgFQaaEJSdKGwYmwFYW2PnM0C0GIAmcdnfyKYpgBtzV7T0ZDGLCDjbdmu+CPL2k1nBjMhMFoAVkHSD4JxsAAiFgsA0I6L7DpvHvYeHcEHRbQCYgkl2Z7DzgLQ5mAVgEhMgiQrtg3uMtHe7MN4VCrbYfNb9/TicG8Q11+0CC4xXeCqCRIAYsrg1M3TGAMAnBdxYzO4NAuA1QE49AmSZGdhsUsDZes/qw9qa/LpC7R2rWJbCMYsgJWLWyEKnO6aSQlAKgZgdQE5WQB6FlBMgihwaa2L16yaiWa/G89s7radXyHEbFpBMwSeh88j6jEJxmgyTbYxXwFINscrhxWQkBQ8/j8HMbfdj3OrNPPHCAkAMWVwdgFpCy3zNzsFco0CYrQAZCXVRtqpnXRuWUDGE8EMQQAAgWavydVjigEYXmcWwLRGLxbPatLjAEwAvJ5MLqAoPG4hrdFbSgDktLYQ2vcFXHbWbHxwZAS9Q8XxpccSsq37h1Hvc+lZSQxWJ1GIBQDYN9wrNq9sO4aB0ShuvGRx1Z4BYIQEgJgysAU8kRYE1iwAdwYLQFFVqGqyNTJS/nAAON4/nkMaqOpoAejXKPaVwIAWAzCNR1FtK4GZW8Tvc2HZ3BYc7Q0hFEno5wHXGYPAlvEMB7WDYKw9/42Lvp1LBgDOX9EBjgM2vdeTcY65YlfkZcTvczlaAPkKAOu42ltiC2A8msAzm7uxfME0LF8wraTPKhYkAMSUIZMFwHGA180EIH0RZ+/1Ja8Zj6QWn4M9Y7rV4OTqySkNVE53AbGluK3Z7JeXVdW2G2gwEococPC6BSyb1wIVwIdHRlIWQHL8mgWQHgOw5tUDMDUqs7MAAKClwYMVC1qxedfJrEKXC9kEoN5OAJKFdk3+/LKAPO5kllWJBeC5LYcRjkq48eJFJX1OMSEBIKYMTv38VWsMwMaNw/zszIViDBgeOjGmZ9kA9haErChQnDNM9Wv05yFVBwBogVnjQpxIyPrO3xgDCIUT8Ptc4DgOC2c2QhQ4HDwxqp8HnLIAkLZQW4+CZHAcpy/8LgcLAAAuOKMDw8EY9nQPZZ5oDsQTsm0RGMPvc5msMAAYC8chJA+MyZf2Zl9Ji8EGR6N48e1jOG/5jLSDfaoZEgBiyuB0opeiqFodQAYXEBMP5h9nu886j2iyAIB0CwMAEpKadCOl744zBYGZN4bnObQadudRh1hDKJKA36e5QERBC5ZG47LuAvIas4AMY5FkrdeQnQUApBZ+JwsAAFYtbkO9VyyKG8juPGAjThZAY727IN96e0tpawGefP0gAOC6ixaU7BmlgASAmDI4pYEqQNYsoDQXUNICWDKnGSf6x3X/s937VYecfSvmdtDaB+NaZnQDReOS
9VIAWhYQq5QFkge6SwoiMQkCz+lNxzhLEHg4GIMK2FoA7D7Gj07XnHfaDGzbOzDhlErNBeT8LL9NEHh0PJ53BhCjvdmHkVC8JC2uj/QGsXnXSXzsrNlV2/LBCRIAYsrgXAmsWQAed+4WAIsBLJnTDBVAz2BYX6yt97fr8mkZgON1HFIKYFw8jBaAnQuI4RJ4JGRNALxuQXcpWYPAdgfBGMnFBQRobiBJVvDmnt6M12XD7kB4I/U+FyIx2TSH0fFY3gFgRnuyHXQpMoH+89UDqPOK6Dq/els+OEECQOSNoqi4/z93Yv/x0UoPBS+8dRQvvXMMgHMvIC0InJsFwPr5jyd3nwtnNurXsApV6/uNbSMyBUhlmyCB0QIIGCwAp3TTUCQBv4MFYEzv5C11APpRkI32FgDL/hGztC2eO92P2QE/Nu2cmBsollDgzhIDALT+RgytD1ChAlCaWoDd3UPYdWgIXZ3z9SZ2kwkSACJvhoMxbN8/gH9/clelh4JHXtqHP764F0CmbqAsCJysA7BZXFMWgLYojSQXzOnT6vTFsSG5+FhjDJLNYe92mLOAzEFgIJWuCJhdQOyeiqJiPJJIO8xFEwDZJAAcz5ncUboF0DAxC4DjOFxwRge6TwZxbALHRsYSMrxZLABAO4oT0H4GwfFEwS4g9rMtZiBYa/mwH62N3qpv+eAECQCRN+x4u2yFT4XQMziOgdHC/kll1stHslYCp84EBrLEAJKL6MmhMESBR2OdC63Jbp2NyT711vcbBccuE8guCKy3gzZcZ3QBmSyA5LWhaAIqUv3yAW3nnpBkGwvA4gIKxlDvFR0zb3KJATDOWz4dAs8VbAWwNhfZXEAAEEkKQCiSgKKqBVsAfp8L9V6xqBbAm3t6caQ3hOsvWlj1LR+cIAEg8oZlKzo1P5sI3/71m7jj37fkdK0prVJVHbOAtBiA1ndG4DnbOgAmHmwRHQnF9aIpFjhlwde0GICpz38mF5AhS8hwJjBjVqAeCzo0l5NRZNgdWRWw32IBSLKqtYI2LO7WbqDDYzG0OOz+gdyygBiNdW6sXNyGv+8+mdY1NRcyNYJjMAEIJ4PNYwXWABgJFDEVNCEpePy1ydPywYmKCsA999yDO++8s5JDIAqA0y2Ayo7D2C44lpAz1AGkFlqv274jqDULCEj5y9nC72gBZIkBGDWBjZG9w+gC8rgE3PHJ1QCg5/VrN9A+6FXAdZYgMIsBGPLjed4SAxiLOvr/gfwsAAC44PQOjIUTeO/AYE7XG8l0HjDDb3EBFVoFbKS9xVe0YrCXtx3D4FgUN146OVo+OFExAdiyZQueeOKJSj2emADsD74YFaHZGBqLYsvukwCA//vyfvzLI+/q3zOmZo6FE1ljAIC267QLsLKF2Wtwo7AdM3O5sMWx0BgAYIgD2KSBAtrCDZizgNg92SHvaTEAORkDcFtdQKn7DgVjjhlAgCEInCUGwDh90TQ01rsLqglIHQbj/CzdAogxAdBiMoXGAABNAAZGowVZLUbGowk8y1o+zJ8cLR+cqIgAjIyM4L777sMtt9xSicdXLaFIoiR5yrk8N1OffCdkRcVIKJb1/cPBGBKSrO9gw9GEY5aLlQee3YNfP7MHQ2NRfHhkBLu7Uy2JjYeoB8fjGQ+EYQut05kA7L1uUdCvtVoAbOG33t+4oHT3BG2KwVJfM7eVapMGCqQsAmMQmMHOM7a6gOIJmxgAnyoEiyW0n71TDQC7j/FjNgSex/nLZ2DngUHT78GKoqgYTgbUGbEM5wEz/LoLSPs5jI1rfzsTsQACzT4oqqoHxAtlMrZ8cKIiAvDd734Xt99+OxobG7NfXEPc9tPX8aM/bKvIc7//27fyfl8sIeNrP3sDt/30dfzw4Xccrxsdj+Ff/u8O3PbT1wEA//snr+P//PvmvJ51YnDc5r5GCyBuCAKbd3iyoupWi5MAMAtAEFLpomzHfMqsJgDA3Bl+AJnTQO//y05s3zfgOA92MDyThHQL
QHshZqoD0D4aG8ExXCKPSEyCrKh6BpN231QQeDhLCii7j/FjLnz0jA7IiqpbaHZs29uPb/5ys0kEcnEB+TwiOC51TvHoeAxukdd7HRVCexEygVjLh84Vk6vlgxNlPxT+scceQ0dHBzo7O/H444/n/f7WVn8JRlVcRqMyAoHC/jgO9wb19xZ6j0LoGQzn/DxvOH3Hd7Qv5Pj+0aisH1zCrglFElmfFwg0YFqzDzgyApfHZXodAGSkFh6VFyAK2uIgyYrp3pKiorHBg0CgAf56N1Rwac8eZG2WW+rgTbZXWDC7GYFAAwKBBiyYOw2z2/3444v7ILhE0/tPjJh3lPtPBrH2owv1r73e1K61ubkOrU0+NA5obQlaWurSxsJxZhdQQ3LsMrQmcLNmNqe+5/fqfvJAq1+/V4PfAxVAW5sfJ4a18S2cM83xZ16fdHM1N/py/jsIBBqwZG4z/r6nF//rqtPSuowCgLJ3AJKsYiicwJKFbQCAo0PaAjw9+bN1wu9zQeV4BAINiEkqWhq9aG8vfNPIJ11k4YRa8P/WH/57HzgO+Ny1pyOQLC4rBeX63y+7ADz33HPo7+/H+vXrMTo6inA4jB/+8Ie46667cnr/4GCoLL7niTA8Mo5+b+E7lf5+TQT6+4NFHFVuz80F60lN2d4/PDJue0225/X3B6EmfeZ9hpxz9r6TA6nXTvSOIZrcLcbiCvr6xvQFKRJNALKK/v4gOKgIheNpzx5MWhjBYBRichcuqKp+nZcHBgZCcIs8RkYjpvcPWKyT7R/2mb4fiaQEs68/CCUuYWREE4BRy70AzX8fMxZAjUXR3x9E3+A46r0u0/VSInWdHJf077Fn9vaN4eBRzW3GK7LjzzySzLaJx6S8/u7OPXU6Hv7bh3jrvRN6BpOR0WRK73t7+7Bouj/5M9B+b5HxmOOzAoEGeN0CBkfC6O8PondwHH6fOKH/CVVV4RZ5HDw6jP7+trzff6Q3iFfePorLz50LTnL+WU6UYv/v8zznuHEuuwA89NBD+uePP/44tm7dmvPiT9QezE0QsYkZRGISGuvdiCdkzQXEMmxUFbKiQhSS7pSEDLdbc214XIJ+MpYRtqkQOKMLKN1l4nHxGWMAgGZNjYZitimLbIyZYsUcx9m6qaxVwIDZZWOtA9DmlToKsiVDDID5pPJxAQHAuae245GX9mHTez22AsDme6Q3JdbxRPYYAADUeVMN4cbCcd2FUygcxyHQ4iu4FkBv+dA5+Vo+OEF1AERVw5qbRWPpQdFITMt9b6xzIxhOmDJwWMBWkhVIsqovNh6nLKDke3leOzjG4xL01sqm8djEEOyySj5MurwAczM3lgXE2kHbpRDyvDmzSDVkARkzgABz5a4xBmAs1hsai6GhzpVTsRITzVyp87pw5pIA3tzdi4TNcZnsd3KkL7WjzaUOANA6sTIBGA3FJ1QDwGhv9hXUD2j3ocnd8sGJigrA9ddfj40bN1ZyCEQB2LU8LhVsfRywydyIxmV4PSIa6l0YM2QBAdAXI7bb9BoFIEMWkMBrLSOmNaafnAUwAbCmgaa+ZgfPOB2gnosFYBUFvRAsErexAFKLqKkVBCvWU1XtIJgMRWBG7OacjQtO70A4JuFdm+A3K5IbGovprkMW38gW0K3ziggnD4IPRRITygBiBJICkE8Vu6KqeOzVyd3ywQmyAIi8sab1lRKWy25sO8AWa7MFEDe1WoglM4HYYs0aj2XLAuJ5Dh89vQMfO2u27Xg8LsHGBZR6LgcOS+Y04/0jI7bvT6WBJq+3WW+NB8NoF2sftLMAnF1AXgcX0PBYLGMGkJFCSppOndeCaY0e29YQxt/J4V7NCsjVBVTvFTEeTej1D8UQgOktPsQlRT9dLBf0lg9rJm/LBydIAIi8eW37ibTXMv0zTyRob7dTY5W3rAFaQ53bVAgGpM4FZvn0ugXgFrTjGy1jMloAHz29A5ecaS8Abhef0QWkqiqWzW1B71A4JZTGSmDmArJp
BsewvsbOG4jE5HQXkEEA6uwEIE8LoJDfFM9zOH9FB3YfGkrLsTcKwJGkAMQSMkSB191UTtR5XIhEpaIUgTECelfQ3A6HSUiy1vJhuh/nnjZ5Wz44QQJA5I3dIpEpeDiRw0NsBSC5u4/GJXjdIhrrXQiFE5BkNVWtK7GiLe2jMQYAaItQz+A4jvVpwUnFYAFkwuMS0uoMJMPXKoBl85oBAB8eSXcDWeMFdhaAdQwqjG0gzIug0WdvdKmwe4SjCURiclYLgP2c06yPHLng9BlQAWzeZa4JUBTtdzKt0aMHgrMdBsPweUXEJQWDo5oAFMMCyLcW4KV3jmstHy6Z3C0fnCABIIqC0y7/rv/4O/704r6C72vnqmUCEIlJqEtaAIqqIhiO64ugUSSAVDaR3hI6IePbv34T3/3NVm38GXbkRtxiehA5YVnU57Y3wOcR8UFSAFRTJXAqUwmwd7mwNZgFwFU11fco3QIQ9GuNbRzYPQZHkxlA2QQgRwF0or2lDkvmNGPTez2mGJGsqOB5DnPbG0wWQKYiMEa9l3Vm1dJsiyEArU3a2cu5ZAKNRxP4ry3dWDEFWj44QQJA5I/NqhyOSYglZISjCQyMRvRd/8mhMHpz+Gc7cHwUvTZmuZ2wJCQZqqoiEpPh9Qh6k7bxqKQLwGDSFRGzWABOLaHZgpxt/fO4hbRmc8YYAKAtokvnNOMDmziAbJmPneCwRdhoVQXD6W0ggFQWkNeSscQa9g2MZj4HgHHGolYAwNz2wgstLzi9A33DEew7ljooSFZUCByHudP9ODkYRiwuI5blPGAGc2n1DGp/F8VwAQk8j9ZGb06ZQP+VbPlwwxRo+eBE2esAiMmPk5/43j9tw6GeVLrfb+68NOd73u3QSsLJBRSJSVBUFXVeEY2GzBjWEO3Xz+xB5/IZhsZjVheQedf+wtajAOzTMo14ssQAGMvmNmP7/gHNJ24TA0hZHOnPYGNwuwSMR7V5Bm06gQIpkfBZBIDdQxeALBbAectnYOXitrT75MPZywL444t7sem9HiyZ0wxAE3BB4DBvegNUAEf7Q4glMp8FwKjzpgTA5xGzpo3mSnuLL+umJBqX8NI7x3De8qnR8sEJsgBqjFJm8BgX/2Jhd+bA4FhM9yfPDvj1k7oAYFZA28EKlp46ugvI4Vzg7pPa2LO5ed2igHhCgaKqiMYlDI3Zd5dcOrcFAHQ3ECM9C8jGAuAsFoCaigE4BYF9FpeKUQA4AM055NBPZPEHtOM0P7KsHW990Kf/3GVF0VxAyUX0SG8wGQPIRQC0uZ4cGi/K7p8RyKEt9O5DQ0hICi48o6Noz61GSAAmMZKs4MePvIuDJ8Zyfg/LqGCUM6e/EOxCCyOhqJ5SOG96g+4CArQd+jmntqMtGexztgDsu5FmiwF4DDGGZ97oxv/327dsD5iZM92Peq+Y5gbSXUAZ0kCZC0iPASAVA6h3EgCrBZD8zx4cjaDJ7865zfNEueCMDsTiMt7+sA+ANl+R5zCt0YN6r5gSgBxiAMwFFInJRfH/M9qbfQjHJMeWJgCwfd8A6r0iFs9uKtpzqxESgEnMycEw9nQP46Hn3i/4HoUc62gVjXxK4/N9np1ASbKKwyeDaGnwoLHeDb/PpQdTeZ5LHpKiLfDWzpNMAOzOBQay58GzRTmWkBEMaznqdtk+PKfVA3xweNi2EjhjEFiPASQXyaQLqM4jpi3kLAbg6AIai2Y8B6DYnDK7Ce0tPr0mgAWBOU6zAg73hhBPyDm5c+oMB9wUVQCyHBCvKCp2HBjE6YtayyaclWJqz26KIyRTAKUJ5Nkb19dcrQHjZU31btNh5kZsWy7I+Y3VTjAkWcHh3iDmz9DcCjzP6b5xnufgcgl6+mcsLoNDauFmQhB1EoCsMYCkgCRk3fXDgpRWls1twcBoVM/EAQBJsaaB2rmAtI8mC8CmDxAAiLoFYHEBJW8yNBbLeA5AseE4rY7iw6Mj6BuJaDGApDkyb3oDjveHtGB9
DgJQXyoB0FNB7X9v+4+PIhRJYNXi/BvGTTZIACYxzM8tT+CEI2OWjayoGBiJ5JW3r8J51/ylf30t7TXrOb5AZqtAUdIPAg9HJZwcDGOeITjH3EACz8Et8ohLMv74wl4c6hmD2y3oCy27zniWgHFMWWMAhiCyNf3TyrJ5WhzgwImx1O8qpzRQcwxASwONp/n/AYMF4LZkARkmUk4LAAA+umIGOABv7OyBLKv63OdO90OSVYyNx3OKAbhEQd+BFzUG0JzZAti+fwACz2HFgtaiPbNaIQGoEgrxxRsbfhX+3NTnsqzijl9uwbd+9ffM7zF8PjYez6t/jGwRHO25zgupqqYfU7jv2ChUAPNmpASgwWgBiDziCQUvbTuGXYeGTIuNzyPA7eIxEkrFQozWQF4WgJRZOGYF6uH3uSDJij6H9CMhbSqBWQwg+SwVmgvImgIKZMoCSn1eTgsA0ATntAXTsHlXDyRZMQhA6vfFurNmg7mBimkBuF0CWho8joHg7fsGsGxus8kFNVUhAagSCirB55gFUJxWC8w9YRccGwnF8MundmluHcvj8imQNI6VCUBCSh//D3+7FcPBGKIJOa1DJTtcxigAbIfILAAjxqpTjuPQ7PeYBMDoqspaB2AoJJNkRXdT2Pm0eU6rBwBSFbtM7JxOBGPvA1IuIJYFZOcC8rgFuEQ+rdUzX0ELANBqAgbHYth7bFTfqMyYVge3K9WWOxdYILjJXzwBADQrwK4auGdwHCeHwlh1SqCoz6tWSACqhQkk41iLi/J6rFEAMgjJX147gK3v92Hr+71p38unRN5kAcipls1WtrzXg217+xFPyLqf23iPJr/blNrIDm7nOS6tYZfHZd7Jtfg9GDGkw0bysADMLiAVswJ+tDV54XFohcHcQAKzAPRuoM6VxyyDx2XJAmrwpS+CHpeA73/uHFxgSVfkDEqWrQq4FJy5pA11HhGRmKTHqniew5xkoVmuAlCvWwDFnUN7s/25ADv2DwIAVi6e+u4fgASg6Dzw7B5s29uf9/sm4saRlfxjAJKs4Lm/Hza5PyTJ+T7MhSEpqqm1AZAKRueC0d0j6RaA+bkXrdQWs3hCq/gV+dSfKSv6mmcpzmGvCzyn7zIZHou7obnBgxFDN0jTAexZLQBzENglcPjM5Uux/sKFttcvm9sMIGUB6GcCZ/h1C5YYQDQuIy4pthYAAExvqUtzkxlF2c51VGpcoqA3TxMMY2FuoFwFwJcUgGLGAAAtE2h0PJ6WqLB9Xz/mtPvR1jSxw2cmCyQARSQSk7Bl18m88vInAltECrEAduwfwH++egB/finVp8eYodKfzOAYHI3i5FBYX1AkWUmzVs5copnLs9rqsz53yLDzZu4gqwXAdtlxSYGimgVmfvLUqfkzzALAisFYDMCINeOkxe/BsEMMIJs1w8QllowBiAKPFQtbsXKR/Y5xZls9GupcaQF79iO0czkxlwlb1EMObSAyYdDMijUxY1aJYBAnJty51AEAKRdQg4P4FQpLBTW2hAiG49h3fLQmsn8YUz/KUUa6TwahQhOCfJlIQVYhMQDmJtnTPaS/ZnQBffvXf8dNl52CP7ywN+1Z1iIqtlAdHzCfjWukPXkU38Y/bjM8L9mx02IB8Jzmx39q0yFwnNZvnrFkTjN2HhjEKbObTe9h2T08z8FtcQFZ/fPNfrfJ6ojGjC4gxykAMFsACVnR3VNOriOO43D6wlacGBjHIBfTxZpZW3bN1ziLBRB0aASXCeOiXykBmD+jAXOn+03Fa4tmNuZcmQwAbU0+BJq9Rc/HZwLQOxzB7KRbaueBQagqsOqU2hEAsgCKyKEebefvlGOeiULWf+aKKcQCSJ2YZXDJGBZFSVZtF3RJVtIW7GyctTSAG9akN9RiP6ewJe2U41KLdmujF5+/+jT9ews6GnHPLZ1YvsDcnVFPA+XSg8DWk6eaLQFTowuIy+IDMsYANBeQ9qxMXTQ/vXYJbv/ESggCp/+uBseicIu8
7a6ebQbYoqc3gstjF2xc9CvVxZjjOHzjptX47BXL9NdmBfy450udumssG+s+Oh/f3nB20cfGagGMFsD2/QNo9rtNyQVTHRKAIsIEoDALoIAHTiBwzBZxYx2AtUjJbh52AdtsTGuw38Gx+48nz31lu2uO43QXQWO9OdjrFnnbwrOGemMhmDULyGoBmAXAaNHkYwFIsqrPK1P2kNettawWeE631gZGo2ht8tpaDuxvgcUNxpIWQH4uIM7283Lj97nS0inbmnw5pw57XELR/f+A1meo3ivqB8MkJBm7Dg5h1eK2Kdn33wkSgCKSEoACLIACVvOJdPFhx/LZtSlgRKLpAlCItcFx9oeNh5kAJNNOm5OpfhxSC631gBKng2daG704Y1ErFs1qyu4CslgARqHL9s/P8xxEQesImpBSLqBcDlIReE4P2A+MRBwrqJ0sgIa63BfCanABVTvtLXV6KugHR0YQS8g15f4BSACKxkgohqGxGOq9ohYgzHOnXJgLqHDs3DjWPvdhGwtAltW84xUcZ79AWi0AtjM3WgDWxctJAESBx1dvXIkFHY3pQWC3NQhsXkhN58PmsFZ6XFqhmVbgxeljzoYg8LqADoxG0dZkn5/PfjNMAMajEjjOfORjNjjDj4DWf3tYXArQir/cLt4Ub6oFSACKBNv9M9+03eKZiVI25YzGJfzq6d2mVtDW1EvAHA8AiucC4sCZMkGs9x+PJsBznJ7Jw3EpC8Dqvsh09CQjvRDMLAAuUTD1mTFmJuXiLXEnD5bPNQbAYC6gcDSBcExyTDVkAmvMfqr3uvJy5ZgsgAq6gKqZQLMPg8l23tv3D2DFgtYpd+h7NkgAisShniB4jsNpyaPjxjO0mrWnEBMgt/ds3zeAN/f04us/f0P3d8dt2iFbX7MTMa0OID9ysQDqfaKpmIq5bay791z+Qa0iYZdyaKycNR5knstOPiUAxhhA7i6g/hHteY4WgJK6nt013zRIcgFlZ3qLD6oKbNvbj+FgrKbSPxkkAEXiUM8YZgXq9YVl3MZ/nolCinlzfYuxj0o4OS5bC0DK0QLI8OBrL1yQ/iJnXyxmjAHUe11w2QSBvZYmZ64c0gHTK4HTBcAYCDZaRrmslR4Xr4+dxQD4HP6TBIGHJKsYGNXcDjnFAJLjybeYyzgPWv/tYT//F946Cg7AGTVS/WuEBKAIqKqK7p4xLOhoRH3yFKP8LYB0Rsfj2HVwMMODc7uP0QWw84B2P9sYgMUCsAtmZ6s56Fw+I+01DpzeEtju/uPRBOp9ou66MQWBhQJcQFmygACzAIwZOoPmagGw368rDwtA5LU0UN0CaHawAJgLiOf0++YrAKYsIFIAW1gtwMETY1g0u8l0sFCtQAJQBPpGIhiPSlg4sxH1Pm3Hmk9LZcC+FcQ9f9yGf310x4RP7TK+/bXtJwCk6gCM5JLfLytKRt0xLjbnnNoOQNuB2s1BdwFFJM0C0IuqUm4bq+vILpvISloMwMYFZMwEMo4sl6XS4xL0329+QWAOsqxgYDQCn0fUNwtWUmmgqXnkkwEEUAwgF5rq3fpmYXWNZf8wSACKwKFk64f5MxoMFkCetQA2q+rJIS1H2S71MhiO6xWiVvYeHcGfXtyLE8lCLjtxycUCsGPr+30ZewYZFxu26+c42PbONwaB672iLgCKquoxAKsA5LLQWmsO7CwAp/bCudzf4xJ0V5q1UV0m3KKAuKRgYDSKgIP/HzAEgXlOd9/kHQMw/NzIALCH4zi9IKwW/f8AtYIoCgd7xuAWecwK1OuVpPlaAHY7ZJ7joKiqdq6qZQ37yv2bHO/1/JtHsH3/AP77nWO495ZOWwFI2Jxja80CcuLDZDtmO3gOuPdLnRAFHn959QAAzQVkt+CagsBel+5OkSQV3qQlZec6yoZ1EbcTAKuVkA9uF68LQC4xCUadV8TYeBwjIRkzptU5Xme2ADgAav4uIM74OSmAE3PatarfjtbsfaymImQBFIFDPWOYN6MB
As+D5zl43ULeFoCdW4WtfUqeEWLjgj8ajsOuWaidBRCzcQvlC8dzaGvyodnv0RdijtOqP79+0yrTtZGYBFlREIlJqPe59J17QlZSdQBF+Au1cwHlEktwvJ9L0H9f+fSoqfOICEclDI5GHQPAgDUGoL2WvwAYLQASACc2XL4Ed3zqzEoPo2KQAEwQSVZwpDeEBckulYB2OlP+FkD6a/qBL3kKgOnfXbV3AdnGAHJwAWXDtNtMfsoWoHbLoheJSfpO2ugCkmRF37UXoz7CzgKYiAAYK43zEQCfV8TgWBRxSXFMAQVSGWHGLKCJuIAIZ7xusSLtsqsFEoAJcrx/HAlJsQiAkHcaqJ0LiB3qke+Zv8Ydn6oCqo2AxCUlbRHM1QWUCTt3A3vF6nYJx2T951TvSwWBE1JKAAopPLNiLwCFF/wYzxdwibkvtHUeURfztgwWgLEQjLkU/TaHwWSCdv1ELpAATBBWAbxgpkEA3BW2AAz/+ypU+yBwQk7b+RTFAjC2ILB8Yu3JE41JCCUD2fVegwvIIE4TOe2MYT0QBqiMBWBsipYpCMxcfgLPpeoAyAIgSgAJwAQ51DMGv89l+of2ecS86wDs6mvZ/7CUwyJoXOSNu7/BsahtkVlcUkz95X0eIe82z3bYtSG29rdn16kAhoJaTrzVBaQfoTiB844ZdoHkicYAGPnGABiZTpxiv0pB4PW/gXzOAgBya2lBECQAE+RQzxjmdzSYFt1CXEB2UWCWAmkNAtsFhSMxCXuPjuCFrUdMFsB/PL0n7XpZUZCQFJNfWRT4IlkAzi4gUeD1OfmTWT6sH7vVBaQfop6MYJ+/YgbOX5FeZFYo+WTvWDEWmuUjJHXJFOGGOlfGE7HYZkDkOQAchGRiQT6QC4jIBRKACRCNSzg+MI6FBv8/AHiTQeB8zvm123s7xQCsJ3IBQCiSwMY/bsMjL+9PcydZxzEaimsuIENxkSjw6D4ZBJBqy2zkR184L5dpWGIAqSwgBlsw2SlRA6MpC8DoAmK7duYC+vzVp5kOhpko+eTvW5moBZDtvNmUBaBFAPx1rrwXdEr9JHKhIgLws5/9DF1dXejq6sK9995biSEUhcMng1BVmALAgBYDUFXNx50zDnUAQLof3FYADEVhxuZmQLrFMByKaRaAz2gBaG0Kmv1uLJnTnHb/XA9+t1t3jIsXWzB1AUhaAHVeMVUHICv6dRN1Af3ktgtsX3flcZC9FbMA5BEETsYAAg4tIBipNFAeHJe/+wcoTvosMfUp+5/J5s2bsWnTJjzxxBN48sknsXv3brz44ovlHkZRONSj7ZjTBMCjLRChPNxA+QSB7QQgGEnorgnW45xhtQCGx2KISwp8Bp80u2TVKQHbvvO5FmRxdjEAw/fZgulPukP6R6PweUQIPA8xmVGTkBRdcCaaBeTU32UiZ8wag9l5FYLlagEkf988p/08C0lTJAuAyIWyC0AgEMCdd94Jt9sNl8uFRYsW4cSJE+UeRlE41DOG1kZv2pF1bGF1CgSrqoqewXFT6qd1/R8ai+qZRJKs4Hh/SP9ezObM4fFIQq+2tbZx/uDwiOnrXYe0hnBGXzZzxaxa3JbWgRMoLKjI3mIUBSYkbFEbHI3qvfldhkIw3QIoQhaQHXbnE+SKyQLIw5XU5HfD5xGxcGZjxuvYlJkL0F9AkzLKAiJyoeytIE455RT98+7ubvz1r3/Fn//855zf39rqL8WwCuJwXwjLFkxDIGA+RDowTRujy+NK+x4A/O3vh/Gzx7bjK/+wGh87Zy4AIGrY6AYCDVj39af0r194+xje+aAP9311DV5795itsBwfiuhdJq1s2X3S9PX/7OgBAMyari1ES+e2QBA47Dk0hAvPmoO+sVjaPaZPb8Q5p83A1j3me51/RodpjsbP3Z5k0LPBq7/Ogp9tyVYIsqKiqcGDQKAB3npP8p4zcUrSDXXDx5bY/gyzcfqiNrx3YMDxvY3NmohefOZsLF/YipfeOoIPDg/n
9KxgPPXLmjG90dSora3Jm/Eef/7BVfrO3omLVs/Cc5u7MaujCS2NXiyY1ZT3z8AonIX8/KqByTruYlCuuVesF9C+ffvwxS9+EXfccQfmz5+f8/sGB0N5t0YoBWPhOPqGwrh45Uz09wdN34tEtAV0YGg87XsDIxH8+qn3AABPvrofZ8xvBsdxGBoa16+xvue9AwMAgINHhvDXLd22zdie39Kdcbz/fPVpWHVKGz44MoxmvwcCz2F2ux8/++qFcIk8EpKCWELB6EgYizs0AWtr8uqWQXA0gss/MgvrPzoPzX4PxqMJeN0ivG7BNF7j54MjWjM7yLL+Olv2BIPN4xF5/fv3f+VC1HlENPk9+M2dl9r+PHLhf1+3AglJzvje+79yIXweAQLP44z5LUhISk7PCodSQjs6HEZ0XPt9//z2iyDwXEHjNXLdBfPxv644FcHRCL75yVVwiULe9zRalxMdTyUIBBom5biLQbHnzvOc48a5IqGid955B5/97Gfx9a9/Hdddd10lhjBhulkBWEe6UrNUR7uFeseBQcTiMj521mwc7g3qcYRMLZ9Z47ZQNIFYXC7ILTKjtQ4+j4iWBg8WdDRi7vQG8ByHOq8LLlFAndelH2Yzf0YjfnPnpaYDskWBA8dx6Gith88joq3JB7+hf48dzFIx7pCZb9/rFnU/tbEtst+X39GHTrhEXk+7dMLvc+kuKe363PZDbpMLyJj+K6YVuxWCwPO6W7HO0CY7HygNlMiFsgtAT08Pbr31Vvz4xz9GV1dXuR9fNA6eGAPHAfNm2AgAy2ax6cLGcu2v6pwHj0vAK+8eA2COAVjFgH01OGrv4jGyaJa9f7mQvHfjwlzIghLSBSB1H7bgCgKnB8vrJ1kvFo8r1ea6kG6lBFEtlP2v98EHH0QsFsPGjRuxfv16rF+/Pq8YQLVwqCeImW31tgFTUbcA0nfqLIOnsd6NzuXTsfX9PoQiCVMWkKoC0xo9ae/NRQCWzmmxfd3lyv9XbZcNlA92AsCygASO04Pl9TnuvKsFtsufSDEZQVQDZf/P+853voPvfOc75X5sUVFVFYd6xhwPkciUwsiasPEch4tXz8Kr20/gjfd6cOq81MLdNxKBz+MCYA7GDjgIgChwkJL58k6LaSGLVa4uESdYrKbB0MiMucd43igAk8sCYBXNE0klJYhqgP6CC2BgNIpQJGFqAGeEuQVsBSAh62mEc6c3YPGsJrz67nGTBXDXf/zdNtDdNxJJew0w94p3ajFQiB95ogLw9ZtW47oLF5jGxBZNgZ+8FgCgWQETqSYmiGqA/oILgHUAtbaAYKQsALsunIop//6SM2ehdziCPd1DpuuGg9G0itHhYMz03saka8XYKtipZ0xBAjBBF9Ccdj/WfXSB6TWjBcDuP9liAIBWQzGRamKCqAZIAArgUM8YREE7AtIOFgOwOwc3lpBN7YTPXtoOv8+FV949brouGpcxq60eM1vNRwcumtmkfz6tURMI4+EiHpeDC6gAAWhvcT62sFCYBcBxhiDwJLQAPC6BXEDEpIf+ggvg0IkxzJvud1wA+AwHucQTclo3yQvP6LD176sq0OQ3B4NPmZ0SgNbkwn/W0oAeaHWyAArJVmFpocVEMHT59E1mC0AkFxAx+Zl8W68KIysKunuDuOiMmY7XcBwHUeBMFoCiqjh8MoihYCzNT79m9Sw8/+aRtHYQdV4RHksswNh3qDVpAYyNxxFo9iEYTmRsMwwATfX5Leq3XrfCNtOpUPQun7I6aYPAgHbIjJ2LjyAmEyQAedIzEEY8oTgGgBmiwEOSVBw4Poqt7/fh7Q/7MBzUsnqWzzenarY3+7BiYSveO6j16PnEJYtx8UfmovvYELZ9OGC6dkayhcKyuc1YubgNL7x1FLMC9ZBkBQdPjCHQ5DVV8FrJd1d/1tJ2/fN8xcOOlAWgItDsQ51ncp7J2tbkK8r5CQRRSUgA8uSgXgGcWQDcIo8X3z6KF98+ClHgsGJBK/w+
F472hWwbkV121mxdAOq8IuZMb8DJvjE0GBrNnXNqO9qavfj57RfBJfIQBR73f+VC+H0urFjYikvPmo16rwt3//O5+OKPXyvirDWK4RISDS2fL141C2cvDUzodK5K8U9XLrM7w4cgJhUkAHlyqGcMdR4R7S2ZW/r+4xXLcLg3iECzD6tPaUOd14UnXz+Io30h29jAGYtaMWNaHU4OhU3ZN8y3v3x+C25ZvwIA4POkFky2e+Y5TnelTOTA81Kjt8mQVS0TaBK6f4D0840JYjJCApAnh06MYUFHQ9Z+66uXBLB6ScD02px2rSHTicGw7XtaGjw4ORSGz5AVw/rZl6otcrlZPKsJL71zDDPbip9hRBBEfpAA5EE8IeNY/ziu6pxb0PvnTNf6BrFYgJUzlwTw/uFhzGxNpZeyFMmpEnA897TpWNDRUJIUU4Ig8oMEIA+O9IagqCoWzMjs/3fCmK9vx6VnzsJFKztMLhy978wk9JM7QYs/QVQHJAB5oAeAs2QAOcFzHD575TIEHISA4zjT4t9U70GT342uznm4eNWsgp5JEAThBAlAHnT3jKGlwYNmf+HZMBetdK4fsMKybj6+ZlHBzyMIgnCCBCAPDvaMZU3/JIhq4fs3n1NQG3CidiAByJFQJIG+4QguPKOj0kPJi7nT/ViSPF+XqC1mt1fP+dlEdVITAhBLyNh1cBATyaQ8MaCd2evUAbRa+X//6ZxKD4EgiCqlJgRgy+6T+P3zH074Pm6Rx/wcBKAYLROKQTlbLKxa3IZ6X038ORHElIFTM51GXoUMDoZsD0vJhKqqODkUzvt9Vvw+V1p3zlIRCDSgvz9Y8PsTkgyAm7TpoxOd/2SH5l+78y/23HmeQ2urvTuwJrZsHMeho9W+d/9UpZrbQRAEUR1Mzu0hQRAEMWFIAAiCIGoUEgCCIIgahQSAIAiiRiEBIAiCqFFIAAiCIGqUSZcGyvOZD2KZStTSXO2g+dP8a5Vizj3TvSZdIRhBEARRHMgFRBAEUaOQABAEQdQoJAAEQRA1CgkAQRBEjUICQBAEUaOQABAEQdQoJAAEQRA1CgkAQRBEjUICQBAEUaOQAFQJoVAIV199NY4dOwYA2Lx5M9atW4e1a9fivvvuq/DoSsvPfvYzdHV1oaurC/feey+A2pr/T3/6U1x11VXo6urCQw89BKC25s+45557cOeddwKorflv2LABXV1dWL9+PdavX48dO3aUb/4qUXG2b9+uXn311ery5cvVo0ePqpFIRF2zZo165MgRNZFIqDfffLP66quvVnqYJeGNN95Q/+Ef/kGNxWJqPB5XP/OZz6jPPPNMzcz/zTffVG+66SY1kUiokUhEveSSS9T333+/ZubP2Lx5s3ruueeq3/zmN2vq719RFPWCCy5QE4mE/lo5508WQBXw6KOP4nvf+x7a29sBADt37sS8efMwZ84ciKKIdevW4fnnn6/wKEtDIBDAnXfeCbfbDZfLhUWLFqG7u7tm5n/OOefg97//PURRxODgIGRZxtjYWM3MHwBGRkZw33334ZZbbgFQW3//Bw8eBADcfPPNuOaaa/CHP/yhrPMnAagC7r77bpx99tn61319fQgEAvrX7e3t6O3trcTQSs4pp5yCVatWAQC6u7vx17/+FRzH1cz8AcDlcuH+++9HV1cXOjs7a+r3DwDf/e53cfvtt6OxsRFAbf39j42NobOzEz//+c/x29/+Fo888ghOnDhRtvmTAFQhiqKA41ItXFVVNX09Fdm3bx9uvvlm3HHHHZgzZ07Nzf+2227Dli1b0NPTg+7u7pqZ/2OPPYaOjg50dnbqr9XS3//q1atx7733oqGhAdOmTcMNN9yA+++/v2zzn3TnAdQCM2bMQH9/v/51f3+/7h6airzzzju47bbbcNddd6Grqwtbt26tmfkfOHAA8Xgcp556Knw+H9auXYvnn38egiDo10zl+T/33HPo7+/H+vXrMTo6inA4jOPHj9fM/N9++20kEgld
AFVVxaxZs8r2908WQBWycuVKHDp0CIcPH4Ysy3j22Wdx0UUXVXpYJaGnpwe33norfvzjH6OrqwtAbc3/2LFj+M53voN4PI54PI6XXnoJN910U83M/6GHHsKzzz6Lp556CrfddhsuvfRSPPDAAzUz/2AwiHvvvRexWAyhUAhPPPEEvva1r5Vt/mQBVCEejwcbN27El7/8ZcRiMaxZswZXXHFFpYdVEh588EHEYjFs3LhRf+2mm26qmfmvWbMGO3fuxLXXXgtBELB27Vp0dXVh2rRpNTF/O2rp7/+SSy7Bjh07cO2110JRFHzqU5/C6tWryzZ/OhGMIAiiRiEXEEEQRI1CAkAQBFGjkAAQBEHUKCQABEEQNQoJAEEQRI1CAkAQBFGjkAAQNcnNN9+MoaGhCV/z5ptv4uqrr876vKVLl9re66WXXsIPfvADAFpb4Oeffx7Hjh3D6tWrs96TICYKFYIRNckbb7xRlGsmymWXXYbLLrus5M8hCDvIAiBqjm9961sAgH/8x3/E1q1bsWHDBqxbtw7XXHMNnnzyybRrenp68Morr+Cmm27C9ddfj4svvhg/+clP8n7uT37yE1x33XVYv349XnnlFQDA448/ji9+8YtFmRdB5AtZAETN8aMf/QiPP/44fve73+ETn/gE7rjjDqxduxa9vb248cYbMW/ePNM1LS0tuOOOO7Bx40bMnz8fvb29uOSSS/CZz3wmr+fOnj0b3//+97F3715s2LABf/3rX0s0Q4LIDRIAomY5cOAAYrEY1q5dCwCYPn061q5di9dff93kg+c4Dr/85S/x6quv4tlnn8WBAwegqioikUhez/vkJz8JAFiyZAkWLVqEd999t3iTIYgCIBcQUbNwHJfWZ11VVUiSZHotHA7juuuuw+7du3HaaafhjjvugCiKyLeNFs+n/t0URYEo0v6LqCwkAERNIggCZs2aBVEU8cILLwAAent78be//Q3nn3++fo0kSTh8+DBCoRC++tWv4tJLL8Wbb76JeDwORVHyeuYTTzwBANi9ezeOHDmClStXFndSBJEntAUhapIrrrgCn/3sZ/GLX/wCP/jBD/Bv//ZvkGUZt956K8477zz9mg0bNuCnP/0pLr74Ylx55ZVwu91YsmQJFi9ejMOHD8Ptduf8zKNHj+Laa68Fx3H413/9VzQ3N5dodgSRG9QOmiAIokYhC4AgisADDzyAZ555xvZ7n/vc53DNNdeUeUQEkR2yAAiCIGoUCgITBEHUKCQABEEQNQoJAEEQRI1CAkAQBFGjkAAQBEHUKP8/4tJAb6qNkeYAAAAASUVORK5CYII=", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.lineplot(data=modin_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABOIElEQVR4nO29eZhcZZ33/T1Lbd3VW7qrk86+kQQSSAIINAJh8Q1LEwIIDvoYx0FH8eIRRX0Q0Uvf10s0cDmDMuroCKLiwgMj+yDCsA0hkQAhCUmArJ210/tS1bWe5f3j1H3qnFPn1Na1ddfv8093V5865757ub/3b705VVVVEARBEDUHX+kBEARBEJWBBIAgCKJGIQEgCIKoUUgACIIgahQSAIIgiBqFBIAgCKJGIQEgCIKoUcRKDyBfhofHoShTv3ShtdWPwcFQpYdRMWj+NP9anX+x587zHFpa6m2/N+kEQFHUmhAAADUzTydo/jT/WqVccycXEEEQRI1CAkAQBFGjkAAQBEHUKCUVgFAohKuvvhrHjh0DAGzevBnr1q3D2rVrcd9995Xy0QRBEEQWSiYAO3bswCc/+Ul0d3cDAKLRKO666y784he/wHPPPYddu3bhtddeK9XjCYIgiCyUTAAeffRRfO9730N7ezsAYOfOnZg3bx7mzJkDURSxbt06PP/886V6PEEQxJShVF37SyYAd999N84++2z9676+PgQCAf3r9vZ29Pb2lurxBEEQU4Luk2P45i+3oGdwvOj3LlsdgKIo4DhO/1pVVdPXudLa6i/msKqaQKCh0kOoKDR/mn+twuYeiUl44IE3oQJYOK8Vfp+rqM8pmwDMmDED/f39+tf9/f26eygfBgdDNVEgEgg0oL8/WOlhVAyaP82/VudvnPtvnnsfPQPj+D+fXI1IKIpIKJr3/Xiec9w4ly0NdOXKlTh06BAOHz4MWZbx7LPP4qKLLirX4wmCICYVW9/vxaadPbiqcx6WzWspyTPKZgF4PB5s3LgRX/7ylxGLxbBmzRpcccUV5Xo8QRDEpGFgNILfPf8hFs5sxPoLFpTsOSUXgJdffln/vLOzE08//XSpH0kQBDFpkWUF//HMHqiqii9csxyiUDpHDVUCEwRBVBGPvrQP+4+NYsPapWhv9pX0WSQABEEQVcK+YyN45IUPcN7y6ehcMaPkzyMBIAiCqALC0QT+4+k9CLTUYcPapWV5JgkAQRBEhVFVFb//24cYDsbwjU+fBZ+nPPk5JAAEQRAVZvOuk9j6fh/WX7gAy+ZNK9tzSQAIgiAqSO9wGH94YS+WzmlG13nzyvpsEgCCIIgKIckKfvXUbogCh39edxp4Pv/2OBOBBIAgCKJCPPH6QXSfDOIfr1iGaY3esj+fBIAgCKIC7OkewvN/P4KLVs7E2cvy74tWDEgACIIgykwwHMcDz+7BjNY6fPKyUyo2DhIAgiCIMqKqKh567gOEIgl8Yd1yeNxCxcZCAkAQBFFGXn33OLbvH8ANaxZh3ozKnnlAAkAQBFEmjveH8MjL+7Fi4TR87CNzKj0cEgCCIIhykJBk/Orp3fC5BXyu6zTwBZyIWGxIAAiCIMrAo68cwLH+cdzcdRqa6
t2VHg4AEgCCIIiSs2P/AF565xj+n7Pn4IxFrZUejg4JAEEQRIn5/d8+xOyAHzdcvLDSQzFBAkAQBFFiRkIxrD6lDS6xcimfdpAAEARBlBhVBaog5psGCQBBEEQJUVUVAMBVoQKQABAEQZSQ5PpPFgBBEEStoZAFQBAEUZswC6DMrf5zggSAIAiihFAMgCAIokahGABBEESNoscAUH0KQAJAEARRQigGQBAEUaOooBgAQRBETUIxAIIgiBqF6gAIgiBqFIoBEARB1ChUB0AQBFGjUAyAIAiiRiELgCAIokZJBYErPBAbKiIATz31FLq6utDV1YV77rmnEkMgCIIoD3oQuPoUoOwCEIlEcPfdd+Phhx/GU089hbfffhubN28u9zAIgiDKgpL8WIXrf/kFQJZlKIqCSCQCSZIgSRI8Hk+5h0EQBFEWqjkGIJb7gX6/H1/5yldw5ZVXwufz4SMf+QjOPPPMcg+DIAiiLFRzFlDZBeCDDz7AX/7yF7zyyitoaGjAN77xDTz44IP4/Oc/n9P7W1v9JR5h9RAINFR6CBWF5k/znwpEkz6g5sa6nOdUrrmXXQA2bdqEzs5OtLa2AgCuv/56/OlPf8pZAAYHQ1AUtZRDrAoCgQb09wcrPYyKQfOn+U+V+Q8OjQMAgqFoTnMq9tx5nnPcOJc9BrBs2TJs3rwZ4XAYqqri5Zdfxumnn17uYRAEQZQFPQZQ4XHYUXYL4IILLsCePXtw/fXXw+Vy4fTTT8cXvvCFcg+DIAiiLFAMwMIXvvAFWvQJgqgJqjkLiCqBCYIgSkg1WwAkAARBECWEzgMgCIKoUeg8AIIgCAee2dyN/377aKWHUTIoBkAQBOHAW+/34a0P+io9jJJRzTGAimQBEQRBMOIJGbKiZL9wklLNMQASAIIgKkosIUOJT93qfuYCqkZ3CwkAQRAVJZqQkUgoUFS1KnvmT5SUC6j65laNokQQRI2gqiricRmKqiISkyo9nJKg0olgBEEQ6SQkhR2YhVAkUdGxlIrUgTDVpwAkAARBVIxYQtY/D4WnpgDoMQASAIIgiBQmAZiiFkA1p4GSABAEUTFi8VoQgOpNAyUBIAiiYsQSqfz/4BR1ASlkARAEQaRjdAGNR6emAFAMgCAIwgajAExVC4BiAARBEDbEkwIg8BzGKQZQdkgACIKoGNFkELilwYPglBUA7WMVrv8kAARBVA7mAmpt9E5ZC6Cam8GRABAEUTGYC6i1yTv1LYDKDsMWEgCCICpGLCGD5zg0+z0YjyR0f/lUQgX1AiIIgkgjFlfgcfPw+1yQFRWRmJz9TZOM1JGQ1acAJAAEUcMEw3Fs2XWyYs+PJSS4XQIa6lwAgFAkXrGxlApFIQuAIIgqZNPOHvz62T0Va8MQSyjwuATU+5gATL2W0EwA+Co8FZ4EgCBqmIGxKIBUMLbcxOIyvC4BDb6pawHISR+QwFffclt9IyIIomwMj8UAAAm5MmfyxhIy3G4Bfl0Apl4mELMABLIACKL8qKqKhFS54KKiqpAqtMBmYyhpAUhSZcYXT8jwuAT4WQxgCraDkGVyARFExXh523F88cevYTgYq8jzX9t+Anf+aktFnp2NoWBlLYBoUgB8HhEcB4SmYEM4mSwAgqgc2/b2AwB6Bscr8vzhYBRDYzG9IrRaiCVk3eWSqJAFEIvL8Lh48BwHv881JS0ARSUBIIiK4XEJAMyHj5QTtu4zV0C1YLSIKiUA8YQMj1sEAE0ApmAMQE5aV+QCIogK4HFrAhCtUKYLQ1aqKw7A/P8AKhaj0NJAtWVoygoAuYAIonLoFkDFBaC6LIChscpaAIqq6kFgQBOAqdgPSFFV8BxHzeAIohJ43ZV1ATGqzQU0FExZAJUQgERCgQqYBGBKWgCyWpXuH4AEgKgBKh0DYJAFYIZZZG4mAHVaEHiqNYSTFbUq3T8ACQBRA1Q6BpAKAldZDCAYRUuDB0BlYgBMAJiFxhrCRSss1MVGIQEw8/LLL+P66
6/HlVdeiR/84AeVGAJRQ7AujGQBmBkei2F6iw9AZS0AowsImHrVwLJKLiCdo0eP4nvf+x5+8Ytf4Omnn8aePXvw2muvlXsYRA3BXAqV3llKVSYAQ8Eo2lvqAFSmECzNBTRVBUCuXgtALPcDX3zxRVx11VWYMWMGAOC+++6Dx+Mp9zCIGoIV4lQqC4gdCFIOF5CiaG0n2KLqRCQmIRKTK2sBxJkFoO1DG3xuAFNPABRFhSBUpwCU3QI4fPgwZFnGLbfcgvXr1+NPf/oTmpqayj0MooZgPvhYvLKthsvhAnrhraP4zgNvZr2O1QBMa/RCFPjKuoDcqSAwMPUEQFbUqjwMBqiABSDLMt5++208/PDDqKurw5e+9CU88cQTuP7663N6f2urv8QjrB4CgYZKD6GiFGv+dXXazlJWK/MzrUvubBsafKbnv39oCPM6GlDnddm+r5CxjkYSGBiNoqm5LqMVcGQwDABYOLcFHhcPl1ss+8/Gc3QUANAxvRGBQAM8dUlPQLJt8lT5+3e5BLhdQl7zKdfcyy4AbW1t6OzsxLRp0wAAH/vYx7Bz586cBWBwMKS3V53KBAIN6O8PVnoYFaOY8w+GtHTHUDhekZ9pOKz1uB8cCqHfry32kqzgW7/YhPUXLMDV589Pe0+h8x8ejQAADh0ZwrRGr+N1h44OAwB4WYHAcxgLRsv+sxkY0nozjQej6Oc0VwnHAT39IQCYMn//4eQZB7nOp9j/+zzPOW6cy+4CuuSSS7Bp0yaMjY1BlmW8/vrrWL58ebmHQdQQuQaBt+3tx+//9mHxn5/8aAwCK4oKWVHROxQu6rPYmbrBLE3VhsZi4DigucENl1ghF1Dc7ALieQ71XhfGyQVUNsouACtXrsTnP/95fOpTn8JVV12FmTNn4uMf/3i5h0HUEGzdzSYAu7uHSno+rrESmMUl+kejDlcXRiSmxTmCWU7WGgpG0ez3QOB5iKJQkSygqCUNFJia7SCquRCs7C4gALjhhhtwww03VOLRRA3CLIBITIKqqo49WRRF602T6ZqJYGwGxzKT+kciRX1GJBnozsUCmJYsAnNVKAgcT8gQeA6ikNqH+n1TzwKgLCCCqCB6Ja6S+WQuWdESNou+GGZoBz0SjBX1eboFkE0AgjG0JGMELpGvTB1AXE4LVPt9rqxjn2zIyhQoBBsdHUUoFCrlWAiiJBh7yzAfuR0suSBeot2wMQ2UjUmFuS3zREnFAJxdQKqqYngsarAAuIocCRlLyHobCIa/zoXxKXYqmKyoECZrDODgwYP4+Mc/js7OTpx77rn49Kc/jRMnTpRjbARRFIy9xSIZagF0AShywZheCGZwARltgf5RezeQoqrYe3Qk5+coqopoDhbAeFRCXFL0LKGKBYETzhbAVGoIN6ldQN/61rdw4403YseOHXj33Xdx+eWX49vf/nY5xkYQRcF4FGOmfkBsh16qimG7IDAA9I/YWwAfHB7Gxj9uw/H+3CzvWFzWhSWTBaAXgTELQBQqlgXEqoAZDT4XJFmpeN+mYjKpXUCRSAQ33XQTXC4X3G43NmzYgIGBgXKMjSCKgnGxjSecF7qUBVDcxdAYg0i9lvp8wCEQHI5qu/lYjuNh/n8gczUtawPNLACxUjEAw2EwjPpkP6Cx8cxZTJMJWVEmrwto4cKF2LZtm/713r17MXv27JIOiiCKiXGxzbS7L7UFYAxAmywAh1RQtitXkZs7JJLcNfMcl9EFxA6CmdZY2Swg7ThIswA0MAHIYMFMNjQXUHXm22RNAz1x4gQ2bNiApUuXQhRF7NmzB4FAAOvWrQMAPPPMMyUfJEFMBCVHAWDXxaUSuYCMFoDhdScLgO3Kc3WHMwugtcmTxQUUg8BzaKzXWlS4RK4iFkA8IesixDBaAE0eX9nHVAqq2QWUVQC+8Y1vlGMcBFEyjAtoLhZAsV1A1vtrY0odFO5UC6DvynMUABYAbm/2YXf3sOZ64NN3nuwgGFad6hKEimUBpVkAd
QYX0LSpIQDVfCCMowAcOHAAixYtQn19ve33qX0DMVnI1QWklDwInO4Cam30om8kgkhMgs9j/nfM1wUUTgpAoKUO6B5GKCKhKbnLN2IsAgMqVwcQjacLADsTIDilYgDV2wrCUQDuvfde/OpXv8KNN96Ijo4Ocy51JIItW7aUZYAEMVEUFXC7eMQTmbNL5FKlgWYIAgdafOgbiaB/JIK5080dIPN1AbFWF+3N2s45GI47CEAUi2elWrCLyTTQUlVAOxG3CwJ7XeAw1YLA1ZsG6igA99xzD0ZGRrBo0SI8/PDD+h9HIpHApz/96XKOkSAmhKqq8LpFxBPxnCyAcriAGO3NPuwGMDAaTRcAZgHkqAAsa6g9echLyCYQrKgqhoMx/SxgQLMAVFUbn1imhUpRVcQlBW5LGijPc6jzihljGJONSekC+vrXv4433ngDHMehs7NTf10QBFx++eVlGRxBFANVBUSBgyhwucUAihwETp0IZugGyiyA5G7dLg6Q70Ht0bgEDkBbk5beaddULTgeh6yoplbRrmSGSjwh48W3j+LiVbPS3FHFJq4fCJ/+HL/PNeUsgEkXBH7wwQcBaIVgP/rRj8o2IIIoNqqqggMHj0tAPJ69DsBOJFRVxeP/cxDnr5iBjlb7uFg2JEMlMHPr1/tE+DwCBpLFYGPjcQgChwCMFkBu9w/HJHg9gu72sdtFDwWTNQAWCwAAdh0awmOvHMD7h4fxtU+symNm+cNqG6yFYIDWDmKqxQCq1QLImpxKi3/+DCf/yYjqQFEBjtMOHy80C2g8KuG/thzGtr39BY/D6AJiT+A5DoEmn94O4vZ/24Qv/+R1AMYgcG5EYzK8blFPpbSrBTAeBclgAsDYdXAo5+rjQmHHc9qdWub3Ti0LoJpdQNVZnTDJGR0nAagmVGhZGJ4sAqDXAdhcw4LHuVblWgYAwNINNPksjgPamn0YSBaDsSuMXUJzjQFEYhLqPCJEgUedx96PzqqAWwz598wFZKwk/uVTu4seDDeSsgBsBKDONaUKwWRFmXwuIIKYbKiqivGopKcSpl7XFtpsApCqBNYWp0hMgkvkIQq8/j6nRTEhyZAV1danrd/fJg2U4zi0NXmx6+CgaaE/OTSuxwCsy38sIQNq6iQtRiSuuYAALZ/e1gIIRuESeb3iFkhZACyN9JqPzsfTb3TjkZf34zOXL3Wcz0SwHghvJFMMQFFUHB8Yz6tZnMctYHpLXWEDLQKaC6g699okAMSUYceBQfz7k7vwL7d+1CQCLIPN4xYy7mqVpI+eXXPrff+D0xe24vZPrNQXLCcBefSVAzjSG8S3Pn1W2vfYUmVyATELAFogOC4ppkVPUVTHQrBfP7MH8YSMr/3DKtPrkZiku3/8dS7bfkBDY1oGkDHdU7RYAGcuCSAuKXj+zSM499R2LJ3bYjvniaALgJ0F4HMhnpBtC8VeeOsoHn1lf97P+95nP4J5M8p/yPzQWBSqCrhFEgCCmBDW9EUrIyHNbRIMx00CwA4b97gEhDIclagHgQ1ZQO8dHNRei2e2AIaDMYyGMrstJJs0UI7jEGjW/PHGnkCKoqbqACwK0DsUxsmhsNZN07CDjsRktDVpWUUiz9umnQ4Fo6YAMJCyACJRWf/62gsWYMvuk3hq0yHc8aniC0A87iwADXVaEHs8kkj7fu9wGPVeEf901ak5PScYjuN3z3+Io32higjAE68fhChwOG/59LI/OxdIAIhJw+h4ZgFgC7i1sZmqIhkD4DEwmlsQ2OpiiCYyxwAkWTH1HLK9v8EFpOguIOiLtjEVVFHVVHsGy22DkQRkRcW+4yNYsaBVf91YTczz9v19hsZiOHWeeUFPuYA0i8El8HC7BFx13jz8+b/34YPDw1g2r7giEM3gAqr3poLYxmA1oGVJtTR4ceaSQE7PkWQFD/9tL/qKfPRmLhztC2Hzeydx+Tlz9d9xtVGddglBFEAqj9+88CnMBeTK5gJKBYEly/GN2SyAhJRBA
GwqgVNBYE7P2zc2hTNaAOa3qXqB1weHR0yPicQl+JIxAI4DVIsFICsKRkKxtAZsugAkC8nY12tWzkST342nNh2yn9cEyOQCYv2AQjYng42Ox9FU70p73QlR4NHa5Cn62cu58Nir++HziLiqc17Zn50rJADElMHpRC/NAgDcbiFjFo+xHbS1CCtbDECWFVuXi9392ZgALQbgdglo8rstLiCjJWM80lLSheaDI8P665KsIJ5QUhYAx6UJ0mgoDlUFpjWYd9UsC4gFgcWkALhdArrOm4cPj47g/cPDKCYpF1D6EsTiGHaVzKOhOBrrna1AO9qbfegbDhcwysLZ0z2EXQeHcPX589OSEqoJEgBiyuB0pq9qsABySwNV0twnKQvAXkASspq249afr1cCpx8JyWKxgSaf2QJQUwfYG9dxVt3b2uhFd09QD9yyPkA+d8oFZB1O6iAY8wIqWrKAXIbe9WtWpayAYh7TyH4PdnUALEPJGsRWVVWzAPzp/Y0yEWipQ99w+SwARVXx2CsH0NrowWVnzSrbcwuBBICYMrAFPC0GgFQaaEJSdKGwYmwFYW2PnM0C0GIAmcdnfyKYpgBtzV7T0ZDGLCDjbdmu+CPL2k1nBjMhMFoAVkHSD4JxsAAiFgsA0I6L7DpvHvYeHcEHRbQCYgkl2Z7DzgLQ5mAVgEhMgiQrtg3uMtHe7MN4VCrbYfNb9/TicG8Q11+0CC4xXeCqCRIAYsrg1M3TGAMAnBdxYzO4NAuA1QE49AmSZGdhsUsDZes/qw9qa/LpC7R2rWJbCMYsgJWLWyEKnO6aSQlAKgZgdQE5WQB6FlBMgihwaa2L16yaiWa/G89s7radXyHEbFpBMwSeh88j6jEJxmgyTbYxXwFINscrhxWQkBQ8/j8HMbfdj3OrNPPHCAkAMWVwdgFpCy3zNzsFco0CYrQAZCXVRtqpnXRuWUDGE8EMQQAAgWavydVjigEYXmcWwLRGLxbPatLjAEwAvJ5MLqAoPG4hrdFbSgDktLYQ2vcFXHbWbHxwZAS9Q8XxpccSsq37h1Hvc+lZSQxWJ1GIBQDYN9wrNq9sO4aB0ShuvGRx1Z4BYIQEgJgysAU8kRYE1iwAdwYLQFFVqGqyNTJS/nAAON4/nkMaqOpoAejXKPaVwIAWAzCNR1FtK4GZW8Tvc2HZ3BYc7Q0hFEno5wHXGYPAlvEMB7WDYKw9/42Lvp1LBgDOX9EBjgM2vdeTcY65YlfkZcTvczlaAPkKAOu42ltiC2A8msAzm7uxfME0LF8wraTPKhYkAMSUIZMFwHGA180EIH0RZ+/1Ja8Zj6QWn4M9Y7rV4OTqySkNVE53AbGluK3Z7JeXVdW2G2gwEococPC6BSyb1wIVwIdHRlIWQHL8mgWQHgOw5tUDMDUqs7MAAKClwYMVC1qxedfJrEKXC9kEoN5OAJKFdk3+/LKAPO5kllWJBeC5LYcRjkq48eJFJX1OMSEBIKYMTv38VWsMwMaNw/zszIViDBgeOjGmZ9kA9haErChQnDNM9Wv05yFVBwBogVnjQpxIyPrO3xgDCIUT8Ptc4DgOC2c2QhQ4HDwxqp8HnLIAkLZQW4+CZHAcpy/8LgcLAAAuOKMDw8EY9nQPZZ5oDsQTsm0RGMPvc5msMAAYC8chJA+MyZf2Zl9Ji8EGR6N48e1jOG/5jLSDfaoZEgBiyuB0opeiqFodQAYXEBMP5h9nu886j2iyAIB0CwMAEpKadCOl744zBYGZN4bnObQadudRh1hDKJKA36e5QERBC5ZG47LuAvIas4AMY5FkrdeQnQUApBZ+JwsAAFYtbkO9VyyKG8juPGAjThZAY727IN96e0tpawGefP0gAOC6ixaU7BmlgASAmDI4pYEqQNYsoDQXUNICWDKnGSf6x3X/s937VYecfSvmdtDaB+NaZnQDReOS9VIAWhYQq5QFkge6SwoiMQkCz+lNxzhLEHg4GIMK2FoA7D7Gj07XnHfaDGzbOzDhlErNB
eT8LL9NEHh0PJ53BhCjvdmHkVC8JC2uj/QGsXnXSXzsrNlV2/LBCRIAYsrgXAmsWQAed+4WAIsBLJnTDBVAz2BYX6yt97fr8mkZgON1HFIKYFw8jBaAnQuI4RJ4JGRNALxuQXcpWYPAdgfBGMnFBQRobiBJVvDmnt6M12XD7kB4I/U+FyIx2TSH0fFY3gFgRnuyHXQpMoH+89UDqPOK6Dq/els+OEECQOSNoqi4/z93Yv/x0UoPBS+8dRQvvXMMgHMvIC0InJsFwPr5jyd3nwtnNurXsApV6/uNbSMyBUhlmyCB0QIIGCwAp3TTUCQBv4MFYEzv5C11APpRkI32FgDL/hGztC2eO92P2QE/Nu2cmBsollDgzhIDALT+RgytD1ChAlCaWoDd3UPYdWgIXZ3z9SZ2kwkSACJvhoMxbN8/gH9/clelh4JHXtqHP764F0CmbqAsCJysA7BZXFMWgLYojSQXzOnT6vTFsSG5+FhjDJLNYe92mLOAzEFgIJWuCJhdQOyeiqJiPJJIO8xFEwDZJAAcz5ncUboF0DAxC4DjOFxwRge6TwZxbALHRsYSMrxZLABAO4oT0H4GwfFEwS4g9rMtZiBYa/mwH62N3qpv+eAECQCRN+x4u2yFT4XQMziOgdHC/kll1stHslYCp84EBrLEAJKL6MmhMESBR2OdC63Jbp2NyT711vcbBccuE8guCKy3gzZcZ3QBmSyA5LWhaAIqUv3yAW3nnpBkGwvA4gIKxlDvFR0zb3KJATDOWz4dAs8VbAWwNhfZXEAAEEkKQCiSgKKqBVsAfp8L9V6xqBbAm3t6caQ3hOsvWlj1LR+cIAEg8oZlKzo1P5sI3/71m7jj37fkdK0prVJVHbOAtBiA1ndG4DnbOgAmHmwRHQnF9aIpFjhlwde0GICpz38mF5AhS8hwJjBjVqAeCzo0l5NRZNgdWRWw32IBSLKqtYI2LO7WbqDDYzG0OOz+gdyygBiNdW6sXNyGv+8+mdY1NRcyNYJjMAEIJ4PNYwXWABgJFDEVNCEpePy1ydPywYmKCsA999yDO++8s5JDIAqA0y2Ayo7D2C44lpAz1AGkFlqv274jqDULCEj5y9nC72gBZIkBGDWBjZG9w+gC8rgE3PHJ1QCg5/VrN9A+6FXAdZYgMIsBGPLjed4SAxiLOvr/gfwsAAC44PQOjIUTeO/AYE7XG8l0HjDDb3EBFVoFbKS9xVe0YrCXtx3D4FgUN146OVo+OFExAdiyZQueeOKJSj2emADsD74YFaHZGBqLYsvukwCA//vyfvzLI+/q3zOmZo6FE1ljAIC267QLsLKF2Wtwo7AdM3O5sMWx0BgAYIgD2KSBAtrCDZizgNg92SHvaTEAORkDcFtdQKn7DgVjjhlAgCEInCUGwDh90TQ01rsLqglIHQbj/CzdAogxAdBiMoXGAABNAAZGowVZLUbGowk8y1o+zJ8cLR+cqIgAjIyM4L777sMtt9xSicdXLaFIoiR5yrk8N1OffCdkRcVIKJb1/cPBGBKSrO9gw9GEY5aLlQee3YNfP7MHQ2NRfHhkBLu7Uy2JjYeoB8fjGQ+EYQut05kA7L1uUdCvtVoAbOG33t+4oHT3BG2KwVJfM7eVapMGCqQsAmMQmMHOM7a6gOIJmxgAnyoEiyW0n71TDQC7j/FjNgSex/nLZ2DngUHT78GKoqgYTgbUGbEM5wEz/LoLSPs5jI1rfzsTsQACzT4oqqoHxAtlMrZ8cKIiAvDd734Xt99+OxobG7NfXEPc9tPX8aM/bKvIc7//27fyfl8sIeNrP3sDt/30dfzw4Xccrxsdj+Ff/u8O3PbT1wEA//snr+P//PvmvJ51YnDc5r5GCyBuCAKbd3iyoupWi5MAMAtAEFLpomzHfMqsJgDA3Bl+AJnTQO//y05s3zfgOA92MDyThHQLQHshZqoD0D4aG8ExXCKPSEyCrKh6BpN231QQeDhLCii7j/FjLnz0jA7IiqpbaHZs29uPb
/5ys0kEcnEB+TwiOC51TvHoeAxukdd7HRVCexEygVjLh84Vk6vlgxNlPxT+scceQ0dHBzo7O/H444/n/f7WVn8JRlVcRqMyAoHC/jgO9wb19xZ6j0LoGQzn/DxvOH3Hd7Qv5Pj+0aisH1zCrglFElmfFwg0YFqzDzgyApfHZXodAGSkFh6VFyAK2uIgyYrp3pKiorHBg0CgAf56N1Rwac8eZG2WW+rgTbZXWDC7GYFAAwKBBiyYOw2z2/3444v7ILhE0/tPjJh3lPtPBrH2owv1r73e1K61ubkOrU0+NA5obQlaWurSxsJxZhdQQ3LsMrQmcLNmNqe+5/fqfvJAq1+/V4PfAxVAW5sfJ4a18S2cM83xZ16fdHM1N/py/jsIBBqwZG4z/r6nF//rqtPSuowCgLJ3AJKsYiicwJKFbQCAo0PaAjw9+bN1wu9zQeV4BAINiEkqWhq9aG8vfNPIJ11k4YRa8P/WH/57HzgO+Ny1pyOQLC4rBeX63y+7ADz33HPo7+/H+vXrMTo6inA4jB/+8Ie46667cnr/4GCoLL7niTA8Mo5+b+E7lf5+TQT6+4NFHFVuz80F60lN2d4/PDJue0225/X3B6EmfeZ9hpxz9r6TA6nXTvSOIZrcLcbiCvr6xvQFKRJNALKK/v4gOKgIheNpzx5MWhjBYBRichcuqKp+nZcHBgZCcIs8RkYjpvcPWKyT7R/2mb4fiaQEs68/CCUuYWREE4BRy70AzX8fMxZAjUXR3x9E3+A46r0u0/VSInWdHJf077Fn9vaN4eBRzW3GK7LjzzySzLaJx6S8/u7OPXU6Hv7bh3jrvRN6BpOR0WRK73t7+7Bouj/5M9B+b5HxmOOzAoEGeN0CBkfC6O8PondwHH6fOKH/CVVV4RZ5HDw6jP7+trzff6Q3iFfePorLz50LTnL+WU6UYv/v8zznuHEuuwA89NBD+uePP/44tm7dmvPiT9QezE0QsYkZRGISGuvdiCdkzQXEMmxUFbKiQhSS7pSEDLdbc214XIJ+MpYRtqkQOKMLKN1l4nHxGWMAgGZNjYZitimLbIyZYsUcx9m6qaxVwIDZZWOtA9DmlToKsiVDDID5pPJxAQHAuae245GX9mHTez22AsDme6Q3JdbxRPYYAADUeVMN4cbCcd2FUygcxyHQ4iu4FkBv+dA5+Vo+OEF1AERVw5qbRWPpQdFITMt9b6xzIxhOmDJwWMBWkhVIsqovNh6nLKDke3leOzjG4xL01sqm8djEEOyySj5MurwAczM3lgXE2kHbpRDyvDmzSDVkARkzgABz5a4xBmAs1hsai6GhzpVTsRITzVyp87pw5pIA3tzdi4TNcZnsd3KkL7WjzaUOANA6sTIBGA3FJ1QDwGhv9hXUD2j3ocnd8sGJigrA9ddfj40bN1ZyCEQB2LU8LhVsfRywydyIxmV4PSIa6l0YM2QBAdAXI7bb9BoFIEMWkMBrLSOmNaafnAUwAbCmgaa+ZgfPOB2gnosFYBUFvRAsErexAFKLqKkVBCvWU1XtIJgMRWBG7OacjQtO70A4JuFdm+A3K5IbGovprkMW38gW0K3ziggnD4IPRRITygBiBJICkE8Vu6KqeOzVyd3ywQmyAIi8sab1lRKWy25sO8AWa7MFEDe1WoglM4HYYs0aj2XLAuJ5Dh89vQMfO2u27Xg8LsHGBZR6LgcOS+Y04/0jI7bvT6WBJq+3WW+NB8NoF2sftLMAnF1AXgcX0PBYLGMGkJFCSppOndeCaY0e29YQxt/J4V7NCsjVBVTvFTEeTej1D8UQgOktPsQlRT9dLBf0lg9rJm/LBydIAIi8eW37ibTXMv0zTyRob7dTY5W3rAFaQ53bVAgGpM4FZvn0ugXgFrTjGy1jMloAHz29A5ecaS8Abhef0QWkqiqWzW1B71A4JZTGSmDmArJpBsewvsbOG4jE5HQXkEEA6uwEIE8LoJDfFM9zOH9FB3YfGkrLsTcKwJGkAMQSMkSB191UT
tR5XIhEpaIUgTECelfQ3A6HSUiy1vJhuh/nnjZ5Wz44QQJA5I3dIpEpeDiRw0NsBSC5u4/GJXjdIhrrXQiFE5BkNVWtK7GiLe2jMQYAaItQz+A4jvVpwUnFYAFkwuMS0uoMJMPXKoBl85oBAB8eSXcDWeMFdhaAdQwqjG0gzIug0WdvdKmwe4SjCURiclYLgP2c06yPHLng9BlQAWzeZa4JUBTtdzKt0aMHgrMdBsPweUXEJQWDo5oAFMMCyLcW4KV3jmstHy6Z3C0fnCABIIqC0y7/rv/4O/704r6C72vnqmUCEIlJqEtaAIqqIhiO64ugUSSAVDaR3hI6IePbv34T3/3NVm38GXbkRtxiehA5YVnU57Y3wOcR8UFSAFRTJXAqUwmwd7mwNZgFwFU11fco3QIQ9GuNbRzYPQZHkxlA2QQgRwF0or2lDkvmNGPTez2mGJGsqOB5DnPbG0wWQKYiMEa9l3Vm1dJsiyEArU3a2cu5ZAKNRxP4ry3dWDEFWj44QQJA5I/NqhyOSYglZISjCQyMRvRd/8mhMHpz+Gc7cHwUvTZmuZ2wJCQZqqoiEpPh9Qh6k7bxqKQLwGDSFRGzWABOLaHZgpxt/fO4hbRmc8YYAKAtokvnNOMDmziAbJmPneCwRdhoVQXD6W0ggFQWkNeSscQa9g2MZj4HgHHGolYAwNz2wgstLzi9A33DEew7ljooSFZUCByHudP9ODkYRiwuI5blPGAGc2n1DGp/F8VwAQk8j9ZGb06ZQP+VbPlwwxRo+eBE2esAiMmPk5/43j9tw6GeVLrfb+68NOd73u3QSsLJBRSJSVBUFXVeEY2GzBjWEO3Xz+xB5/IZhsZjVheQedf+wtajAOzTMo14ssQAGMvmNmP7/gHNJ24TA0hZHOnPYGNwuwSMR7V5Bm06gQIpkfBZBIDdQxeALBbAectnYOXitrT75MPZywL444t7sem9HiyZ0wxAE3BB4DBvegNUAEf7Q4glMp8FwKjzpgTA5xGzpo3mSnuLL+umJBqX8NI7x3De8qnR8sEJsgBqjFJm8BgX/2Jhd+bA4FhM9yfPDvj1k7oAYFZA28EKlp46ugvI4Vzg7pPa2LO5ed2igHhCgaKqiMYlDI3Zd5dcOrcFAHQ3ECM9C8jGAuAsFoCaigE4BYF9FpeKUQA4AM055NBPZPEHtOM0P7KsHW990Kf/3GVF0VxAyUX0SG8wGQPIRQC0uZ4cGi/K7p8RyKEt9O5DQ0hICi48o6Noz61GSAAmMZKs4MePvIuDJ8Zyfg/LqGCUM6e/EOxCCyOhqJ5SOG96g+4CArQd+jmntqMtGexztgDsu5FmiwF4DDGGZ97oxv/327dsD5iZM92Peq+Y5gbSXUAZ0kCZC0iPASAVA6h3EgCrBZD8zx4cjaDJ7865zfNEueCMDsTiMt7+sA+ANl+R5zCt0YN6r5gSgBxiAMwFFInJRfH/M9qbfQjHJMeWJgCwfd8A6r0iFs9uKtpzqxESgEnMycEw9nQP46Hn3i/4HoUc62gVjXxK4/N9np1ASbKKwyeDaGnwoLHeDb/PpQdTeZ5LHpKiLfDWzpNMAOzOBQay58GzRTmWkBEMaznqdtk+PKfVA3xweNi2EjhjEFiPASQXyaQLqM4jpi3kLAbg6AIai2Y8B6DYnDK7Ce0tPr0mgAWBOU6zAg73hhBPyDm5c+oMB9wUVQCyHBCvKCp2HBjE6YtayyaclWJqz26KIyRTAKUJ5Nkb19dcrQHjZU31btNh5kZsWy7I+Y3VTjAkWcHh3iDmz9DcCjzP6b5xnufgcgl6+mcsLoNDauFmQhB1EoCsMYCkgCRk3fXDgpRWls1twcBoVM/EAQBJsaaB2rmAtI8mC8CmDxAAiLoFYHEBJW8yNBbLeA5AseE4rY7iw6Mj6BuJaDGApDkyb3oDjveHtGB9DgJQXyoB0FNB7X9v+4+PIhRJYNXi/BvGTTZIACYxzM8tT+CEI2OWjayoGBiJ5JW3r8J51
/ylf30t7TXrOb5AZqtAUdIPAg9HJZwcDGOeITjH3EACz8Et8ohLMv74wl4c6hmD2y3oCy27zniWgHFMWWMAhiCyNf3TyrJ5WhzgwImx1O8qpzRQcwxASwONp/n/AYMF4LZkARkmUk4LAAA+umIGOABv7OyBLKv63OdO90OSVYyNx3OKAbhEQd+BFzUG0JzZAti+fwACz2HFgtaiPbNaIQGoEgrxxRsbfhX+3NTnsqzijl9uwbd+9ffM7zF8PjYez6t/jGwRHO25zgupqqYfU7jv2ChUAPNmpASgwWgBiDziCQUvbTuGXYeGTIuNzyPA7eIxEkrFQozWQF4WgJRZOGYF6uH3uSDJij6H9CMhbSqBWQwg+SwVmgvImgIKZMoCSn1eTgsA0ATntAXTsHlXDyRZMQhA6vfFurNmg7mBimkBuF0CWho8joHg7fsGsGxus8kFNVUhAagSCirB55gFUJxWC8w9YRccGwnF8MundmluHcvj8imQNI6VCUBCSh//D3+7FcPBGKIJOa1DJTtcxigAbIfILAAjxqpTjuPQ7PeYBMDoqspaB2AoJJNkRXdT2Pm0eU6rBwBSFbtM7JxOBGPvA1IuIJYFZOcC8rgFuEQ+rdUzX0ELANBqAgbHYth7bFTfqMyYVge3K9WWOxdYILjJXzwBADQrwK4auGdwHCeHwlh1SqCoz6tWSACqhQkk41iLi/J6rFEAMgjJX147gK3v92Hr+71p38unRN5kAcipls1WtrzXg217+xFPyLqf23iPJr/blNrIDm7nOS6tYZfHZd7Jtfg9GDGkw0bysADMLiAVswJ+tDV54XFohcHcQAKzAPRuoM6VxyyDx2XJAmrwpS+CHpeA73/uHFxgSVfkDEqWrQq4FJy5pA11HhGRmKTHqniew5xkoVmuAlCvWwDFnUN7s/25ADv2DwIAVi6e+u4fgASg6Dzw7B5s29uf9/sm4saRlfxjAJKs4Lm/Hza5PyTJ+T7MhSEpqqm1AZAKRueC0d0j6RaA+bkXrdQWs3hCq/gV+dSfKSv6mmcpzmGvCzyn7zIZHou7obnBgxFDN0jTAexZLQBzENglcPjM5Uux/sKFttcvm9sMIGUB6GcCZ/h1C5YYQDQuIy4pthYAAExvqUtzkxlF2c51VGpcoqA3TxMMY2FuoFwFwJcUgGLGAAAtE2h0PJ6WqLB9Xz/mtPvR1jSxw2cmCyQARSQSk7Bl18m88vInAltECrEAduwfwH++egB/finVp8eYodKfzOAYHI3i5FBYX1AkWUmzVs5copnLs9rqsz53yLDzZu4gqwXAdtlxSYGimgVmfvLUqfkzzALAisFYDMCINeOkxe/BsEMMIJs1w8QllowBiAKPFQtbsXKR/Y5xZls9GupcaQF79iO0czkxlwlb1EMObSAyYdDMijUxY1aJYBAnJty51AEAKRdQg4P4FQpLBTW2hAiG49h3fLQmsn8YUz/KUUa6TwahQhOCfJlIQVYhMQDmJtnTPaS/ZnQBffvXf8dNl52CP7ywN+1Z1iIqtlAdHzCfjWukPXkU38Y/bjM8L9mx02IB8Jzmx39q0yFwnNZvnrFkTjN2HhjEKbObTe9h2T08z8FtcQFZ/fPNfrfJ6ojGjC4gxykAMFsACVnR3VNOriOO43D6wlacGBjHIBfTxZpZW3bN1ziLBRB0aASXCeOiXykBmD+jAXOn+03Fa4tmNuZcmQwAbU0+BJq9Rc/HZwLQOxzB7KRbaueBQagqsOqU2hEAsgCKyKEebefvlGOeiULWf+aKKcQCSJ2YZXDJGBZFSVZtF3RJVtIW7GyctTSAG9akN9RiP6ewJe2U41KLdmujF5+/+jT9ews6GnHPLZ1YvsDcnVFPA+XSg8DWk6eaLQFTowuIy+IDMsYANBeQ9qxMXTQ/vXYJbv/ESggCp/+uBseicIu87a6ebQbYoqc3gstjF2xc9CvVxZjjOHzjptX47BXL9NdmBfy450udumssG+s+Oh/f3nB20
cfGagGMFsD2/QNo9rtNyQVTHRKAIsIEoDALoIAHTiBwzBZxYx2AtUjJbh52AdtsTGuw38Gx+48nz31lu2uO43QXQWO9OdjrFnnbwrOGemMhmDULyGoBmAXAaNHkYwFIsqrPK1P2kNettawWeE631gZGo2ht8tpaDuxvgcUNxpIWQH4uIM7283Lj97nS0inbmnw5pw57XELR/f+A1meo3ivqB8MkJBm7Dg5h1eK2Kdn33wkSgCKSEoACLIACVvOJdPFhx/LZtSlgRKLpAlCItcFx9oeNh5kAJNNOm5OpfhxSC631gBKng2daG704Y1ErFs1qyu4CslgARqHL9s/P8xxEQesImpBSLqBcDlIReE4P2A+MRBwrqJ0sgIa63BfCanABVTvtLXV6KugHR0YQS8g15f4BSACKxkgohqGxGOq9ohYgzHOnXJgLqHDs3DjWPvdhGwtAltW84xUcZ79AWi0AtjM3WgDWxctJAESBx1dvXIkFHY3pQWC3NQhsXkhN58PmsFZ6XFqhmVbgxeljzoYg8LqADoxG0dZkn5/PfjNMAMajEjjOfORjNjjDj4DWf3tYXArQir/cLt4Ub6oFSACKBNv9M9+03eKZiVI25YzGJfzq6d2mVtDW1EvAHA8AiucC4sCZMkGs9x+PJsBznJ7Jw3EpC8Dqvsh09CQjvRDMLAAuUTD1mTFmJuXiLXEnD5bPNQbAYC6gcDSBcExyTDVkAmvMfqr3uvJy5ZgsgAq6gKqZQLMPg8l23tv3D2DFgtYpd+h7NkgAisShniB4jsNpyaPjxjO0mrWnEBMgt/ds3zeAN/f04us/f0P3d8dt2iFbX7MTMa0OID9ysQDqfaKpmIq5bay791z+Qa0iYZdyaKycNR5knstOPiUAxhhA7i6g/hHteY4WgJK6nt013zRIcgFlZ3qLD6oKbNvbj+FgrKbSPxkkAEXiUM8YZgXq9YVl3MZ/nolCinlzfYuxj0o4OS5bC0DK0QLI8OBrL1yQ/iJnXyxmjAHUe11w2QSBvZYmZ64c0gHTK4HTBcAYCDZaRrmslR4Xr4+dxQD4HP6TBIGHJKsYGNXcDjnFAJLjybeYyzgPWv/tYT//F946Cg7AGTVS/WuEBKAIqKqK7p4xLOhoRH3yFKP8LYB0Rsfj2HVwMMODc7uP0QWw84B2P9sYgMUCsAtmZ6s56Fw+I+01DpzeEtju/uPRBOp9ou66MQWBhQJcQFmygACzAIwZOoPmagGw368rDwtA5LU0UN0CaHawAJgLiOf0++YrAKYsIFIAW1gtwMETY1g0u8l0sFCtQAJQBPpGIhiPSlg4sxH1Pm3Hmk9LZcC+FcQ9f9yGf310x4RP7TK+/bXtJwCk6gCM5JLfLytKRt0xLjbnnNoOQNuB2s1BdwFFJM0C0IuqUm4bq+vILpvISloMwMYFZMwEMo4sl6XS4xL0329+QWAOsqxgYDQCn0fUNwtWUmmgqXnkkwEEUAwgF5rq3fpmYXWNZf8wSACKwKFk64f5MxoMFkCetQA2q+rJIS1H2S71MhiO6xWiVvYeHcGfXtyLE8lCLjtxycUCsGPr+30ZewYZFxu26+c42PbONwaB672iLgCKquoxAKsA5LLQWmsO7CwAp/bCudzf4xJ0V5q1UV0m3KKAuKRgYDSKgIP/HzAEgXlOd9/kHQMw/NzIALCH4zi9IKwW/f8AtYIoCgd7xuAWecwK1OuVpPlaAHY7ZJ7joKiqdq6qZQ37yv2bHO/1/JtHsH3/AP77nWO495ZOWwFI2Jxja80CcuLDZDtmO3gOuPdLnRAFHn959QAAzQVkt+CagsBel+5OkSQV3qQlZec6yoZ1EbcTAKuVkA9uF68LQC4xCUadV8TYeBwjIRkzptU5Xme2ADgAav4uIM74OSmAE3PatarfjtbsfaymImQBFIFDPWOYN6MBAs+D5zl43ULeFoCdW4WtfUqeEWLjgj8ajsOuWaidBRCzcQvlC8dzaGvyodnv0RdijtOqP
79+0yrTtZGYBFlREIlJqPe59J17QlZSdQBF+Au1cwHlEktwvJ9L0H9f+fSoqfOICEclDI5GHQPAgDUGoL2WvwAYLQASACc2XL4Ed3zqzEoPo2KQAEwQSVZwpDeEBckulYB2OlP+FkD6a/qBL3kKgOnfXbV3AdnGAHJwAWXDtNtMfsoWoHbLoheJSfpO2ugCkmRF37UXoz7CzgKYiAAYK43zEQCfV8TgWBRxSXFMAQVSGWHGLKCJuIAIZ7xusSLtsqsFEoAJcrx/HAlJsQiAkHcaqJ0LiB3qke+Zv8Ydn6oCqo2AxCUlbRHM1QWUCTt3A3vF6nYJx2T951TvSwWBE1JKAAopPLNiLwCFF/wYzxdwibkvtHUeURfztgwWgLEQjLkU/TaHwWSCdv1ELpAATBBWAbxgpkEA3BW2AAz/+ypU+yBwQk7b+RTFAjC2ILB8Yu3JE41JCCUD2fVegwvIIE4TOe2MYT0QBqiMBWBsipYpCMxcfgLPpeoAyAIgSgAJwAQ51DMGv89l+of2ecS86wDs6mvZ/7CUwyJoXOSNu7/BsahtkVlcUkz95X0eIe82z3bYtSG29rdn16kAhoJaTrzVBaQfoTiB844ZdoHkicYAGPnGABiZTpxiv0pB4PW/gXzOAgBya2lBECQAE+RQzxjmdzSYFt1CXEB2UWCWAmkNAtsFhSMxCXuPjuCFrUdMFsB/PL0n7XpZUZCQFJNfWRT4IlkAzi4gUeD1OfmTWT6sH7vVBaQfop6MYJ+/YgbOX5FeZFYo+WTvWDEWmuUjJHXJFOGGOlfGE7HYZkDkOQAchGRiQT6QC4jIBRKACRCNSzg+MI6FBv8/AHiTQeB8zvm123s7xQCsJ3IBQCiSwMY/bsMjL+9PcydZxzEaimsuIENxkSjw6D4ZBJBqy2zkR184L5dpWGIAqSwgBlsw2SlRA6MpC8DoAmK7duYC+vzVp5kOhpko+eTvW5moBZDtvNmUBaBFAPx1rrwXdEr9JHKhIgLws5/9DF1dXejq6sK9995biSEUhcMng1BVmALAgBYDUFXNx50zDnUAQLof3FYADEVhxuZmQLrFMByKaRaAz2gBaG0Kmv1uLJnTnHb/XA9+t1t3jIsXWzB1AUhaAHVeMVUHICv6dRN1Af3ktgtsX3flcZC9FbMA5BEETsYAAg4tIBipNFAeHJe/+wcoTvosMfUp+5/J5s2bsWnTJjzxxBN48sknsXv3brz44ovlHkZRONSj7ZjTBMCjLRChPNxA+QSB7QQgGEnorgnW45xhtQCGx2KISwp8Bp80u2TVKQHbvvO5FmRxdjEAw/fZgulPukP6R6PweUQIPA8xmVGTkBRdcCaaBeTU32UiZ8wag9l5FYLlagEkf988p/08C0lTJAuAyIWyC0AgEMCdd94Jt9sNl8uFRYsW4cSJE+UeRlE41DOG1kZv2pF1bGF1CgSrqoqewXFT6qd1/R8ai+qZRJKs4Hh/SP9ezObM4fFIQq+2tbZx/uDwiOnrXYe0hnBGXzZzxaxa3JbWgRMoLKjI3mIUBSYkbFEbHI3qvfldhkIw3QIoQhaQHXbnE+SKyQLIw5XU5HfD5xGxcGZjxuvYlJkL0F9AkzLKAiJyoeytIE455RT98+7ubvz1r3/Fn//855zf39rqL8WwCuJwXwjLFkxDIGA+RDowTRujy+NK+x4A/O3vh/Gzx7bjK/+wGh87Zy4AIGrY6AYCDVj39af0r194+xje+aAP9311DV5795itsBwfiuhdJq1s2X3S9PX/7OgBAMyari1ES+e2QBA47Dk0hAvPmoO+sVjaPaZPb8Q5p83A1j3me51/RodpjsbP3Z5k0LPBq7/Ogp9tyVYIsqKiqcGDQKAB3npP8p4zcUrSDXXDx5bY/gyzcfqiNrx3YMDxvY3NmohefOZsLF/YipfeOoIPDg/n9KxgPPXLmjG90dSora3Jm/Eef/7BVfrO3omLVs/Cc5u7MaujCS2NXiyY1ZT3z8AonIX8/
KqByTruYlCuuVesF9C+ffvwxS9+EXfccQfmz5+f8/sGB0N5t0YoBWPhOPqGwrh45Uz09wdN34tEtAV0YGg87XsDIxH8+qn3AABPvrofZ8xvBsdxGBoa16+xvue9AwMAgINHhvDXLd22zdie39Kdcbz/fPVpWHVKGz44MoxmvwcCz2F2ux8/++qFcIk8EpKCWELB6EgYizs0AWtr8uqWQXA0gss/MgvrPzoPzX4PxqMJeN0ivG7BNF7j54MjWjM7yLL+Olv2BIPN4xF5/fv3f+VC1HlENPk9+M2dl9r+PHLhf1+3AglJzvje+79yIXweAQLP44z5LUhISk7PCodSQjs6HEZ0XPt9//z2iyDwXEHjNXLdBfPxv644FcHRCL75yVVwiULe9zRalxMdTyUIBBom5biLQbHnzvOc48a5IqGid955B5/97Gfx9a9/Hdddd10lhjBhulkBWEe6UrNUR7uFeseBQcTiMj521mwc7g3qcYRMLZ9Z47ZQNIFYXC7ILTKjtQ4+j4iWBg8WdDRi7vQG8ByHOq8LLlFAndelH2Yzf0YjfnPnpaYDskWBA8dx6Gith88joq3JB7+hf48dzFIx7pCZb9/rFnU/tbEtst+X39GHTrhEXk+7dMLvc+kuKe363PZDbpMLyJj+K6YVuxWCwPO6W7HO0CY7HygNlMiFsgtAT08Pbr31Vvz4xz9GV1dXuR9fNA6eGAPHAfNm2AgAy2ax6cLGcu2v6pwHj0vAK+8eA2COAVjFgH01OGrv4jGyaJa9f7mQvHfjwlzIghLSBSB1H7bgCgKnB8vrJ1kvFo8r1ea6kG6lBFEtlP2v98EHH0QsFsPGjRuxfv16rF+/Pq8YQLVwqCeImW31tgFTUbcA0nfqLIOnsd6NzuXTsfX9PoQiCVMWkKoC0xo9ae/NRQCWzmmxfd3lyv9XbZcNlA92AsCygASO04Pl9TnuvKsFtsufSDEZQVQDZf/P+853voPvfOc75X5sUVFVFYd6xhwPkciUwsiasPEch4tXz8Kr20/gjfd6cOq81MLdNxKBz+MCYA7GDjgIgChwkJL58k6LaSGLVa4uESdYrKbB0MiMucd43igAk8sCYBXNE0klJYhqgP6CC2BgNIpQJGFqAGeEuQVsBSAh62mEc6c3YPGsJrz67nGTBXDXf/zdNtDdNxJJew0w94p3ajFQiB95ogLw9ZtW47oLF5jGxBZNgZ+8FgCgWQETqSYmiGqA/oILgHUAtbaAYKQsALsunIop//6SM2ehdziCPd1DpuuGg9G0itHhYMz03saka8XYKtipZ0xBAjBBF9Ccdj/WfXSB6TWjBcDuP9liAIBWQzGRamKCqAZIAArgUM8YREE7AtIOFgOwOwc3lpBN7YTPXtoOv8+FV949brouGpcxq60eM1vNRwcumtmkfz6tURMI4+EiHpeDC6gAAWhvcT62sFCYBcBxhiDwJLQAPC6BXEDEpIf+ggvg0IkxzJvud1wA+AwHucQTclo3yQvP6LD176sq0OQ3B4NPmZ0SgNbkwn/W0oAeaHWyAArJVmFpocVEMHT59E1mC0AkFxAx+Zl8W68KIysKunuDuOiMmY7XcBwHUeBMFoCiqjh8MoihYCzNT79m9Sw8/+aRtHYQdV4RHksswNh3qDVpAYyNxxFo9iEYTmRsMwwATfX5Leq3XrfCNtOpUPQun7I6aYPAgHbIjJ2LjyAmEyQAedIzEEY8oTgGgBmiwEOSVBw4Poqt7/fh7Q/7MBzUsnqWzzenarY3+7BiYSveO6j16PnEJYtx8UfmovvYELZ9OGC6dkayhcKyuc1YubgNL7x1FLMC9ZBkBQdPjCHQ5DVV8FrJd1d/1tJ2/fN8xcOOlAWgItDsQ51ncp7J2tbkK8r5CQRRSUgA8uSgXgGcWQDcIo8X3z6KF98+ClHgsGJBK/w+F472hWwbkV121mxdAOq8IuZMb8DJvjE0GBrNnXNqO9qavfj57RfBJfIQBR73f+VC+H0ur
FjYikvPmo16rwt3//O5+OKPXyvirDWK4RISDS2fL141C2cvDUzodK5K8U9XLrM7w4cgJhUkAHlyqGcMdR4R7S2ZW/r+4xXLcLg3iECzD6tPaUOd14UnXz+Io30h29jAGYtaMWNaHU4OhU3ZN8y3v3x+C25ZvwIA4POkFky2e+Y5TnelTOTA81Kjt8mQVS0TaBK6f4D0840JYjJCApAnh06MYUFHQ9Z+66uXBLB6ScD02px2rSHTicGw7XtaGjw4ORSGz5AVw/rZl6otcrlZPKsJL71zDDPbip9hRBBEfpAA5EE8IeNY/ziu6pxb0PvnTNf6BrFYgJUzlwTw/uFhzGxNpZeyFMmpEnA897TpWNDRUJIUU4Ig8oMEIA+O9IagqCoWzMjs/3fCmK9vx6VnzsJFKztMLhy978wk9JM7QYs/QVQHJAB5oAeAs2QAOcFzHD575TIEHISA4zjT4t9U70GT342uznm4eNWsgp5JEAThBAlAHnT3jKGlwYNmf+HZMBetdK4fsMKybj6+ZlHBzyMIgnCCBCAPDvaMZU3/JIhq4fs3n1NQG3CidiAByJFQJIG+4QguPKOj0kPJi7nT/ViSPF+XqC1mt1fP+dlEdVITAhBLyNh1cBATyaQ8MaCd2evUAbRa+X//6ZxKD4EgiCqlJgRgy+6T+P3zH074Pm6Rx/wcBKAYLROKQTlbLKxa3IZ6X038ORHElIFTM51GXoUMDoZsD0vJhKqqODkUzvt9Vvw+V1p3zlIRCDSgvz9Y8PsTkgyAm7TpoxOd/2SH5l+78y/23HmeQ2urvTuwJrZsHMeho9W+d/9UpZrbQRAEUR1Mzu0hQRAEMWFIAAiCIGoUEgCCIIgahQSAIAiiRiEBIAiCqFFIAAiCIGqUSZcGyvOZD2KZStTSXO2g+dP8a5Vizj3TvSZdIRhBEARRHMgFRBAEUaOQABAEQdQoJAAEQRA1CgkAQRBEjUICQBAEUaOQABAEQdQoJAAEQRA1CgkAQRBEjUICQBAEUaOQAFQJoVAIV199NY4dOwYA2Lx5M9atW4e1a9fivvvuq/DoSsvPfvYzdHV1oaurC/feey+A2pr/T3/6U1x11VXo6urCQw89BKC25s+45557cOeddwKorflv2LABXV1dWL9+PdavX48dO3aUb/4qUXG2b9+uXn311ery5cvVo0ePqpFIRF2zZo165MgRNZFIqDfffLP66quvVnqYJeGNN95Q/+Ef/kGNxWJqPB5XP/OZz6jPPPNMzcz/zTffVG+66SY1kUiokUhEveSSS9T333+/ZubP2Lx5s3ruueeq3/zmN2vq719RFPWCCy5QE4mE/lo5508WQBXw6KOP4nvf+x7a29sBADt37sS8efMwZ84ciKKIdevW4fnnn6/wKEtDIBDAnXfeCbfbDZfLhUWLFqG7u7tm5n/OOefg97//PURRxODgIGRZxtjYWM3MHwBGRkZw33334ZZbbgFQW3//Bw8eBADcfPPNuOaaa/CHP/yhrPMnAagC7r77bpx99tn61319fQgEAvrX7e3t6O3trcTQSs4pp5yCVatWAQC6u7vx17/+FRzH1cz8AcDlcuH+++9HV1cXOjs7a+r3DwDf/e53cfvtt6OxsRFAbf39j42NobOzEz//+c/x29/+Fo888ghOnDhRtvmTAFQhiqKA41ItXFVVNX09Fdm3bx9uvvlm3HHHHZgzZ07Nzf+2227Dli1b0NPTg+7u7pqZ/2OPPYaOjg50dnbqr9XS3//q1atx7733oqGhAdOmTcMNN9yA+++/v2zzn3TnAdQCM2bMQH9/v/51f3+/7h6airzzzju47bbbcNddd6Grqwtbt26tmfkfOHAA8Xgcp556Knw+H9auXYvnn38egiDo10zl+T/33HPo7+/H+vXrMTo6inA4jOPHj9fM/N9++20kEgldAFVVxaxZs8r2908WQBWycuVKHDp0CIcPH4Ysy3j22Wdx0UUXVXpYJaGnpwe33norfvzjH
6OrqwtAbc3/2LFj+M53voN4PI54PI6XXnoJN910U83M/6GHHsKzzz6Lp556CrfddhsuvfRSPPDAAzUz/2AwiHvvvRexWAyhUAhPPPEEvva1r5Vt/mQBVCEejwcbN27El7/8ZcRiMaxZswZXXHFFpYdVEh588EHEYjFs3LhRf+2mm26qmfmvWbMGO3fuxLXXXgtBELB27Vp0dXVh2rRpNTF/O2rp7/+SSy7Bjh07cO2110JRFHzqU5/C6tWryzZ/OhGMIAiiRiEXEEEQRI1CAkAQBFGjkAAQBEHUKCQABEEQNQoJAEEQRI1CAkAQBFGjkAAQNcnNN9+MoaGhCV/z5ptv4uqrr876vKVLl9re66WXXsIPfvADAFpb4Oeffx7Hjh3D6tWrs96TICYKFYIRNckbb7xRlGsmymWXXYbLLrus5M8hCDvIAiBqjm9961sAgH/8x3/E1q1bsWHDBqxbtw7XXHMNnnzyybRrenp68Morr+Cmm27C9ddfj4svvhg/+clP8n7uT37yE1x33XVYv349XnnlFQDA448/ji9+8YtFmRdB5AtZAETN8aMf/QiPP/44fve73+ETn/gE7rjjDqxduxa9vb248cYbMW/ePNM1LS0tuOOOO7Bx40bMnz8fvb29uOSSS/CZz3wmr+fOnj0b3//+97F3715s2LABf/3rX0s0Q4LIDRIAomY5cOAAYrEY1q5dCwCYPn061q5di9dff93kg+c4Dr/85S/x6quv4tlnn8WBAwegqioikUhez/vkJz8JAFiyZAkWLVqEd999t3iTIYgCIBcQUbNwHJfWZ11VVUiSZHotHA7juuuuw+7du3HaaafhjjvugCiKyLeNFs+n/t0URYEo0v6LqCwkAERNIggCZs2aBVEU8cILLwAAent78be//Q3nn3++fo0kSTh8+DBCoRC++tWv4tJLL8Wbb76JeDwORVHyeuYTTzwBANi9ezeOHDmClStXFndSBJEntAUhapIrrrgCn/3sZ/GLX/wCP/jBD/Bv//ZvkGUZt956K8477zz9mg0bNuCnP/0pLr74Ylx55ZVwu91YsmQJFi9ejMOHD8Ptduf8zKNHj+Laa68Fx3H413/9VzQ3N5dodgSRG9QOmiAIokYhC4AgisADDzyAZ555xvZ7n/vc53DNNdeUeUQEkR2yAAiCIGoUCgITBEHUKCQABEEQNQoJAEEQRI1CAkAQBFGjkAAQBEHUKP8/4tJAb6qNkeYAAAAASUVORK5CYII=", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.lineplot(data=pandas_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVwAAAFcCAYAAACEFgYsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAA09klEQVR4nO3deXyU9b0v8M/s+2SZzGQjCVvYIYlVBLVE0RPRGEWKFnsvevRcX3qvhVt6XrUeDkfvrV0or/ZQl3Laq61tbe/tS1tQjEg9RcUlLIpCIGwBspNMJsskM5lk1uf+ERIBWTLJPPPMPPN5v159lSST3/P9meSTJ7/ntygEQRBARESiU0pdABFRqmDgEhHFCQOXiChOGLhERHHCwCUiihMGLhFRnKjFbHz16tXo6emBWj18mR/84AcYGBjAT37yE/j9ftxxxx1Yt26dmCUQESUM0QJXEAQ0Njbi/fffHw3coaEhLFu2DK+++ipyc3Px2GOPYffu3SgvLx9zu93dXkQi8pk6nJFhRG+vT+oyRCX3Psq9fwD7GA273XLZj4kWuGfOnAEAPPLII3C73bj//vsxY8YMFBUVoaCgAABQVVWFnTt3RhW4cqNWq6QuQXRy76Pc+wewj7Ei2hhuf38/Fi9ejF/+8pf43e9+hz//+c84e/Ys7Hb76GscDgecTqdYJRARJRTR7nDLyspQVlY2+vbKlSvx/PPP42tf+9ro+wRBgEKhiKpdm80csxoTxZX+BJELufdR7v0D2MdYEC1wP/vsMwSDQSxevBjAcLjm5+fD5XKNvsblcsHhcETVrtzGcO12C1wuj9RliErufZR7/wD2Mdp2Lke0IQWPx4NNmzbB7/fD6/Vi27Zt+O53v4uGhgY0NTUhHA6juroaS5YsEasEIqKEItod7i233IJDhw5h+fLliEQi+Na3voWysjJs3LgRa9asgd/vR3l5OZYtWyZWCURECUWRbNszckgh+ci9j3LvH8A+RtvO5XClGRFRnDBwiYjihIFLRBQnDFwiojhh4BIRxQkDl4goTkTdnpHiKy3dCK0mNhtwBIJh9LnlvTsUUbwxcGVEq1Hhpa2HYtLWoytKYtIOEX2JQwpERHHCwCUiihMGLhFRnDBwiYjihIFLRBQnDFwiojhh4BIRxQkDl4goThi4RERxwsAlIooTBi4RUZwwcImI4oSBS0QUJwxcIqI4YeASEcUJA5eIKE64AbmE0tKNAAC73SJxJUQUDwxcCWk1Kvzfvx3HwIA/Ju3xlAaixMYhBSKiOOEdboqKRAT4/CGolArotSooFAqpSyKSPQZuinF7/DjZ6kZn7yAiwvD79FoVChxmTM9Pg0bNP3qIxMLATRGCIOB4sxv1rX3QqJWYnGOF1aRBOCKgs3cQ9a19aHZ6cO1MB2xpeqnLJZIlBm4KiAgCPj/pwtkuHwodZsybkgn1eXeyU3Kt6PX48UW9CzV1Hbim2C5htUTyxb8fZU4QBBw61Y2zXT7MKcpAaXHWBWE7IsOiw00LcpFh0eHzehc+P94pQbVE8sbAlbnGDg9aOr2YMSkN0yelXfG1WrUK18/OhsWowcY/7IezxxenKolSAwNXxvoGAjjS0AN
HhgEzC9PH9DkatRLXz86GSqnEr96sQzAUEbdIohTCwJWp4aGELmjUSlxTnBXVtC+DTo3vrCpDk9ODt/c0ilckUYph4MpUQ7sHbm8A86fYoNWoov786+flYtHcbLy9pwlnuwZEqJAo9TBwZSgQCuNEixv2dD3ysozjbmfV0mLotSr86T9PQhCEGFZIlJoYuDJ0qrUPwVAEc4oyJrSCzGrS4u6bpuBYUy+ONPTEsEKi1MTAlZmhQBhn2j2YZDchzaybcHu3lOXDnq7H6++fQoR3uUQTwsCVmTNn+xGJCJhRkB6T9tQqJe5dMhWtrgF8fsIVkzaJUhUDV0a8g0E0dvQjz2aE2aCJWbsLZ2UjJ9OIt2oaOZZLNAEMXBl5d28jQmEBxVdZ4BAtpVKBysVFaOn04tCp7pi2TZRKGLgyIQgC3q5phM2qi8nY7cUWzc1GhkWHvx9oiXnbRKmCgSsTzp5BdPb4MCXXKkr7KqUSt5Tl42hjL+flEo0TA1cmGjs8sKXpkWMb/7zbq1lSmge1Soldn7eKdg0iOWPgysCgP4RO9yBuW1gIpYgnN1iNWlw/24Gawx3wDYVEuw6RXDFwZaCl0wsAuO26QtGvtfRrk+APhvHJkXbRr0UkNwzcJCcIApo7vchK0yPHZhL9elNyrZiWZ8V7B1q5EIIoSgzcJNfr8cM3FEKBwxy3ay792iQ4ewdxotkdt2sSyQEDN8m1dQ1AqVQgJ1O8h2UXu2aGHXqtCjUcViCKCgM3iUUEAW1dA8jOMMT1tF2dRoXrZjnw2QkX/IFw3K5LlOxE/yn96U9/iqeeegoAUFNTg6qqKlRUVGDz5s1iX1r2uvqGEAhGkG8Xf+z2YjfOz4U/EMaBkzz7jGisRA3cPXv2YNu2bQCAoaEhrF+/Hlu2bMGOHTtw5MgR7N69W8zLy15Htw8qpQLZ6Ya4X7t4Uhqy0vSoOdIR92sTJSvRAtftdmPz5s14/PHHAQC1tbUoKipCQUEB1Go1qqqqsHPnTrEuL3uCIKCjxwdHhgEqVfxHhhQKBW6Yl4Njjb3o6R+K+/WJkpFarIaffvpprFu3Du3tww9WOjs7YbfbRz/ucDjgdDqjbtdmi9/T+HgxmaLf+6C7bxBDgTCKcq0XfP542rocu91yxY9XlU/H9k8aUdvYi/tunTGhtpKd3PsHsI+xIErgvv7668jNzcXixYuxdetWAEAkErng9AFBEMZ1GkF3txeRiDzmf458cQcG/FF/bkNbHxQA0o2aCz5/PG1djsvlueLHVQBmTErDf+5rQvn8nMt+Pe12y1XbSmZy7x/APkbbzuWIErg7duyAy+XCPffcg76+Pvh8PrS1tUGl+vIwQ5fLBYfDIcblU0JHjw+ZVt24DoiMpevnZOPVd0+irWsAk+zy++uDKJZEGfx75ZVXUF1djTfffBNr167F0qVL8fLLL6OhoQFNTU0Ih8Oorq7GkiVLxLi87A0MBeHxBeM69/ZyrpnpgEIBfHqMsxWIriZuT1t0Oh02btyINWvW4M4778TUqVOxbNmyeF1eVjq6fQAg6s5gY5Vm0mJmQTo+O9HJ0yCIrkK0h2YjVqxYgRUrVgAAFi9ejO3bt4t9Sdnr6PHBYtTApI/dMToTcd3sbLz6txNocw1gUhyXGBMlG9EDl2IrEAyju9+PGTE+Rudi4XBkzE9sKxZPwZ/ePYG6FjfK5uZesi0iYuAmna6+4TmvjgxxFzuoVEq8tPXQmF+fadXj7Y/PwDfg/8pshUdXlMS6PKKkxL0UkozLPQi1SoF0S+zPLZuIvCwTvIMh9PuCUpdClLAYuEnG5R5CVppe1JMdxiP33IyJdp53RnRZDNwkMjAYhM8fgl2CvROuRqdVwWbVo73HJ3UpRAmLgZtEXO5BAEjIwAWAHJsBHl8QA4McViC6FAZuEnH1DcGgU8GkT8xnnSMLMTp4l0t0SQzcJCEIArrcQ7CnGca1B0U8mPQaWIwaBi7RZTBwk4TbG0AwHIE9XS9
1KVeUm2lEd78f/iBPgiC6GAM3SYyM32Yl6PjtiJHlxs7eQYkrIUo8DNwk4XIPIs2khU7i3cGuJs2khV6rGt3vgYi+xMBNAuFIBL0eP2xpiT2cAAyfBJGTaYTLPcglvUQXYeAmAbcngIgA2KyJH7jA8GyFcESAq49H7xCdj4GbBLrPnRlmsybWct7LyUrTQ61ScFiB6CIM3CTQ3T8Ei1Ej+ekOY6VUKpCdYURHr4975BKdh4Gb4CKCMDx+myTDCSNyMg0IBCPo8cTujDWiZMfATXD9AwGEwkLSDCeMcGQYoVBw1RnR+Ri4Ca773IOnzCS7w9WolbBZ9XAycIlGMXATXHe/H0a9GgZdYu6fcCU5mUZ4B0M46/JKXQpRQmDgJjBBENDTP5R047cjsjOHV8XtP+qUuBKixMDATWDewSACoUjSjd+OGNnM5tOjHVKXQpQQGLgJrLt/+Al/so3fni87w4i6M93wDXGPXCIGbgLr6R+CTqNM2P1vxyIn04BwRMDhMz1Sl0IkOQZuAuv1+JFh0Sfs/rdjkWHRwWrS4tCpLqlLIZIcAzdBBYJhDAyFkJFgp/NGS6FQ4NrZ2Th8phvhCDezodTGwE1QvedWaCV74ALAwjk5GBgK4VRrn9SlEEmKgZugRgI33ayVuJKJK5tph0qpwKFT3VKXQiQpBm6C6vH4YTVpoVYl/5fIqNdgVmE6DnIcl1Jc8v80y5AgCHB7/ciUwXDCiJLpWejo8XGpL6U0Bm4C8gwGEQoLshi/HVEyPQsAeJdLKY2Bm4Dk9MBshD3dgPwsE6eHUUpj4Cag3n4/NOrkXvBwKSXTs3CypY+rzihlMXATUK/XjwyLLqkXPFxK6fQsRASuOqPUxcBNMMFQBB5fEBlm+QwnjJiaZ4XZoOGwAqUsBm6CcXvlN347QqlUoGSajavOKGUxcBPMSOCmW5J/wcOllEzP4qozSlkM3ATj9gZg1KuhVSfHCb3Rmjslk6vOKGUxcBOM2+uXxXLeyzHo1Fx1RimLgZtA/IEwBv1hpMvwgdn5uOqMUhUDN4G4B0Y2rJF34JZy1RmlKAZuAnF7AwCANJN8hxQAICvdgHw7V51R6mHgJhC3xw+zQQONWv5fltJzq84GuOqMUoj8f7KTiNsbkPUDs/OVnFt1doSrziiFMHATxJA/BH9Q/g/MRkzNtcJi5KozSi0M3AQxMn6bKne4SqUCC6Zy1RmlFgZughhZYWaV+QOz83HVGaUaBm6CcHsDsBg1sjhSZ6zmTsmEWqXg9DBKGanz053ARo7UkeMOYVdi0KkxszADB7nMl1IEAzcBDAXCCIQiSEuR8dvzlU7PgrPHhw6uOqMUwMBNAH0Dww/MUmn8dkTJNBsAcLYCpQRRA/e5557DnXfeicrKSrzyyisAgJqaGlRVVaGiogKbN28W8/JJo/9c4KYZUy9wueqMUologbt//37s3bsX27dvx1//+le8+uqrOH78ONavX48tW7Zgx44dOHLkCHbv3i1WCUmjbyAAk14NdQqsMLsUrjqjVCHaT/jChQvxhz/8AWq1Gt3d3QiHw+jv70dRUREKCgqgVqtRVVWFnTt3ilVC0ugfCKTkcMKIktGzzvjwjORN1GNhNRoNnn/+efz2t7/FsmXL0NnZCbvdPvpxh8MBp9MZVZs2mznWZUoqGIpgYCiEqZPSYTJNfJZCLNqIdVvhcAR2u+WyH8+0mZFmPozjLX2oKi8eU3uqBJs+d6X+yQX7OHGin8O9du1aPProo3j88cfR2Nh4wUm0giBEfTJtd7cXkYgQ6zIlYbdb4PYMAQAMGiUGzm3POBGxaCPWbalUSry09dAVX2M1arGnth2//utBKK/yPfHoihK4XJ6Y1BYLdrsloeoRA/sYXTuXI9ptwunTp3Hs2DEAgMFgQEVFBfbt2weXyzX6GpfLBYfDIVYJSaHXMxxqct+S8WqyMw0IhiPo6Y/dLwyiRCNa4La2tmLDhg0IBAIIBALYtWsXVq1ahYa
GBjQ1NSEcDqO6uhpLliwRq4Sk4Pb4oVUrodfK8wyzsXKkGaBUgKdAkKyJNqRQXl6O2tpaLF++HCqVChUVFaisrERmZibWrFkDv9+P8vJyLFu2TKwSkkKvxw+rSRv10IrcqNVK2NL06OjxYc7kjJT/70HyJOoY7po1a7BmzZoL3rd48WJs375dzMsmjXA4ArfXjyk58n8YMRY5mUYcPtMDz2AQ1hSck0zyl1iPelNMm2v4AWAqTwk7X67NCABo7+KwAskTA1dCZ872A+ADsxF6rRo2qw5nuwekLoVIFAxcCTW09UGpUMBs0EhdSsLItZng8QXh9XHVGckPA1dCZ872Ic2shVLJB0QjRoYVeJdLcsTAlYggCGg424cMq17qUhKKQadGhkWH9m6O45L8MHAl4vYG0OcNIMOSWpuOj0WuzYi+gQA3syHZYeBKpKVzeAlhOgP3K/I4W4FkioErkZZOLwCkzLHo0TDqNUgzaTmOS7LDwJVIs9OLHJsRWk1qL+m9nLwsI9zeAHz+kNSlEMXMmAJ3/fr1X3nf2rVrY15MKmnu9GJKXprUZSSsXJsJANDOu1ySkSsu7X3mmWfgdDpx4MAB9PT0jL4/FAqhpaVF9OLkaigQQmePD7deVwhBkMdWk7FmNmhgNWrQ3uXDNP5iIpm4YuCuXLkS9fX1OHHiBG6//fbR96tUKpSWlopdm2y1ugYgAJiaZ8Xptj6py0lYuVkmnGh2Y9AfgkEn+tbNRKK74nfx/PnzMX/+fNxwww3IycmJV02y1+IcnqEwJT+NgXsF+ecCt61rANPzeZdLyW9Mtw3t7e343ve+h76+vgv+BH7rrbdEK0zOmju9MOnVsKcbpC4loZkNGqSbtWhzMXBJHsYUuE8//TRWrFiBOXPmcJ/SGGh2elHgMPO/5RhMsptxpKEHHl8AFm7ZSEluTIGrVqvx8MMPi11LSohEBLS5vLi5LF/qUpJCXpYRRxp60OYawKwiBi4ltzFNCysuLsaJEyfEriUlOHt9CIQiKHDI6/Rhsei1amSl6dHaNcAZHZT0xnSH29LSgm984xvIy8uDTvflyiiO4Uav2Tm8woyBO3aT7CYcPNUNN/eeoCQ3psBdt26d2HWkjOZOD1RKBfKyTFKXkjRybSbUnu5Gq8vLwKWkNqbAnTFjhth1pIwWpxf5WSaoVVxVPVYatRLZmUa0dQ1g7pRMqcshGrcxBe6iRYugUCggCMLok3W73Y4PP/xQ1OLkqLnTi/lTGRrRyreb0N7tQ5d7SOpSiMZtTIF7/Pjx0X8HAgFUV1ejoaFBtKLkqs/rR/9AAIUOntIbrewMA9QqBdq6vFKXQjRuUf9dq9VqsWLFCnzyySdi1CNrI1syFmbzgVm0VEolcm3Dd7n+YFjqcojGZUx3uG63e/TfgiDgyJEj6O/vF6sm2Wru5AyFiSiwm9DS6cWew+2YW8CVZ5R8oh7DBQCbzYZ//dd/FbUwOWp2emCz6mHU85Te8bCl6WHUqfH3/U2YW7BA6nKIohb1GC6NX0unl8MJE6BQKFCQbcah+i50uQeRxb0oKMmMaQw3EongpZdewurVq/HAAw/gxRdfRCjEnfij4Q+E0dHt43DCBA3vQQF8fLhd6lKIojamwP35z3+OvXv34qGHHsLDDz+ML774Aps2bRK7Nllp7fJCAFCYzRkKE2HUqVFabMcnh9sR4VJfSjJjCtyPPvoIv/rVr3DbbbehoqIC//Ef/8E5uFFqObekt5B3uBP2DwuL0N3vx7GmXqlLIYrKmAJXEARoNF8+6NFqtRe8TVfX0umFQaeGLU0vdSlJ7/p5OTDp1fi4lsMKlFzGFLizZs3Cj3/8YzQ3N6OlpQU//vGPudw3Ss2dHhRyD9yY0GpUWDQnBwdOuDAwFJS6HKIxG1PgPvPMM+jv78eqVatw3333obe3F//2b/8mdm2yEYkIaO0cQAFnKMTMTQtyEQpHsO+oU+pSiMbsioEbCATw/e9/H3v27MHGjRt
RU1ODBQsWQKVSwWxmeIxVp3sQ/mCYMxRiqCjHgkKHGR9xWIGSyBUD9/nnn4fX68U111wz+r5nn30W/f39eOGFF0QvTi6azx0ayT0UYuvrJXlo6vCM/vclSnRXDNwPPvgAP//5z2Gz2Ubfl52djU2bNuHvf/+76MXJRUunl3vgimDR3Gxo1Uq893mb1KUQjckVA1ej0UCv/+pTdbPZDK2W50uNVbPTi1ybCRo198CNJZNeg0Vzs7H3aAd8fHhGSeCKCaBUKuH1fnU7PK/Xy5VmUWjp9HBJr0iWXjMJgWAEHx/ukLoUoqu6YuDedddd2LBhA3w+3+j7fD4fNmzYgIqKCtGLk4P+gQDc3gAXPIikMNuCaflWvP95K1eeUcK7YuA+9NBDsFgsuPHGG3H//fdj5cqVuPHGG2G1WvHEE0/Eq8akNrIHbgGX9Ipm6TWT4OwdxLFGrjyjxHbF3cKUSiWeffZZPP7446irq4NSqcSCBQvgcDjiVV/Sa+4cfoLOKWHiuXamA3/eVY/3Pm/lmWeU0Ma0PWN+fj7y8/PFrkWWWpxeZFp1MBu4FFosGrUSS0rysGNvE7r7hrh8mhIWH5uLrLnTy/m3cXBz6fANwQcHOUWMEhcDV0SBYBjt3QMcTogDW5oepdOz8OGhswiGIlKXQ3RJDFwRtXUNQBB4aGS8LL1mEjy+IPYf4/4KlJgYuCLiDIX4mjM5A/l2E/62v2X0/D2iRMLAFVGz0wODToUsPsSJC4VCgduvK0Sry4ujnCJGCYiBK6LmTi8K7GYouQdu3Fw/JxtpZi127m+WuhSir2DgiiQiCGjp9KKAMxTiSqNW4ravTUJdQ8/okA5RohjTPFyKnss9CH8gzE3HRRAOR2C3X/4X2crbZuLtPU3YXduOdQ9cc9nXAcMzSfrcviu+hihWGLgiGT00koEbcyqVEi9tPXTF1+TZjHj/QAtUEGDQXf7b/NEVJbEuj+iyRB1SePHFF1FZWYnKysrRY9VrampQVVWFiooKbN68WczLS6q50wulQoF87oErial5VggC0NDeL3UpRKNEC9yamhp8/PHH2LZtG9544w3U1dWhuroa69evx5YtW7Bjxw4cOXIEu3fvFqsESbU4PcjNMkKjVkldSkoy6jXIyzKiscPDhRCUMEQLXLvdjqeeemr0SPVp06ahsbERRUVFKCgogFqtRlVVFXbu3ClWCZIaXtLL4QQpTc9LQygsoKmDR/BQYhAtcIuLi1FaWgoAaGxsxDvvvAOFQgG73T76GofDAadTfquCPL4Aej1+zlCQWLpFB3u6HqfO9iEU5l0uSU/0h2b19fV47LHH8OSTT0KlUqGxsXH0Y4IgQBHlHFWbLfHvGs+edAEA5s+wX/Fp+giTSReza6dCW9G0V1Jsx98/bUFH7xBmFmVc8jVj+RpdTSzaSHTs48SJGrgHDhzA2rVrsX79elRWVmL//v1wuVyjH3e5XFHvrdvd7UUkktjLNg+dHL5rT9Or4XJd/s/ZkS/uwIA/ZtdOhbaiac+oVcFm1aPuTDdyM/VQKb/6R92VvkZjYbdbJtxGomMfo2vnckQbUmhvb8cTTzyBn/3sZ6isrAQAlJSUoKGhAU1NTQiHw6iursaSJUvEKkEyzU4vbNwDN2HMLEiDPxhGs5MLIUhaot3h/uY3v4Hf78fGjRtH37dq1Sps3LgRa9asgd/vR3l5OZYtWyZWCZJp6vCgkBvWJAxbmh6ZFh3qW/tQmG2BSsml1iQN0QJ3w4YN2LBhwyU/tn37drEuK7mhQAjOHh8WzcmWuhQ6R6FQYEZBOvYedaKl04vJOfxlSNLgXgox1tLphQDwDjfB2NP1SDdrcaq1L+GfAZB8MXBjbGScsIh3UQlFoVBgZkE6fP4QWlwcyyVpMHBjrKnDA4tRg3SzVupS6CKODAPSzVqcbHYjHOG8XIo/Bm6MNTs9KMq2RD2/mMSnUCgwuygDg4EwGtvlPcWJEhMDN4aCoQjaugY4fpvA7Ok
G2NP1qG/t4x4LFHcM3Bg62zWAcETg+G2Cm12YgUAogtNn+6QuhVIMAzeGmpzDf6ZyD9zElm7RIddmxOm2frg9sV0BR3QlDNwYajp3aKQ93SB1KXQVswszEIkIeH3XSalLoRTCwI2h5g4PChwWHhqZBMxGDQqyzdhR04gu96DU5VCKYODGSCQyfGhkER+YJY2ZBelQKICtH52RuhRKEQzcGGnv8SEQinD8NokYdGosL5+GvXVOnG7jAzQSHwM3RprPPTDjHW5yue/WGUgzafH/dtUjInDJL4mLgRsjTR0eaNRK5GYZpS6FomDQqfGN8mk4c7Yf+47K7/QRSiwM3BhpdnowyW665AbXlNhumJ+DyTkW/OWD0/AHwlKXQzLGdIgBQRDQ5OQDs2SlVCjwwG3F6PX48c6+JqnLIRlj4MaAq28Ig/4Ql/QmseJJ6Vg424F39jWju29I6nJIphi4MdDcMbLCjIGbzO67eToA4LX3T0lcCckVAzcGGjr6oVIqUOAwSV0KTYAtTY/KxUX49HgnjpzplrockiEGbgw0tnswyW6GRq2SuhSaoDuuL0J2phF/fPckAkE+QKPYYuBOUEQQ0NjRjym5HE6QA41aiQcrZqDTPYi39/ABGsUWA3eCnD0+DPrDmJxrlboUipHZkzOxeG42duxtQnv3gNTlkIwwcCdo5OSAKQxcWbl/aTF0GhVe/dsJCFyBRjHCwJ2gho5+aNVK5HGFmaykmbRYefM0HG92Y09dh9TlkEwwcCeosd2DwhwLV5jJ0JLSPEzLs+LPu06hfyAgdTkkA2qpC0hm4UgEzU4PykvzpS6FxikcjsBuv/wDz3/+r9fif/77B3h99xk89dB1l32d3W5BIBhGn9snRpkkEwzcCWhzDSAQinCGQhJTqZR4aeuhK75men4aPqk9i//9f2qQl/XVudYmkw4DA348uqJErDJJJvh38AQ0nlthxhkK8jYt34p0sxa1p7vh59xcmgAG7gQ0tPfDoFPDkcEzzORMqVCgtDgLoXAEh09zBRqNHwN3Ahra+zE5h2eYpQKrUYuZhek42+3D2S7OzaXxYeCOUzAURptrgPNvU8i0/DSkmYaHFoYCIanLoSTEh2ZRSks3QqtR4WhDN8IRAaWzsq/4lJvkQ6lQ4JoZWfjwUDu+qO/CojnZUPCvG4oCAzdKWo0KL209hFPnDh38rK4dR+o7x9UWn2onH4tRi7mTM1B7pgcN7R5MzeNfODR2HFIYp16PH0adGnotdwhLNUU5FmRnGHC0sYcLIigqDNxxEAQBPf1+ZFh0UpdCElAoFCidngWNWokDJ10IhyNSl0RJgoE7DoP+MPzBMDKtDNxUpdOqUFacBY8viIP1LqnLoSTBMdxx6PUMn3nFO9zU5sgwYkquBSeb3UgzaqUuh5IAA3ccejx+qJQKWPlDlvLmTM5E30AQX9S70NE9AI7o05VwSGEcej1+pJu1UCo5JSjVqZQK3LggFwDw01c/QzDE8Vy6PAZulPzBMPoGAsiw6KUuhRKE2ahFWXEWTrW48dp7PPGXLo+BG6VTLW4IApDJ8Vs6T67NhOXl07Dr81bsP+aUuhxKUAzcKJ1o6gHAB2b0VQ9VzsG0PCteeec42rjfAl0CAzdKx5t6YdSroeOCB7qIWqXEf18+Dzq1Ei/+tRa+oaDUJVGCYeBGQRAEHG3o5nACXVamVY//ce98dPUN4dfbjyIS4QGU9CUGbhTau33o8wZgS+MDM7q8GQXp+NZtxTh8phvbPjojdTmUQDgPNwonWtwAAJuVgUtXdnNZPpqcHry9pwmF2RZcN8shdUmUAHiHG4WTLW5kWnUw6fl7iq5MoVDgv/zDTEzLt+I3bx9FS6dX6pIoATBwx0gQBJxo7sW8qVncA5XGRKNW4ol758OgU+OFv9bCO8iHaKmOgTtGLvcg3N4A5k6zSV0KJZF0sw7fvnc+3F4/frn1MELcWSylMXDH6ESzGwAwbyoDl6IzLT8ND985Gyd
a3PjdO8chCJy5kKo4GDlGJ1rcMBs0KMjmcToUvcVzc9DZO4g3P25AdoYBVTdOkbokkgADd4xOtrgxsyCd47c0bnffOBmdvT5s+6gB9gwDFs3JkbokijNRhxS8Xi/uuusutLa2AgBqampQVVWFiooKbN68WcxLx1R33xC6+oYwozBd6lIoiSkUCvzjHbMxoyAdv337OOpb3VKXRHEmWuAeOnQIDzzwABobGwEAQ0NDWL9+PbZs2YIdO3bgyJEj2L17t1iXj6kTLb0AgJkF6dIWQklPo1bi2yvmw2bV4YW/HkZnr0/qkiiORAvc1157Dc888wwcjuEJ37W1tSgqKkJBQQHUajWqqqqwc+dOsS4fU8eb3DDp1ZhkN0tdCsmA2aDBd+4rgSAI2Px6Lfp9PIgyVYg2hvujH/3ogrc7Oztht9tH33Y4HHA6o9/GzmaLb+gJgoBjzb0onelAdvbwkdgmU2z3Uohle6nQVqzbi0VbI23Y7WN7qGq3W/D0f1uEf/tVDba8cQQ/fPxGGHSJ/UhlrH1LZmL3MW5f4UgkcsEDJ0EQxvUAqrvbG9cNQdq6BtDdN4TpuRa4XB7Y7RYMDPhjeo1YtpcKbcW6vYm2ZTLpRttwuTxj/jy7WYvH7pmLF7cexg9e2oO1KxdArUrMmZp2uyWqviWjWPXxSqEdt69uTk4OXK4vTzd1uVyjww2JrK5heP/buZMzJa6E5Kis2I6Hls3CkYYe/HbHMUQ4R1fW4ha4JSUlaGhoQFNTE8LhMKqrq7FkyZJ4XX7c6hp6kJ1pRFa6QepSSKaWlOThG+VTsbfOidfeO8WFETIWtyEFnU6HjRs3Ys2aNfD7/SgvL8eyZcvidflxCYYiONHci68vyJO6FJK5OxcVoc8bwLuftiDNrMUd1xdJXRKJQPTAfe+990b/vXjxYmzfvl3sS8bMqbY+BEIRzJ3C4QQSl0KhwKrbiuEZDOL190/DpNdgSQl/0ctNYj8WlVhdQw9USgVmcsEDjUE4HJnwU+7vP7QQP3plH36/8zisVj1K+cteVhi4V1DX0INp+WkJP12HEoNKpcRLWw9NuJ2cdD1sVj1efO0gHr9nHq7l5uWykZhzUBJAvy+AJqeHwwkUdyqVEgtnOzCzKBO/3l6Hg/VdUpdEMcLAvYzDp7sBAPMYuCQBtUqJ//XoIhRmm7HljcOj0xMpuTFwL+NgfRfSzVoU5ch/dQ0lJqNeg3X3lyLXZsILf63FieZeqUuiCWLgXkIgGMbhhm6UFduh5HaMJCGzQYN/XlWKrHQDNr9+CMcaeaebzBi4l3C0qReBYARlxVlSl0IEq1GL7z1QBke6Ab/4Sy0On+mWuiQaJwbuJRysd8GgU2FWUYbUpRABANJMWjz5rWuQZzPh+b/U4ouTrqt/EiUcBu5FIhEBB+u7MH+qLWE3EqHUZDZo8L0HSlGUY8GWN45g/7Hod9sjaTFRLnLmbD/6fUGUFduv/mKiODPqNfjnb5ZiWp4Vv36zDrsOtEpdEkWBgXuRz+tdUCkVmM/TeSlBGXRqrPtmKUqmZ+FP/3kSf/ngNDe8SRIM3PMIgoAv6rswqygDRj1Xl1Hi0mlUeGLFPNxcmocde5vwm7ePIRSOSF0WXQVT5TzNTi+cPT7cfl2B1KUQjWlvhu/+12sxKeck/rjzOHyBML6/+lqYjdqvvC4QDKPPzfPTpMbAPc+eug6olAquXaeEEM3eDKXTbThU78J/+9F/YuFsBywXhe6jK0rEKJGixCGFcyIRAfuOOrFgmg1mg0bqcoiiUphtwQ3zchAKR/DhoXa0d/NuNhExcM851tSLvoEAFs/NkboUonGxWfVYUpIHi1GDT4934nhzLx+mJRgG7jl76jpg0KlRMp2zEyh5GXRq3Dg/B5PsJpxs6UPNkQ4M+kNSl0XnMHAB+ANhHDjpwnWz7NCoVVKXQzQhKqUSZcVZKJ1ug9sbwAcHz+KT2rNSl0XgQzMAwBenXPAHwhxOINlQKBQozLY
g06rH5ydd2Pj7T/EPCwvxT3fPg2mczyjOnzHBWQ/jw8AFUHO4A5lWHYoL0qUuhSimzAYNbpqfC4NRi9d31eOjg22YNzUTuZlGKKLYCc9k0mFgwD/6Nmc9jE/KDyk4e3w40tCDJQvyuBUjyZJSqcCDd87B1xfkQqtR4rPjLnx6vJNjuxJI+cB97/M2qJQKlJfyhFSStwyLDktK8jBncgZc7iG893kbTra4EeYKtbhJ6SGFoUAIHx9ux7WzHEgz66Quh0h0SoUC0/PTkGszoq6hB8eb3Wjq8GBWUQYm2U1RDTNQ9FL6DndvnROD/hBuvWaS1KUQxZVJr8HC2dm4YV4OtBoVvqjvwoeH2tHZO8i5uyJK2cAVBAG7Pm9FYbYZ0/KtUpdDJImsND2WlOSirDgLgVAYe4868fHhDnT2+hi8IkjZIYUTzW60uQbw8B2z+GcUpTSFQoEChxn5WSY0d3pR3+rG3qOdyDBrMbMwHfZ0g9QlykbKBu47+5phNmhw/ZxsqUshSghKpQKTcywodJjR0unFyXPBm27WYs4UGzItWs7kmaCUDNymDg8On+nGiiVTodVwZRnR+ZRKBYpyLCg4F7yn2vpQc7gdBp0KU3OtKMy+8paRdHkpGbjVNY0w6NRYyodlRJc1EryF2Wa4fSEcPdOFusZenGhxQ2/Q4sY5DmRa9VKXmVRSLnAb2vtx4KQLd984mac6EI2BQqHAJIcZGSYNej1+nD7bhzc/PI3tH57G12bacUtZPmYUpPNZyBikXOJs3X0aZoMGty8slLoUoqSTYdHh2pkO3H1zMV579zg+rm3H/mOdyMsy4ebSPNwwL1fSG5m0dOOEhgkvPmEj1ntGpFTgHmnoRl1jL765dDoMupTqOlFMZWcaserWYty7ZCr2H3Pigy/a8H//Xo+/7D6N62dn48b5uSielBb3u16tRjXmUzIudvF+EUDs94xImdQJhSP4f3+vhyPdwLFbohjRaVT4+oI8fH1BHho7+vH+523Yd8yJj2rbkZWmx+K5ObhhXg6yM41Sl5oQUiZw3zvQivZuH9Z8Yz406pRd70Ekmsk5Vjx8pxUP3FaMz0+6sOdIB6prGvFWTSMm2U0oK7bjmhl2FGabU3a8NyUCt8s9iG0fNWDBNBtKp2dJXQ6RrOm1atwwLxc3zMtFr8ePT4858UV9F6r3DIdvplWH2UUZmFWYgdlFGSk100H2gSsIAv7wtxMAgNUVM1P2NyuRFDIsOlQsLETFwkJ4fAEcOtWNQ6e6cLC+C58c7gAA2NP1mJJrRVG2BYXZw9PQLj51WC5kH7jvfd6GIw09+C//MAO2tNT5TUqUaCxGLW5akIubFuQiIgho7fTieLMbJ1vcON3Wj/3HOkdfazZo4MgwDP8v3YDsDCPs6QbY0vRIMyfvijdZB67HF8Br75/C/Kk2LL0mX+pyiOgc5bkjgAqzLai4rgAA4B0MotnpQbPTC2evD529g6hv6cO+OifO30ZHpVTAZtXDlqYf/f+sc/+OqFQQBCFh/5KVdeBq1EosW1iIW782KWG/AEQ0zGzQYM7kTMyZnHnB+4OhCLr6BuFyD6K7bwhd/UPo7hv+3+GGbvR5Axe8XqVUwGLUwGrUwmLSIMOsQ7pZB6VS+gyQdeDqtWrcu2Sq1GUQ0QRo1Erk2kzItZku+fFgKIyefj+6+ofgDwvY8fEZ9PuC6Oj1oblz+DQLpVKBDLMWNqse9nQDMqw6SYYlZB24RCR/GrUK2ZlGZGcaYbdb0Hy2b/RjQ4EQevr96On3o7t/CCdb+3CytQ8atRKODANyMoxwZBjiNlU0JQJ3osv9iOhC4XDkK8tgxysUjkCtEifw9Fo18rLUyMsavjsOhiJwuQfR0TM8RtzmGoBSqUBOhgHTCtKRZtCIOvSQEoE7keV+F+Px0ESASqWM6c9UvH4+NWol8rJMyMsyQRAE9Hj8ONs1gLauAZw9eHb040UOM9LMsZ+alhKBS0R0MYXi3GwHqx5zJ2f
CMxTCqVY3Wju9aOrwwGrU4M6vT4Muhje8DFwiSnlKpQJ5djPSjBoEQxG0ubzo6B3EwGAQOqMmdteJWUtERDKgUSsxOdeKRXOyMbMo8+qfEAUGLhFRnDBwiYjihIFLRBQnDFwiojiRJHDfeust3HnnnaioqMCf/vQnKUogIoq7uE8Lczqd2Lx5M7Zu3QqtVotVq1bh+uuvx/Tp0+NdChFRXMU9cGtqarBo0SKkp6cDAG6//Xbs3LkT3/72t8f0+eNddmeO4Vy6WLZl0mugECIxay9R+xnLtmLd3kTbMp73NZRrP42X+D5NlNpi1dal+giMP3MuRSEIgnD1l8XOr3/9a/h8Pqxbtw4A8Prrr6O2thbPPvtsPMsgIoq7uI/hRiKRC/amTeTNgomIYinugZuTkwOXyzX6tsvlgsPhiHcZRERxF/fAveGGG7Bnzx709PRgcHAQ7777LpYsWRLvMoiI4i7uD82ys7Oxbt06PPjggwgGg1i5ciUWLFgQ7zKIiOIu7g/NiIhSFVeaERHFCQOXiChOGLhERHHCwCUiihMGbpx5vV7cddddaG1tBTC81LmqqgoVFRXYvHmzxNVN3IsvvojKykpUVlZi06ZNAOTXx+eeew533nknKisr8corrwCQXx8B4Kc//SmeeuopAPLr3+rVq1FZWYl77rkH99xzDw4dOhSfPgoUNwcPHhTuuusuYe7cuUJLS4swODgolJeXC83NzUIwGBQeeeQR4YMPPpC6zHH75JNPhG9+85uC3+8XAoGA8OCDDwpvvfWWrPq4b98+YdWqVUIwGBQGBweFW265RTh27Jis+igIglBTUyNcf/31wve//33ZfZ9GIhHhpptuEoLB4Oj74tVH3uHG0WuvvYZnnnlmdGVdbW0tioqKUFBQALVajaqqKuzcuVPiKsfPbrfjqaeeglarhUajwbRp09DY2CirPi5cuBB/+MMfoFar0d3djXA4jP7+fln10e12Y/PmzXj88ccByO/79MyZMwCARx55BHfffTf++Mc/xq2PDNw4+tGPfoRrr7129O3Ozk7Y7fbRtx0OB5xOpxSlxURxcTFKS0sBAI2NjXjnnXegUChk1UcA0Gg0eP7551FZWYnFixfL7uv49NNPY926dbBarQDk933a39+PxYsX45e//CV+97vf4c9//jPOnj0blz4ycCUk14186uvr8cgjj+DJJ59EQUGBLPu4du1a7NmzB+3t7WhsbJRNH19//XXk5uZi8eLFo++T2/dpWVkZNm3aBIvFgszMTKxcuRLPP/98XPoY96W99CU5buRz4MABrF27FuvXr0dlZSX2798vqz6ePn0agUAAs2fPhsFgQEVFBXbu3AmVSjX6mmTu444dO+ByuXDPPfegr68PPp8PbW1tsukfAHz22WcIBoOjv1QEQUB+fn5cvk95hyuhkpISNDQ0oKmpCeFwGNXV1Um9kU97ezueeOIJ/OxnP0NlZSUA+fWxtbUVGzZsQCAQQCAQwK5du7Bq1SrZ9PGVV15BdXU13nzzTaxduxZLly7Fyy+/LJv+AYDH48GmTZvg9/vh9Xqxbds2fPe7341LH3mHKyGdToeNGzdizZo18Pv9KC8vx7Jly6Qua9x+85vfwO/3Y+PGjaPvW7Vqlaz6WF5ejtraWixfvhwqlQoVFRWorKxEZmambPp4Mbl9n95yyy04dOgQli9fjkgkgm9961soKyuLSx+5eQ0RUZxwSIGIKE4YuEREccLAJSKKEwYuEVGcMHCJiOKEgUtEFCcMXEoajzzyCHp6eib8mn379uGuu+666vVmzpx5ybZ27dqFH/7whwCGt/nbuXMnWltbUVZWdtU2KbVx4QMljU8++SQmr5moW2+9Fbfeeqvo1yH54R0uJYV/+Zd/AQA89NBD2L9/P1avXo2qqircfffdeOONN77ymvb2drz//vtYtWoVVqxYgZtvvhm/+MUvor7uL37xC9x7772455578P777wMAtm7disceeywm/aL
UwjtcSgo/+clPsHXrVvz+97/H/fffjyeffBIVFRVwOp247777UFRUdMFrMjIy8OSTT2Ljxo2YPHkynE4nbrnlFjz44INRXXfSpEn4wQ9+gJMnT2L16tV45513ROohpQIGLiWV06dPw+/3o6KiAgCQnZ2NiooKfPTRRxeMoSoUCvzqV7/CBx98gOrqapw+fRqCIGBwcDCq6z3wwAMAgBkzZmDatGn44osvYtcZSjkcUqCkolAovrJPqSAICIVCF7zP5/Ph3nvvRV1dHebMmYMnn3wSarUa0W4dolR++SMSiUSgVvMehcaPgUtJQ6VSIT8/H2q1Gu+++y4AwOl04m9/+xtuuOGG0deEQiE0NTXB6/XiO9/5DpYuXYp9+/YhEAggEolEdc1t27YBAOrq6tDc3IySkpLYdopSCn9dU9JYtmwZ/vEf/xFbtmzBD3/4Q7zwwgsIh8N44oknsGjRotHXrF69Gs899xxuvvlm3HHHHdBqtZgxYwamT5+OpqYmaLXaMV+zpaUFy5cvh0KhwL//+78jPT1dpN5RKuD2jEREccI7XEpZL7/8Mt56661Lfuyf/umfcPfdd8e5IpI73uESEcUJH5oREcUJA5eIKE4YuEREccLAJSKKEwYuEVGc/H/38wAJadXY2gAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.displot(data=modin_tips, x=\"total_bill\", kde=True)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVwAAAFcCAYAAACEFgYsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAA09klEQVR4nO3deXyU9b0v8M/s+2SZzGQjCVvYIYlVBLVE0RPRGEWKFnsvevRcX3qvhVt6XrUeDkfvrV0or/ZQl3Laq61tbe/tS1tQjEg9RcUlLIpCIGwBspNMJsskM5lk1uf+ERIBWTLJPPPMPPN5v159lSST3/P9meSTJ7/ntygEQRBARESiU0pdABFRqmDgEhHFCQOXiChOGLhERHHCwCUiihMGLhFRnKjFbHz16tXo6emBWj18mR/84AcYGBjAT37yE/j9ftxxxx1Yt26dmCUQESUM0QJXEAQ0Njbi/fffHw3coaEhLFu2DK+++ipyc3Px2GOPYffu3SgvLx9zu93dXkQi8pk6nJFhRG+vT+oyRCX3Psq9fwD7GA273XLZj4kWuGfOnAEAPPLII3C73bj//vsxY8YMFBUVoaCgAABQVVWFnTt3RhW4cqNWq6QuQXRy76Pc+wewj7Ei2hhuf38/Fi9ejF/+8pf43e9+hz//+c84e/Ys7Hb76GscDgecTqdYJRARJRTR7nDLyspQVlY2+vbKlSvx/PPP42tf+9ro+wRBgEKhiKpdm80csxoTxZX+BJELufdR7v0D2MdYEC1wP/vsMwSDQSxevBjAcLjm5+fD5XKNvsblcsHhcETVrtzGcO12C1wuj9RliErufZR7/wD2Mdp2Lke0IQWPx4NNmzbB7/fD6/Vi27Zt+O53v4uGhgY0NTUhHA6juroaS5YsEasEIqKEItod7i233IJDhw5h+fLliEQi+Na3voWysjJs3LgRa9asgd/vR3l5OZYtWyZWCURECUWRbNszckgh+ci9j3LvH8A+RtvO5XClGRFRnDBwiYjihIFLRBQnDFwiojhh4BIRxQkDl4goTkTdnpHiKy3dCK0mNhtwBIJh9LnlvTsUUbwxcGVEq1Hhpa2HYtLWoytKYtIOEX2JQwpERHHCwCUiihMGLhFRnDBwiYjihIFLRBQnDFwiojhh4BIRxQkDl4goThi4RERxwsAlIooTBi4RUZwwcImI4oSBS0QUJwxcIqI4YeASEcUJA5eIKE64AbmE0tKNAAC73SJxJUQUDwxcCWk1Kvzfvx3HwIA/Ju3xlAaixMYhBSKiOOEdboqKRAT4/CGolArotSooFAqpSyKSPQZuinF7/DjZ6kZn7yAiwvD79FoVChxmTM9Pg0bNP3qIxMLATRGCIOB4sxv1rX3QqJWYnGOF1aRBOCKgs3cQ9a19aHZ6cO1MB2xpeqnLJZIlBm4KiAgCPj/pwtkuHwodZsybkgn1eXeyU3Kt6PX48UW9CzV1Hbim2C5htUTyxb8fZU4QBBw61Y2zXT7MKcpAaXHWBWE7IsOiw00LcpFh0eHzehc+P94pQbVE8sbAlbnGDg9aOr2YMSkN0yelXfG1WrUK18/OhsWowcY/7IezxxenKolSAwNXxvoGAjjS0ANHhgE
zC9PH9DkatRLXz86GSqnEr96sQzAUEbdIohTCwJWp4aGELmjUSlxTnBXVtC+DTo3vrCpDk9ODt/c0ilckUYph4MpUQ7sHbm8A86fYoNWoov786+flYtHcbLy9pwlnuwZEqJAo9TBwZSgQCuNEixv2dD3ysozjbmfV0mLotSr86T9PQhCEGFZIlJoYuDJ0qrUPwVAEc4oyJrSCzGrS4u6bpuBYUy+ONPTEsEKi1MTAlZmhQBhn2j2YZDchzaybcHu3lOXDnq7H6++fQoR3uUQTwsCVmTNn+xGJCJhRkB6T9tQqJe5dMhWtrgF8fsIVkzaJUhUDV0a8g0E0dvQjz2aE2aCJWbsLZ2UjJ9OIt2oaOZZLNAEMXBl5d28jQmEBxVdZ4BAtpVKBysVFaOn04tCp7pi2TZRKGLgyIQgC3q5phM2qi8nY7cUWzc1GhkWHvx9oiXnbRKmCgSsTzp5BdPb4MCXXKkr7KqUSt5Tl42hjL+flEo0TA1cmGjs8sKXpkWMb/7zbq1lSmge1Soldn7eKdg0iOWPgysCgP4RO9yBuW1gIpYgnN1iNWlw/24Gawx3wDYVEuw6RXDFwZaCl0wsAuO26QtGvtfRrk+APhvHJkXbRr0UkNwzcJCcIApo7vchK0yPHZhL9elNyrZiWZ8V7B1q5EIIoSgzcJNfr8cM3FEKBwxy3ay792iQ4ewdxotkdt2sSyQEDN8m1dQ1AqVQgJ1O8h2UXu2aGHXqtCjUcViCKCgM3iUUEAW1dA8jOMMT1tF2dRoXrZjnw2QkX/IFw3K5LlOxE/yn96U9/iqeeegoAUFNTg6qqKlRUVGDz5s1iX1r2uvqGEAhGkG8Xf+z2YjfOz4U/EMaBkzz7jGisRA3cPXv2YNu2bQCAoaEhrF+/Hlu2bMGOHTtw5MgR7N69W8zLy15Htw8qpQLZ6Ya4X7t4Uhqy0vSoOdIR92sTJSvRAtftdmPz5s14/PHHAQC1tbUoKipCQUEB1Go1qqqqsHPnTrEuL3uCIKCjxwdHhgEqVfxHhhQKBW6Yl4Njjb3o6R+K+/WJkpFarIaffvpprFu3Du3tww9WOjs7YbfbRz/ucDjgdDqjbtdmi9/T+HgxmaLf+6C7bxBDgTCKcq0XfP542rocu91yxY9XlU/H9k8aUdvYi/tunTGhtpKd3PsHsI+xIErgvv7668jNzcXixYuxdetWAEAkErng9AFBEMZ1GkF3txeRiDzmf458cQcG/FF/bkNbHxQA0o2aCz5/PG1djsvlueLHVQBmTErDf+5rQvn8nMt+Pe12y1XbSmZy7x/APkbbzuWIErg7duyAy+XCPffcg76+Pvh8PrS1tUGl+vIwQ5fLBYfDIcblU0JHjw+ZVt24DoiMpevnZOPVd0+irWsAk+zy++uDKJZEGfx75ZVXUF1djTfffBNr167F0qVL8fLLL6OhoQFNTU0Ih8Oorq7GkiVLxLi87A0MBeHxBeM69/ZyrpnpgEIBfHqMsxWIriZuT1t0Oh02btyINWvW4M4778TUqVOxbNmyeF1eVjq6fQAg6s5gY5Vm0mJmQTo+O9HJ0yCIrkK0h2YjVqxYgRUrVgAAFi9ejO3bt4t9Sdnr6PHBYtTApI/dMToTcd3sbLz6txNocw1gUhyXGBMlG9EDl2IrEAyju9+PGTE+Rudi4XBkzE9sKxZPwZ/ePYG6FjfK5uZesi0iYuAmna6+4TmvjgxxFzuoVEq8tPXQmF+fadXj7Y/PwDfg/8pshUdXlMS6PKKkxL0UkozLPQi1SoF0S+zPLZuIvCwTvIMh9PuCUpdClLAYuEnG5R5CVppe1JMdxiP33IyJdp53RnRZDNwkMjAYhM8fgl2CvROuRqdVwWbVo73HJ3UpRAmLgZtEXO5BAEjIwAWAHJsBHl8QA4McViC6FAZuEnH1DcGgU8GkT8xnnSMLMTp4l0t0SQzcJCEIArrcQ7CnGca1B0U8mPQaWIwaBi7RZTBwk4TbG0AwHIE9XS91KVe
Um2lEd78f/iBPgiC6GAM3SYyM32Yl6PjtiJHlxs7eQYkrIUo8DNwk4XIPIs2khU7i3cGuJs2khV6rGt3vgYi+xMBNAuFIBL0eP2xpiT2cAAyfBJGTaYTLPcglvUQXYeAmAbcngIgA2KyJH7jA8GyFcESAq49H7xCdj4GbBLrPnRlmsybWct7LyUrTQ61ScFiB6CIM3CTQ3T8Ei1Ej+ekOY6VUKpCdYURHr4975BKdh4Gb4CKCMDx+myTDCSNyMg0IBCPo8cTujDWiZMfATXD9AwGEwkLSDCeMcGQYoVBw1RnR+Ri4Ca773IOnzCS7w9WolbBZ9XAycIlGMXATXHe/H0a9GgZdYu6fcCU5mUZ4B0M46/JKXQpRQmDgJjBBENDTP5R047cjsjOHV8XtP+qUuBKixMDATWDewSACoUjSjd+OGNnM5tOjHVKXQpQQGLgJrLt/+Al/so3fni87w4i6M93wDXGPXCIGbgLr6R+CTqNM2P1vxyIn04BwRMDhMz1Sl0IkOQZuAuv1+JFh0Sfs/rdjkWHRwWrS4tCpLqlLIZIcAzdBBYJhDAyFkJFgp/NGS6FQ4NrZ2Th8phvhCDezodTGwE1QvedWaCV74ALAwjk5GBgK4VRrn9SlEEmKgZugRgI33ayVuJKJK5tph0qpwKFT3VKXQiQpBm6C6vH4YTVpoVYl/5fIqNdgVmE6DnIcl1Jc8v80y5AgCHB7/ciUwXDCiJLpWejo8XGpL6U0Bm4C8gwGEQoLshi/HVEyPQsAeJdLKY2Bm4Dk9MBshD3dgPwsE6eHUUpj4Cag3n4/NOrkXvBwKSXTs3CypY+rzihlMXATUK/XjwyLLqkXPFxK6fQsRASuOqPUxcBNMMFQBB5fEBlm+QwnjJiaZ4XZoOGwAqUsBm6CcXvlN347QqlUoGSajavOKGUxcBPMSOCmW5J/wcOllEzP4qozSlkM3ATj9gZg1KuhVSfHCb3Rmjslk6vOKGUxcBOM2+uXxXLeyzHo1Fx1RimLgZtA/IEwBv1hpMvwgdn5uOqMUhUDN4G4B0Y2rJF34JZy1RmlKAZuAnF7AwCANJN8hxQAICvdgHw7V51R6mHgJhC3xw+zQQONWv5fltJzq84GuOqMUoj8f7KTiNsbkPUDs/OVnFt1doSrziiFMHATxJA/BH9Q/g/MRkzNtcJi5KozSi0M3AQxMn6bKne4SqUCC6Zy1RmlFgZughhZYWaV+QOz83HVGaUaBm6CcHsDsBg1sjhSZ6zmTsmEWqXg9DBKGanz053ARo7UkeMOYVdi0KkxszADB7nMl1IEAzcBDAXCCIQiSEuR8dvzlU7PgrPHhw6uOqMUwMBNAH0Dww/MUmn8dkTJNBsAcLYCpQRRA/e5557DnXfeicrKSrzyyisAgJqaGlRVVaGiogKbN28W8/JJo/9c4KYZUy9wueqMUologbt//37s3bsX27dvx1//+le8+uqrOH78ONavX48tW7Zgx44dOHLkCHbv3i1WCUmjbyAAk14NdQqsMLsUrjqjVCHaT/jChQvxhz/8AWq1Gt3d3QiHw+jv70dRUREKCgqgVqtRVVWFnTt3ilVC0ugfCKTkcMKIktGzzvjwjORN1GNhNRoNnn/+efz2t7/FsmXL0NnZCbvdPvpxh8MBp9MZVZs2mznWZUoqGIpgYCiEqZPSYTJNfJZCLNqIdVvhcAR2u+WyH8+0mZFmPozjLX2oKi8eU3uqBJs+d6X+yQX7OHGin8O9du1aPProo3j88cfR2Nh4wUm0giBEfTJtd7cXkYgQ6zIlYbdb4PYMAQAMGiUGzm3POBGxaCPWbalUSry09dAVX2M1arGnth2//utBKK/yPfHoihK4XJ6Y1BYLdrsloeoRA/sYXTuXI9ptwunTp3Hs2DEAgMFgQEVFBfbt2weXyzX6GpfLBYfDIVYJSaHXMxxqct+S8WqyMw0IhiPo6Y/dLwyiRCNa4La2tmLDhg0IBAIIBALYtWsXVq1ahYaGBjQ
1NSEcDqO6uhpLliwRq4Sk4Pb4oVUrodfK8wyzsXKkGaBUgKdAkKyJNqRQXl6O2tpaLF++HCqVChUVFaisrERmZibWrFkDv9+P8vJyLFu2TKwSkkKvxw+rSRv10IrcqNVK2NL06OjxYc7kjJT/70HyJOoY7po1a7BmzZoL3rd48WJs375dzMsmjXA4ArfXjyk58n8YMRY5mUYcPtMDz2AQ1hSck0zyl1iPelNMm2v4AWAqTwk7X67NCABo7+KwAskTA1dCZ872A+ADsxF6rRo2qw5nuwekLoVIFAxcCTW09UGpUMBs0EhdSsLItZng8QXh9XHVGckPA1dCZ872Ic2shVLJB0QjRoYVeJdLcsTAlYggCGg424cMq17qUhKKQadGhkWH9m6O45L8MHAl4vYG0OcNIMOSWpuOj0WuzYi+gQA3syHZYeBKpKVzeAlhOgP3K/I4W4FkioErkZZOLwCkzLHo0TDqNUgzaTmOS7LDwJVIs9OLHJsRWk1qL+m9nLwsI9zeAHz+kNSlEMXMmAJ3/fr1X3nf2rVrY15MKmnu9GJKXprUZSSsXJsJANDOu1ySkSsu7X3mmWfgdDpx4MAB9PT0jL4/FAqhpaVF9OLkaigQQmePD7deVwhBkMdWk7FmNmhgNWrQ3uXDNP5iIpm4YuCuXLkS9fX1OHHiBG6//fbR96tUKpSWlopdm2y1ugYgAJiaZ8Xptj6py0lYuVkmnGh2Y9AfgkEn+tbNRKK74nfx/PnzMX/+fNxwww3IycmJV02y1+IcnqEwJT+NgXsF+ecCt61rANPzeZdLyW9Mtw3t7e343ve+h76+vgv+BH7rrbdEK0zOmju9MOnVsKcbpC4loZkNGqSbtWhzMXBJHsYUuE8//TRWrFiBOXPmcJ/SGGh2elHgMPO/5RhMsptxpKEHHl8AFm7ZSEluTIGrVqvx8MMPi11LSohEBLS5vLi5LF/qUpJCXpYRRxp60OYawKwiBi4ltzFNCysuLsaJEyfEriUlOHt9CIQiKHDI6/Rhsei1amSl6dHaNcAZHZT0xnSH29LSgm984xvIy8uDTvflyiiO4Uav2Tm8woyBO3aT7CYcPNUNN/eeoCQ3psBdt26d2HWkjOZOD1RKBfKyTFKXkjRybSbUnu5Gq8vLwKWkNqbAnTFjhth1pIwWpxf5WSaoVVxVPVYatRLZmUa0dQ1g7pRMqcshGrcxBe6iRYugUCggCMLok3W73Y4PP/xQ1OLkqLnTi/lTGRrRyreb0N7tQ5d7SOpSiMZtTIF7/Pjx0X8HAgFUV1ejoaFBtKLkqs/rR/9AAIUOntIbrewMA9QqBdq6vFKXQjRuUf9dq9VqsWLFCnzyySdi1CNrI1syFmbzgVm0VEolcm3Dd7n+YFjqcojGZUx3uG63e/TfgiDgyJEj6O/vF6sm2Wru5AyFiSiwm9DS6cWew+2YW8CVZ5R8oh7DBQCbzYZ//dd/FbUwOWp2emCz6mHU85Te8bCl6WHUqfH3/U2YW7BA6nKIohb1GC6NX0unl8MJE6BQKFCQbcah+i50uQeRxb0oKMmMaQw3EongpZdewurVq/HAAw/gxRdfRCjEnfij4Q+E0dHt43DCBA3vQQF8fLhd6lKIojamwP35z3+OvXv34qGHHsLDDz+ML774Aps2bRK7Nllp7fJCAFCYzRkKE2HUqVFabMcnh9sR4VJfSjJjCtyPPvoIv/rVr3DbbbehoqIC//Ef/8E5uFFqObekt5B3uBP2DwuL0N3vx7GmXqlLIYrKmAJXEARoNF8+6NFqtRe8TVfX0umFQaeGLU0vdSlJ7/p5OTDp1fi4lsMKlFzGFLizZs3Cj3/8YzQ3N6OlpQU//vGPudw3Ss2dHhRyD9yY0GpUWDQnBwdOuDAwFJS6HKIxG1PgPvPMM+jv78eqVatw3333obe3F//2b/8mdm2yEYkIaO0cQAFnKMTMTQtyEQpHsO+oU+pSiMbsioEbCATw/e9/H3v27MHGjRtRU1O
DBQsWQKVSwWxmeIxVp3sQ/mCYMxRiqCjHgkKHGR9xWIGSyBUD9/nnn4fX68U111wz+r5nn30W/f39eOGFF0QvTi6azx0ayT0UYuvrJXlo6vCM/vclSnRXDNwPPvgAP//5z2Gz2Ubfl52djU2bNuHvf/+76MXJRUunl3vgimDR3Gxo1Uq893mb1KUQjckVA1ej0UCv/+pTdbPZDK2W50uNVbPTi1ybCRo198CNJZNeg0Vzs7H3aAd8fHhGSeCKCaBUKuH1fnU7PK/Xy5VmUWjp9HBJr0iWXjMJgWAEHx/ukLoUoqu6YuDedddd2LBhA3w+3+j7fD4fNmzYgIqKCtGLk4P+gQDc3gAXPIikMNuCaflWvP95K1eeUcK7YuA+9NBDsFgsuPHGG3H//fdj5cqVuPHGG2G1WvHEE0/Eq8akNrIHbgGX9Ipm6TWT4OwdxLFGrjyjxHbF3cKUSiWeffZZPP7446irq4NSqcSCBQvgcDjiVV/Sa+4cfoLOKWHiuXamA3/eVY/3Pm/lmWeU0Ma0PWN+fj7y8/PFrkWWWpxeZFp1MBu4FFosGrUSS0rysGNvE7r7hrh8mhIWH5uLrLnTy/m3cXBz6fANwQcHOUWMEhcDV0SBYBjt3QMcTogDW5oepdOz8OGhswiGIlKXQ3RJDFwRtXUNQBB4aGS8LL1mEjy+IPYf4/4KlJgYuCLiDIX4mjM5A/l2E/62v2X0/D2iRMLAFVGz0wODToUsPsSJC4VCgduvK0Sry4ujnCJGCYiBK6LmTi8K7GYouQdu3Fw/JxtpZi127m+WuhSir2DgiiQiCGjp9KKAMxTiSqNW4ravTUJdQ8/okA5RohjTPFyKnss9CH8gzE3HRRAOR2C3X/4X2crbZuLtPU3YXduOdQ9cc9nXAcMzSfrcviu+hihWGLgiGT00koEbcyqVEi9tPXTF1+TZjHj/QAtUEGDQXf7b/NEVJbEuj+iyRB1SePHFF1FZWYnKysrRY9VrampQVVWFiooKbN68WczLS6q50wulQoF87oErial5VggC0NDeL3UpRKNEC9yamhp8/PHH2LZtG9544w3U1dWhuroa69evx5YtW7Bjxw4cOXIEu3fvFqsESbU4PcjNMkKjVkldSkoy6jXIyzKiscPDhRCUMEQLXLvdjqeeemr0SPVp06ahsbERRUVFKCgogFqtRlVVFXbu3ClWCZIaXtLL4QQpTc9LQygsoKmDR/BQYhAtcIuLi1FaWgoAaGxsxDvvvAOFQgG73T76GofDAadTfquCPL4Aej1+zlCQWLpFB3u6HqfO9iEU5l0uSU/0h2b19fV47LHH8OSTT0KlUqGxsXH0Y4IgQBHlHFWbLfHvGs+edAEA5s+wX/Fp+giTSReza6dCW9G0V1Jsx98/bUFH7xBmFmVc8jVj+RpdTSzaSHTs48SJGrgHDhzA2rVrsX79elRWVmL//v1wuVyjH3e5XFHvrdvd7UUkktjLNg+dHL5rT9Or4XJd/s/ZkS/uwIA/ZtdOhbaiac+oVcFm1aPuTDdyM/VQKb/6R92VvkZjYbdbJtxGomMfo2vnckQbUmhvb8cTTzyBn/3sZ6isrAQAlJSUoKGhAU1NTQiHw6iursaSJUvEKkEyzU4vbNwDN2HMLEiDPxhGs5MLIUhaot3h/uY3v4Hf78fGjRtH37dq1Sps3LgRa9asgd/vR3l5OZYtWyZWCZJp6vCgkBvWJAxbmh6ZFh3qW/tQmG2BSsml1iQN0QJ3w4YN2LBhwyU/tn37drEuK7mhQAjOHh8WzcmWuhQ6R6FQYEZBOvYedaKl04vJOfxlSNLgXgox1tLphQDwDjfB2NP1SDdrcaq1L+GfAZB8MXBjbGScsIh3UQlFoVBgZkE6fP4QWlwcyyVpMHBjrKnDA4tRg3SzVupS6CKODAPSzVqcbHYjHOG8XIo/Bm6MNTs9KMq2RD2/mMSnUCgwuygDg4EwGtvlPcWJEhMDN4aCoQjaugY4fpvA7OkG2NP
1qG/t4x4LFHcM3Bg62zWAcETg+G2Cm12YgUAogtNn+6QuhVIMAzeGmpzDf6ZyD9zElm7RIddmxOm2frg9sV0BR3QlDNwYajp3aKQ93SB1KXQVswszEIkIeH3XSalLoRTCwI2h5g4PChwWHhqZBMxGDQqyzdhR04gu96DU5VCKYODGSCQyfGhkER+YJY2ZBelQKICtH52RuhRKEQzcGGnv8SEQinD8NokYdGosL5+GvXVOnG7jAzQSHwM3RprPPTDjHW5yue/WGUgzafH/dtUjInDJL4mLgRsjTR0eaNRK5GYZpS6FomDQqfGN8mk4c7Yf+47K7/QRSiwM3BhpdnowyW665AbXlNhumJ+DyTkW/OWD0/AHwlKXQzLGdIgBQRDQ5OQDs2SlVCjwwG3F6PX48c6+JqnLIRlj4MaAq28Ig/4Ql/QmseJJ6Vg424F39jWju29I6nJIphi4MdDcMbLCjIGbzO67eToA4LX3T0lcCckVAzcGGjr6oVIqUOAwSV0KTYAtTY/KxUX49HgnjpzplrockiEGbgw0tnswyW6GRq2SuhSaoDuuL0J2phF/fPckAkE+QKPYYuBOUEQQ0NjRjym5HE6QA41aiQcrZqDTPYi39/ABGsUWA3eCnD0+DPrDmJxrlboUipHZkzOxeG42duxtQnv3gNTlkIwwcCdo5OSAKQxcWbl/aTF0GhVe/dsJCFyBRjHCwJ2gho5+aNVK5HGFmaykmbRYefM0HG92Y09dh9TlkEwwcCeosd2DwhwLV5jJ0JLSPEzLs+LPu06hfyAgdTkkA2qpC0hm4UgEzU4PykvzpS6FxikcjsBuv/wDz3/+r9fif/77B3h99xk89dB1l32d3W5BIBhGn9snRpkkEwzcCWhzDSAQinCGQhJTqZR4aeuhK75men4aPqk9i//9f2qQl/XVudYmkw4DA348uqJErDJJJvh38AQ0nlthxhkK8jYt34p0sxa1p7vh59xcmgAG7gQ0tPfDoFPDkcEzzORMqVCgtDgLoXAEh09zBRqNHwN3Ahra+zE5h2eYpQKrUYuZhek42+3D2S7OzaXxYeCOUzAURptrgPNvU8i0/DSkmYaHFoYCIanLoSTEh2ZRSks3QqtR4WhDN8IRAaWzsq/4lJvkQ6lQ4JoZWfjwUDu+qO/CojnZUPCvG4oCAzdKWo0KL209hFPnDh38rK4dR+o7x9UWn2onH4tRi7mTM1B7pgcN7R5MzeNfODR2HFIYp16PH0adGnotdwhLNUU5FmRnGHC0sYcLIigqDNxxEAQBPf1+ZFh0UpdCElAoFCidngWNWokDJ10IhyNSl0RJgoE7DoP+MPzBMDKtDNxUpdOqUFacBY8viIP1LqnLoSTBMdxx6PUMn3nFO9zU5sgwYkquBSeb3UgzaqUuh5IAA3ccejx+qJQKWPlDlvLmTM5E30AQX9S70NE9AI7o05VwSGEcej1+pJu1UCo5JSjVqZQK3LggFwDw01c/QzDE8Vy6PAZulPzBMPoGAsiw6KUuhRKE2ahFWXEWTrW48dp7PPGXLo+BG6VTLW4IApDJ8Vs6T67NhOXl07Dr81bsP+aUuhxKUAzcKJ1o6gHAB2b0VQ9VzsG0PCteeec42rjfAl0CAzdKx5t6YdSroeOCB7qIWqXEf18+Dzq1Ei/+tRa+oaDUJVGCYeBGQRAEHG3o5nACXVamVY//ce98dPUN4dfbjyIS4QGU9CUGbhTau33o8wZgS+MDM7q8GQXp+NZtxTh8phvbPjojdTmUQDgPNwonWtwAAJuVgUtXdnNZPpqcHry9pwmF2RZcN8shdUmUAHiHG4WTLW5kWnUw6fl7iq5MoVDgv/zDTEzLt+I3bx9FS6dX6pIoATBwx0gQBJxo7sW8qVncA5XGRKNW4ol758OgU+OFv9bCO8iHaKmOgTtGLvcg3N4A5k6zSV0KJZF0sw7fvnc+3F4/frn1MELcWSylMXDH6ESzGwAwbyoDl6IzLT8ND985Gyda3Pj
dO8chCJy5kKo4GDlGJ1rcMBs0KMjmcToUvcVzc9DZO4g3P25AdoYBVTdOkbokkgADd4xOtrgxsyCd47c0bnffOBmdvT5s+6gB9gwDFs3JkbokijNRhxS8Xi/uuusutLa2AgBqampQVVWFiooKbN68WcxLx1R33xC6+oYwozBd6lIoiSkUCvzjHbMxoyAdv337OOpb3VKXRHEmWuAeOnQIDzzwABobGwEAQ0NDWL9+PbZs2YIdO3bgyJEj2L17t1iXj6kTLb0AgJkF6dIWQklPo1bi2yvmw2bV4YW/HkZnr0/qkiiORAvc1157Dc888wwcjuEJ37W1tSgqKkJBQQHUajWqqqqwc+dOsS4fU8eb3DDp1ZhkN0tdCsmA2aDBd+4rgSAI2Px6Lfp9PIgyVYg2hvujH/3ogrc7Oztht9tH33Y4HHA6o9/GzmaLb+gJgoBjzb0onelAdvbwkdgmU2z3Uohle6nQVqzbi0VbI23Y7WN7qGq3W/D0f1uEf/tVDba8cQQ/fPxGGHSJ/UhlrH1LZmL3MW5f4UgkcsEDJ0EQxvUAqrvbG9cNQdq6BtDdN4TpuRa4XB7Y7RYMDPhjeo1YtpcKbcW6vYm2ZTLpRttwuTxj/jy7WYvH7pmLF7cexg9e2oO1KxdArUrMmZp2uyWqviWjWPXxSqEdt69uTk4OXK4vTzd1uVyjww2JrK5heP/buZMzJa6E5Kis2I6Hls3CkYYe/HbHMUQ4R1fW4ha4JSUlaGhoQFNTE8LhMKqrq7FkyZJ4XX7c6hp6kJ1pRFa6QepSSKaWlOThG+VTsbfOidfeO8WFETIWtyEFnU6HjRs3Ys2aNfD7/SgvL8eyZcvidflxCYYiONHci68vyJO6FJK5OxcVoc8bwLuftiDNrMUd1xdJXRKJQPTAfe+990b/vXjxYmzfvl3sS8bMqbY+BEIRzJ3C4QQSl0KhwKrbiuEZDOL190/DpNdgSQl/0ctNYj8WlVhdQw9USgVmcsEDjUE4HJnwU+7vP7QQP3plH36/8zisVj1K+cteVhi4V1DX0INp+WkJP12HEoNKpcRLWw9NuJ2cdD1sVj1efO0gHr9nHq7l5uWykZhzUBJAvy+AJqeHwwkUdyqVEgtnOzCzKBO/3l6Hg/VdUpdEMcLAvYzDp7sBAPMYuCQBtUqJ//XoIhRmm7HljcOj0xMpuTFwL+NgfRfSzVoU5ch/dQ0lJqNeg3X3lyLXZsILf63FieZeqUuiCWLgXkIgGMbhhm6UFduh5HaMJCGzQYN/XlWKrHQDNr9+CMcaeaebzBi4l3C0qReBYARlxVlSl0IEq1GL7z1QBke6Ab/4Sy0On+mWuiQaJwbuJRysd8GgU2FWUYbUpRABANJMWjz5rWuQZzPh+b/U4ouTrqt/EiUcBu5FIhEBB+u7MH+qLWE3EqHUZDZo8L0HSlGUY8GWN45g/7Hod9sjaTFRLnLmbD/6fUGUFduv/mKiODPqNfjnb5ZiWp4Vv36zDrsOtEpdEkWBgXuRz+tdUCkVmM/TeSlBGXRqrPtmKUqmZ+FP/3kSf/ngNDe8SRIM3PMIgoAv6rswqygDRj1Xl1Hi0mlUeGLFPNxcmocde5vwm7ePIRSOSF0WXQVT5TzNTi+cPT7cfl2B1KUQjWlvhu/+12sxKeck/rjzOHyBML6/+lqYjdqvvC4QDKPPzfPTpMbAPc+eug6olAquXaeEEM3eDKXTbThU78J/+9F/YuFsBywXhe6jK0rEKJGixCGFcyIRAfuOOrFgmg1mg0bqcoiiUphtwQ3zchAKR/DhoXa0d/NuNhExcM851tSLvoEAFs/NkboUonGxWfVYUpIHi1GDT4934nhzLx+mJRgG7jl76jpg0KlRMp2zEyh5GXRq3Dg/B5PsJpxs6UPNkQ4M+kNSl0XnMHAB+ANhHDjpwnWz7NCoVVKXQzQhKqUSZcVZKJ1ug9sbwAcHz+KT2rNSl0XgQzMAwBenXPAHwhxOINlQKBQozLYg06r
H5ydd2Pj7T/EPCwvxT3fPg2mczyjOnzHBWQ/jw8AFUHO4A5lWHYoL0qUuhSimzAYNbpqfC4NRi9d31eOjg22YNzUTuZlGKKLYCc9k0mFgwD/6Nmc9jE/KDyk4e3w40tCDJQvyuBUjyZJSqcCDd87B1xfkQqtR4rPjLnx6vJNjuxJI+cB97/M2qJQKlJfyhFSStwyLDktK8jBncgZc7iG893kbTra4EeYKtbhJ6SGFoUAIHx9ux7WzHEgz66Quh0h0SoUC0/PTkGszoq6hB8eb3Wjq8GBWUQYm2U1RDTNQ9FL6DndvnROD/hBuvWaS1KUQxZVJr8HC2dm4YV4OtBoVvqjvwoeH2tHZO8i5uyJK2cAVBAG7Pm9FYbYZ0/KtUpdDJImsND2WlOSirDgLgVAYe4868fHhDnT2+hi8IkjZIYUTzW60uQbw8B2z+GcUpTSFQoEChxn5WSY0d3pR3+rG3qOdyDBrMbMwHfZ0g9QlykbKBu47+5phNmhw/ZxsqUshSghKpQKTcywodJjR0unFyXPBm27WYs4UGzItWs7kmaCUDNymDg8On+nGiiVTodVwZRnR+ZRKBYpyLCg4F7yn2vpQc7gdBp0KU3OtKMy+8paRdHkpGbjVNY0w6NRYyodlRJc1EryF2Wa4fSEcPdOFusZenGhxQ2/Q4sY5DmRa9VKXmVRSLnAb2vtx4KQLd984mac6EI2BQqHAJIcZGSYNej1+nD7bhzc/PI3tH57G12bacUtZPmYUpPNZyBikXOJs3X0aZoMGty8slLoUoqSTYdHh2pkO3H1zMV579zg+rm3H/mOdyMsy4ebSPNwwL1fSG5m0dOOEhgkvPmEj1ntGpFTgHmnoRl1jL765dDoMupTqOlFMZWcaserWYty7ZCr2H3Pigy/a8H//Xo+/7D6N62dn48b5uSielBb3u16tRjXmUzIudvF+EUDs94xImdQJhSP4f3+vhyPdwLFbohjRaVT4+oI8fH1BHho7+vH+523Yd8yJj2rbkZWmx+K5ObhhXg6yM41Sl5oQUiZw3zvQivZuH9Z8Yz406pRd70Ekmsk5Vjx8pxUP3FaMz0+6sOdIB6prGvFWTSMm2U0oK7bjmhl2FGabU3a8NyUCt8s9iG0fNWDBNBtKp2dJXQ6RrOm1atwwLxc3zMtFr8ePT4858UV9F6r3DIdvplWH2UUZmFWYgdlFGSk100H2gSsIAv7wtxMAgNUVM1P2NyuRFDIsOlQsLETFwkJ4fAEcOtWNQ6e6cLC+C58c7gAA2NP1mJJrRVG2BYXZw9PQLj51WC5kH7jvfd6GIw09+C//MAO2tNT5TUqUaCxGLW5akIubFuQiIgho7fTieLMbJ1vcON3Wj/3HOkdfazZo4MgwDP8v3YDsDCPs6QbY0vRIMyfvijdZB67HF8Br75/C/Kk2LL0mX+pyiOgc5bkjgAqzLai4rgAA4B0MotnpQbPTC2evD529g6hv6cO+OifO30ZHpVTAZtXDlqYf/f+sc/+OqFQQBCFh/5KVdeBq1EosW1iIW782KWG/AEQ0zGzQYM7kTMyZnHnB+4OhCLr6BuFyD6K7bwhd/UPo7hv+3+GGbvR5Axe8XqVUwGLUwGrUwmLSIMOsQ7pZB6VS+gyQdeDqtWrcu2Sq1GUQ0QRo1Erk2kzItZku+fFgKIyefj+6+ofgDwvY8fEZ9PuC6Oj1oblz+DQLpVKBDLMWNqse9nQDMqw6SYYlZB24RCR/GrUK2ZlGZGcaYbdb0Hy2b/RjQ4EQevr96On3o7t/CCdb+3CytQ8atRKODANyMoxwZBjiNlU0JQJ3osv9iOhC4XDkK8tgxysUjkCtEifw9Fo18rLUyMsavjsOhiJwuQfR0TM8RtzmGoBSqUBOhgHTCtKRZtCIOvSQEoE7keV+F+Px0ESASqWM6c9UvH4+NWol8rJMyMsyQRAE9Hj8ONs1gLauAZw9eHb040UOM9LMsZ+alhKBS0R0MYXi3GwHqx5zJ2fCMxT
CqVY3Wju9aOrwwGrU4M6vT4Muhje8DFwiSnlKpQJ5djPSjBoEQxG0ubzo6B3EwGAQOqMmdteJWUtERDKgUSsxOdeKRXOyMbMo8+qfEAUGLhFRnDBwiYjihIFLRBQnDFwiojiRJHDfeust3HnnnaioqMCf/vQnKUogIoq7uE8Lczqd2Lx5M7Zu3QqtVotVq1bh+uuvx/Tp0+NdChFRXMU9cGtqarBo0SKkp6cDAG6//Xbs3LkT3/72t8f0+eNddmeO4Vy6WLZl0mugECIxay9R+xnLtmLd3kTbMp73NZRrP42X+D5NlNpi1dal+giMP3MuRSEIgnD1l8XOr3/9a/h8Pqxbtw4A8Prrr6O2thbPPvtsPMsgIoq7uI/hRiKRC/amTeTNgomIYinugZuTkwOXyzX6tsvlgsPhiHcZRERxF/fAveGGG7Bnzx709PRgcHAQ7777LpYsWRLvMoiI4i7uD82ys7Oxbt06PPjggwgGg1i5ciUWLFgQ7zKIiOIu7g/NiIhSFVeaERHFCQOXiChOGLhERHHCwCUiihMGbpx5vV7cddddaG1tBTC81LmqqgoVFRXYvHmzxNVN3IsvvojKykpUVlZi06ZNAOTXx+eeew533nknKisr8corrwCQXx8B4Kc//SmeeuopAPLr3+rVq1FZWYl77rkH99xzDw4dOhSfPgoUNwcPHhTuuusuYe7cuUJLS4swODgolJeXC83NzUIwGBQeeeQR4YMPPpC6zHH75JNPhG9+85uC3+8XAoGA8OCDDwpvvfWWrPq4b98+YdWqVUIwGBQGBweFW265RTh27Jis+igIglBTUyNcf/31wve//33ZfZ9GIhHhpptuEoLB4Oj74tVH3uHG0WuvvYZnnnlmdGVdbW0tioqKUFBQALVajaqqKuzcuVPiKsfPbrfjqaeeglarhUajwbRp09DY2CirPi5cuBB/+MMfoFar0d3djXA4jP7+fln10e12Y/PmzXj88ccByO/79MyZMwCARx55BHfffTf++Mc/xq2PDNw4+tGPfoRrr7129O3Ozk7Y7fbRtx0OB5xOpxSlxURxcTFKS0sBAI2NjXjnnXegUChk1UcA0Gg0eP7551FZWYnFixfL7uv49NNPY926dbBarQDk933a39+PxYsX45e//CV+97vf4c9//jPOnj0blz4ycCUk14186uvr8cgjj+DJJ59EQUGBLPu4du1a7NmzB+3t7WhsbJRNH19//XXk5uZi8eLFo++T2/dpWVkZNm3aBIvFgszMTKxcuRLPP/98XPoY96W99CU5buRz4MABrF27FuvXr0dlZSX2798vqz6ePn0agUAAs2fPhsFgQEVFBXbu3AmVSjX6mmTu444dO+ByuXDPPfegr68PPp8PbW1tsukfAHz22WcIBoOjv1QEQUB+fn5cvk95hyuhkpISNDQ0oKmpCeFwGNXV1Um9kU97ezueeOIJ/OxnP0NlZSUA+fWxtbUVGzZsQCAQQCAQwK5du7Bq1SrZ9PGVV15BdXU13nzzTaxduxZLly7Fyy+/LJv+AYDH48GmTZvg9/vh9Xqxbds2fPe7341LH3mHKyGdToeNGzdizZo18Pv9KC8vx7Jly6Qua9x+85vfwO/3Y+PGjaPvW7Vqlaz6WF5ejtraWixfvhwqlQoVFRWorKxEZmambPp4Mbl9n95yyy04dOgQli9fjkgkgm9961soKyuLSx+5eQ0RUZxwSIGIKE4YuEREccLAJSKKEwYuEVGcMHCJiOKEgUtEFCcMXEoajzzyCHp6eib8mn379uGuu+666vVmzpx5ybZ27dqFH/7whwCGt/nbuXMnWltbUVZWdtU2KbVx4QMljU8++SQmr5moW2+9Fbfeeqvo1yH54R0uJYV/+Zd/AQA89NBD2L9/P1avXo2qqircfffdeOONN77ymvb2drz//vtYtWoVVqxYgZtvvhm/+MUvor7uL37xC9x7772455578P777wMAtm7disceeywm/aLUwjt
cSgo/+clPsHXrVvz+97/H/fffjyeffBIVFRVwOp247777UFRUdMFrMjIy8OSTT2Ljxo2YPHkynE4nbrnlFjz44INRXXfSpEn4wQ9+gJMnT2L16tV45513ROohpQIGLiWV06dPw+/3o6KiAgCQnZ2NiooKfPTRRxeMoSoUCvzqV7/CBx98gOrqapw+fRqCIGBwcDCq6z3wwAMAgBkzZmDatGn44osvYtcZSjkcUqCkolAovrJPqSAICIVCF7zP5/Ph3nvvRV1dHebMmYMnn3wSarUa0W4dolR++SMSiUSgVvMehcaPgUtJQ6VSIT8/H2q1Gu+++y4AwOl04m9/+xtuuOGG0deEQiE0NTXB6/XiO9/5DpYuXYp9+/YhEAggEolEdc1t27YBAOrq6tDc3IySkpLYdopSCn9dU9JYtmwZ/vEf/xFbtmzBD3/4Q7zwwgsIh8N44oknsGjRotHXrF69Gs899xxuvvlm3HHHHdBqtZgxYwamT5+OpqYmaLXaMV+zpaUFy5cvh0KhwL//+78jPT1dpN5RKuD2jEREccI7XEpZL7/8Mt56661Lfuyf/umfcPfdd8e5IpI73uESEcUJH5oREcUJA5eIKE4YuEREccLAJSKKEwYuEVGc/H/38wAJadXY2gAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.displot(data=pandas_tips, x=\"total_bill\", kde=True)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZRUlEQVR4nO3de3BU9fnH8c8uu8lgE0TSBSla7aBo6SXSiyGWEsAakCQEA8WEmUQKjFKtXDoaICKMchVsIxaZ1sogCiihgkAkqZbbKFCoaEEdFEoJEMnEGMolJiS72fP7wx+pWIFNOOck6/f9+iu7yT7f5yFhP3vO7jnHY1mWJQCAcbyt3QAAoHUQAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQvtZuoLn+85/PFA5//Q9dSEiIU3V1TWu30WqYn/lNnd/u2b1ej6666htf+b2oC4Bw2DIiACQZM+eFMD/zm8qt2dkFBACGIgAAwFAEAAAYigAAAEMRAABgKEcDoKamRunp6SovLz/v/uXLlys3N9fJpQEAl+BYAOzdu1c5OTkqKys77/5//etfevbZZ51aFgAQIceOAygqKtKMGTOUn5/fdF9DQ4OmT5+u8ePHa926dU4tjQvocGV7xcbY/yuvbwjp9Kk62+sCcJZjATB79uz/ue93v/udhg0bpmuuuabFdRMS4i6nragSCMTbXrNg8Xbba865/2eO9OpEzWjC/ObO79bsrh0JvH37dlVUVGjq1KnatWtXi+tUV9cYcYRgIBCvqqozttcMBkO21jzHiV7trhlNmN/c+e2e3ev1XPCFs2sBUFxcrIMHDyozM1O1tbX69NNPNXHiRD311FNutQAA+ALXAmDu3LlNX+/atUuLFi3iyR8AWhHHAQCAoRzfAti8efP/3JeUlKSkpCSnlwYAXARbAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQUXdReFMEQ2Gjz4UCwHkEQBvl93n18MJtttZcMCHF1noAohu7gADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKEcDoKamRunp6SovL5ckrVq1Sunp6crIyNDUqVPV0NDg5PIAgItwLAD27t2rnJwclZWVSZIOHz6sJUuW6OWXX9b69esVDoe1cuVKp5YHAFyCYwFQVFSkGTNmqHPnzpKkmJgYzZgxQ3FxcfJ4POrRo4eOHz/u1PIAgEtw7HoAs2fPPu92t27d1K1bN0nSiRMntGLFCs2dO9ep5QEAl+D6BWEqKys1duxYDRs2TElJSc1+fEJCnANdtU1+v/2/HidqSnLk6mWmXxGN+c2d363ZXQ2AQ4cOaezYscrNzdXo0aNbVKO6ukbhsGVzZ21PIBCvYDBke10nakpSVdUZW+sFAvG214wmzG/u/HbP7vV6LvjC2bUAqKmp0ZgxYzRx4kQNHTrUrWUBABfg2nEAf/nLX/Tpp59q6dKlyszMVGZmphYuXOjW8gCAL3F8C2Dz5s2SpFGjRmnUqFFOLwcAiBBHAgOAoQgAADAUAQAAhnL9OAB8/QRDYUc+t9zhyvY6farO9roAPkcA4LL5fV49v
HCbvTX9Ps25/2e21gRwPnYBAYChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQBAAAGIoAAABDEQAAYCgCAAAM5WgA1NTUKD09XeXl5ZKkHTt2KCMjQ6mpqSosLHRyaQDAJTgWAHv37lVOTo7KysokSWfPnlVBQYEWL16sjRs36v3339e2bfZeRQoAEDnHAqCoqEgzZsxQ586dJUn79u3Tddddp2uvvVY+n08ZGRkqLS11ankAwCU4dk3g2bNnn3f7k08+USAQaLrduXNnVVZWNrtuQkLcZfcWLfx++389TtR0sq4TF5uPFibPLpk9v1uzu3ZR+HA4LI/H03TbsqzzbkequrpG4bBlZ2ttUiAQr2AwZHtdJ2o6UfdcoFRVnbG1brQIBOKNnV0ye367Z/d6PRd84ezap4CuvvpqVVVVNd2uqqpq2j0EAHCfawGQmJiow4cP68iRI2psbFRxcbH69u3r1vIAgC9xbRdQbGys5s2bpwcffFD19fVKSUnRoEGD3FoeAPAljgfA5s2bm75OTk7W+vXrnV4SABABjgQGAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhoooAF588UXV1NQ43QsAwEURBcBHH32kgQMH6pFHHtF7773ndE8AABdEdCDYrFmzVFNTow0bNuixxx6TZVnKyclRRkaGYmNjne4RAOCAiN8DiIuL06BBg5Senq6TJ09q5cqVGjRo0HlH+gIAokdEWwA7d+7UqlWrtHPnTg0cOFDPPPOMbr75Zh09elQjR47UgAEDnO4TAGCziALgscce08iRIzVz5kzFx//3QgXf/va3NWLECMeaAwA4J6IAWL9+vUpLSxUfH6+qqiq99tprysvLk9fr1fjx453uEYYKhsKOXBmpviGk06fqbK8LRJuIAmDmzJn67LPPNGTIEHm9Xu3Zs0fl5eWaNm2a0/3BYH6fVw8v3GZ73QUTUmyvCUSjiALg3XffVXFxsSQpISFBCxcuVGZmpqONAQCcFdGngILBoBoaGppuh0LOXFcWAOCeiLYA+vXrpzFjxigzM1Mej0fFxcVKSWEzGgCiWUQBkJ+frxUrVmjTpk3y+Xy64447lJ2d7XRvAAAHRRQA7dq1U15envLy8pzuBwDgkogC4G9/+5vmzJmjU6dOybKspvvfeecdxxoDADgrogBYsGCBpkyZop49e8rj8Vz2ouvWrdOzzz4rSerbt68mT5582TUBAM0TUQB06NBBqamptixYV1en2bNnq7S0VB06dFBOTo527Nih2267zZb6AIDIRPQx0MTERG3bZs8BOY2NjQqHw6qrq1MoFFIoFOKMogDQCiLaAti2bZuWL18uv98vv98vy7Lk8Xha9B5AXFycJkyYoDvvvFPt27fXT3/6U/3oRz9qdh0AwOWJKACef/552xb88MMP9corr2jLli2Kj4/XQw89pCVLlmjs2LERPT4hIc62Xto6vz+iX0+r14zGuk6cY8hu0dCjk0ye363ZI/rf1a1bN5WWlmr//v0aN26cNm3apPT09BYt+NZbbyk5OVkJCQmSpKysLK1cuTLiAKiurlE4bF36B6NcIBCvYND+I66dqOlE3XNP/E71W1V1xpG6dgkE4tt8j04yeX67Z/d6PRd84RzRewDPPvusXnrpJZWWlurs2bNatGiRnnnmmRY1c/PNN2vHjh2qra2VZVnavHmzfvCDH7SoFgCg5SIKgNdee01//vOf1b59e1111VUqKipqOjlcc/Xp00dpaWnKysrSkCFDFAqFdO+997aoFgCg5SLaBeTz+RQTE9N0u0OHDvL5Wr5v9t577+VJHwBaWUTP4l27dtXWrVvl8XjU0NCgJUuWqFu3bk73BgBwUEQB8Oijjyo/P18fffSRbrnlFiUmJurJJ590ujcAgIMiCoAuXbpo2bJlqqurU2Njo+LizPkoJgB8XUUUAEuXLv3K+3/1q1/Z2gwAwD0RB
cCBAweavm5oaNA//vEPJScnO9YUAMB5EQXA3Llzz7tdWVmpRx55xJGGAADuiOg4gC/r0qWLPv74Y7t7AQC4qNnvAViWpffff7/pVA4AgOjU7PcApM+PC8jPz3ekIQCAO1r0HgAAIPpFFAC5ubkXvRTkCy+8YFtDAAB3RBQA3//+93Xo0CGNGDFCfr9f69atUygUUlpamtP9AQAcElEAvPPOO1q5cqXatWsnSfr5z3+uESNGaODAgY42BwBwTkQfAz1x4oTq6+ubbn/22Wc6e/asY00BAJwX0RZAenq67r77bt1xxx2yLEslJSXKy8tzujcAgIMiCoAJEyaoZ8+e+vvf/67Y2Fg9/vjjuvXWW53uDQDgoIiPBO7SpYtuvPFGTZw4UX6/38meAAAuiCgAXnnlFU2dOlXPPfeczpw5o/vvv19FRUVO9wYAcFBEAbB8+XKtWrVKcXFxSkhI0Jo1a7Rs2TKnewMAOCiiAPB6veddBKZr165NHwkFAESniAKgY8eO2r9/f9PRwOvXr9eVV17Z4kU3b96srKws3XnnnZo1a1aL6wAAWi6iTwEVFBRowoQJOnr0qPr06aPY2FgtXry4RQseO3ZMM2bM0OrVq5WQkKB77rlH27ZtU0pKSovqAQBaJqIAOHv2rNatW6eysjI1NjbqO9/5Tos/CfTGG29o8ODBuvrqqyVJhYWFio2NbVEtoCWCobACgXhba9Y3hHT6VJ2tNQGnRRQADz30kEpKStS9e/fLXvDIkSPy+/0aN26cKioq1K9fP02cOPGy6wKR8vu8enjhNltrLpjAFiyiT0QBcNNNN2nDhg368Y9/rCuuuKLp/o4dOzZ7wcbGRr399tt68cUXdcUVV+jXv/611q5dq6ysrIgen5AQd+kf+prw+yP69bR6Tep+zu6tCrvrRRuT53dr9oj+F2zatEmlpaXn3efxeLR///5mL/jNb35TycnJ6tSpkyTpF7/4hfbt2xdxAFRX1ygctpq9brQJBOIVDIZsr+tETSfqnnuCjpZ+Jamq6oxttQKBeFvrRRuT57d7dq/Xc8EXzhEFwHvvvWdbM/3799fkyZN1+vRpfeMb39Cbb76p22+/3bb6AIDIXPRjoI8++mjT1ydOnLBlwcTERI0dO1YjR47U4MGD9a1vfUvDhg2zpTYAIHIX3QJ4//33m74eM2aM1q5da8uiw4cP1/Dhw22pBQBomYtuAViW9ZVfAwCiX8RnA73YNYEBANHnoruAwuGwTp06Jcuy1NjY2PT1OS35GCgAoG24aAAcOHBAvXv3bnrST0pKavpeSz8GCgBoGy4aAB9++KFbfQAAXBbxewAAgK8XAgAADEUAAIChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQFz0dNIDIBENhBQLxttYMBOJV3xDS6VN1ttYFziEAABv4fV49vHCbffX8PgWDIS2YkGJbTeDLWnUX0BNPPKEpU6a0ZgsAYKxWC4CdO3dq7dq1rbU8ABivVXYBnTx5UoWFhRo3blzUX3ayw5XtFRvDnjQA0adVnrmmT5+uSZMmqaKiotmPTUiIc6Cjy1OweLvtNefc/zP5/fb/epyoSV1nap6rZ/eby9HC1Lkl92Z3PQBWr16trl27Kjk5WWvWrGn246uraxQOWw501jKBQLyCwZAjtZ2oGy29nnvyi5Z+7a557k1gSaqqOmNb3WgRCMQbObdk/+xer+eCL5xdD4CNGzeqqqpKmZmZOnXqlGprazVnzhwVFBS43QoAGM31AFi6dGnT12vWrNHu3bt58geAVsCRwABgqFb9+EpWVpaysrJaswUAMBZbAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQBAAAGIoAAABDEQAAYCgCAAAMRQAAgKEIAAAwFAEAAIYiAADAUFzNHGjDgqGwI9eHrW8I6fSpOtvrIroQAEAb5vd59fDCbbbXXTAhxfaaiD7sA
gIAQxEAAGAoAgAADEUAAIChWuVN4EWLFqmkpESSlJKSovz8/NZoAwCM5voWwI4dO/TWW29p7dq1evXVV/XBBx/ojTfecLsNADCe61sAgUBAU6ZMUUxMjCSpe/fuOn78uNttAIDxXA+AG2+8senrsrIylZSU6KWXXnK7DcBoHGAGqRUPBDt48KDuu+8+5efn6/rrr4/4cQkJcc411UJ+vzP/jE7UjaZeo62u3TXP1XOkV59XBYu32153zv0/sy1YnAioaOHW7K0SAHv27NH48eNVUFCgtLS0Zj22urpG4bDlUGfNFwjEKxgMOVLbibrR0uu5J71o6dfumn6/r6leNP0bSFJV1ZnLrhEIxNtSJxrZPbvX67ngC2fXA6CiokIPPPCACgsLlZyc7PbyAID/53oALFmyRPX19Zo3b17TfdnZ2crJyXG7FQA2s/O9hXN1eF/BOa4HwLRp0zRt2jS3lwXgArtOXvfFXWCcuM45HAkMAIYiAADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChuCg8ANiow5XtFRtzeU+tX3UwnRMHxBEAAGCj2BjfZR0M98WD4L7IiQPi2AUEAIYiAADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChjDkOwI6DMwC4z86rjJ1ft1F+Xzvb60YTY54RL/fgjAvhakWAs+y6ytiXLZiQYvxzAruAAMBQBAAAGIoAAABDtUoAbNiwQYMHD1ZqaqpWrFjRGi0AgPFcfxO4srJShYWFWrNmjWJiYpSdna2kpCTdcMMNbrcCAEZzPQB27Nih3r17q2PHjpKkgQMHqrS0VL/5zW8ierzX62nx2lfFx7b4sV+XutHSq8/vc6TuOW3939bn9ykUbGd73S9qy3W/OL9dNb9KW6z75dm/qCXPfxd7jMeyLKvZFS/Dn/70J9XW1mrSpEmSpNWrV2vfvn2aOXOmm20AgPFcfw8gHA7L4/lvIlmWdd5tAIA7XA+Aq6++WlVVVU23q6qq1LlzZ7fbAADjuR4At912m3bu3KkTJ06orq5Or7/+uvr27et2GwBgPNffBO7SpYsmTZqkvLw8BYNBDR8+XD/84Q/dbgMAjOf6m8AAgLaBI4EBwFAEAAAYigAAAEMRAABgKAKgjaipqVF6errKy8slfX7KjIyMDKWmpqqwsLCVu3PWokWLlJaWprS0NM2fP1+SWfMvXLhQgwcPVlpampYuXSrJrPnPeeKJJzRlyhRJZs2fm5urtLQ0ZWZmKjMzU3v37nVvfgut7p///KeVnp5ufe9737OOHTtm1dXVWSkpKdbRo0etYDBojR492tq6dWtrt+mI7du3W3fffbdVX19vNTQ0WHl5edaGDRuMmX/Xrl1Wdna2FQwGrbq6Oqt///7W/v37jZn/nB07dlhJSUnW5MmTjfr7D4fDVp8+faxgMNh0n5vzswXQBhQVFWnGjBlNR0Tv27dP1113na699lr5fD5lZGSotLS0lbt0RiAQ0JQpUxQTEyO/36/u3burrKzMmPlvvfVWvfDCC/L5fKqurlZjY6NOnz5tzPySdPLkSRUWFmrcuHGSzPr7//e//y1JGj16tIYMGaLly5e7Oj8B0AbMnj1bP/nJT5puf/LJJwoEAk23O3furMrKytZozXE33nijbrnlFklSWVmZSkpK5PF4jJlfkvx+v55++mmlpaUpOTnZqN+/JE2fPl2TJk1Shw4dJJn193/69GklJyfrmWee0fPPP6+XX35Zx48fd21+AqANMvGEeQcPHtTo0aOVn5+va6+91rj5x48fr507d6qiokJlZWXGzL969Wp17dpVycnJTfeZ9Pffq1cvzZ8/X/Hx8erUqZOGDx+up59+2rX5XT8VBC7NtBPm7dmzR+PHj1dBQYHS0tK0e/duY+Y/dOiQGhoa9N3vflft27dXamqqSktL1a7df88H/3Wef+PGjaqqqlJmZqZOnTql2tpaffzxx8bM//bbbysYDDYFoGVZ6tatm2t//2wBt
EGJiYk6fPiwjhw5osbGRhUXF39tT5hXUVGhBx54QE8++aTS0tIkmTV/eXm5pk2bpoaGBjU0NGjTpk3Kzs42Zv6lS5equLhY69at0/jx4zVgwAA999xzxsx/5swZzZ8/X/X19aqpqdHatWv129/+1rX52QJog2JjYzVv3jw9+OCDqq+vV0pKigYNGtTabTliyZIlqq+v17x585ruy87ONmb+lJQU7du3T0OHDlW7du2UmpqqtLQ0derUyYj5v4pJf//9+/fX3r17NXToUIXDYY0cOVK9evVybX5OBgcAhmIXEAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAjjR49WidOnLjsn9m1a5fS09Mvud5NN930lbU2bdqkWbNmSfr8tMClpaUqLy9Xr169LlkTuFwcCAYjbd++3ZafuVy33367br/9dsfXAb4KWwAwztSpUyVJ99xzj3bv3q3c3FxlZGRoyJAhevXVV//nZyoqKrRlyxZlZ2crKytL/fr101NPPdXsdZ966indddddyszM1JYtWyRJa9as0X333WfLXEBzsQUA48ydO1dr1qzRsmXLNGLECOXn5ys1NVWVlZX65S9/qeuuu+68n7nqqquUn5+vefPm6frrr1dlZaX69++vvLy8Zq17zTXX6PHHH9eBAweUm5urkpIShyYEIkMAwFiHDh1SfX29UlNTJUldunRRamqq3nzzzfP2wXs8Hv3xj3/U1q1bVVxcrEOHDsmyLNXV1TVrvZycHElSjx491L17d7377rv2DQO0ALuAYCyPx/M/51m3LEuhUOi8+2pra3XXXXfpgw8+UM+ePZWfny+fz6fmnkbL6/3vf7dwOCyfj9dfaF0EAIzUrl07devWTT6fT6+//rokqbKyUn/961912223Nf1MKBTSkSNHVFNTo4kTJ2rAgAHatWuXGhoaFA6Hm7Xm2rVrJUkffPCBjh49qsTERHuHApqJlyAw0qBBgzRq1CgtXrxYs2bN0h/+8Ac1NjbqgQceUO/evZt+Jjc3VwsXLlS/fv105513KiYmRj169NANN9ygI0eOKCYmJuI1jx07pqFDh8rj8ej3v/+9Onbs6NB0QGQ4HTQAGIotAMAGzz33nDZs2PCV3xszZoyGDBnickfApbEFAACG4k1gADAUAQAAhiIAAMBQBAAAGIoAAABD/R+g2LBfFQqybgAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.histplot(data=modin_tips, x=\"total_bill\", stat='frequency')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEJCAYAAACdePCvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZRUlEQVR4nO3de3BU9fnH8c8uu8lgE0TSBSla7aBo6SXSiyGWEsAakCQEA8WEmUQKjFKtXDoaICKMchVsIxaZ1sogCiihgkAkqZbbKFCoaEEdFEoJEMnEGMolJiS72fP7wx+pWIFNOOck6/f9+iu7yT7f5yFhP3vO7jnHY1mWJQCAcbyt3QAAoHUQAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQvtZuoLn+85/PFA5//Q9dSEiIU3V1TWu30WqYn/lNnd/u2b1ej6666htf+b2oC4Bw2DIiACQZM+eFMD/zm8qt2dkFBACGIgAAwFAEAAAYigAAAEMRAABgKEcDoKamRunp6SovLz/v/uXLlys3N9fJpQEAl+BYAOzdu1c5OTkqKys77/5//etfevbZZ51aFgAQIceOAygqKtKMGTOUn5/fdF9DQ4OmT5+u8ePHa926dU4tjQvocGV7xcbY/yuvbwjp9Kk62+sCcJZjATB79uz/ue93v/udhg0bpmuuuabFdRMS4i6nragSCMTbXrNg8Xbba865/2eO9OpEzWjC/ObO79bsrh0JvH37dlVUVGjq1KnatWtXi+tUV9cYcYRgIBCvqqozttcMBkO21jzHiV7trhlNmN/c+e2e3ev1XPCFs2sBUFxcrIMHDyozM1O1tbX69NNPNXHiRD311FNutQAA+ALXAmDu3LlNX+/atUuLFi3iyR8AWhHHAQCAoRzfAti8efP/3JeUlKSkpCSnlwYAXARbAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQUXdReFMEQ2Gjz4UCwHkEQBvl93n18MJtttZcMCHF1noAohu7gADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKEcDoKamRunp6SovL5ckrVq1Sunp6crIyNDUqVPV0NDg5PIAgItwLAD27t2rnJwclZWVSZIOHz6sJUuW6OWXX9b69esVDoe1cuVKp5YHAFyCYwFQVFSkGTNmqHPnzpKkmJgYzZgxQ3FxcfJ4POrRo4eOHz/u1PIAgEtw7HoAs2fPPu92t27d1K1bN0nSiRMntGLFCs2dO9ep5QEAl+D6BWEqKys1duxYDRs2TElJSc1+fEJCnANdtU1+v/2/HidqSnLk6mWmXxGN+c2d363ZXQ2AQ4cOaezYscrNzdXo0aNbVKO6ukbhsGVzZ21PIBCvYDBke10nakpSVdUZW+sFAvG214wmzG/u/HbP7vV6LvjC2bUAqKmp0ZgxYzRx4kQNHTrUrWUBABfg2nEAf/nLX/Tpp59q6dKlyszMVGZmphYuXOjW8gCAL3F8C2Dz5s2SpFGjRmnUqFFOLwcAiBBHAgOAoQgAADAUAQAAhnL9OAB8/QRDYUc+t9zhyvY6farO9roAPkcA4L
L5fV49vHCbvTX9Ps25/2e21gRwPnYBAYChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQBAAAGIoAAABDEQAAYCgCAAAM5WgA1NTUKD09XeXl5ZKkHTt2KCMjQ6mpqSosLHRyaQDAJTgWAHv37lVOTo7KysokSWfPnlVBQYEWL16sjRs36v3339e2bfZeRQoAEDnHAqCoqEgzZsxQ586dJUn79u3Tddddp2uvvVY+n08ZGRkqLS11ankAwCU4dk3g2bNnn3f7k08+USAQaLrduXNnVVZWNrtuQkLcZfcWLfx++389TtR0sq4TF5uPFibPLpk9v1uzu3ZR+HA4LI/H03TbsqzzbkequrpG4bBlZ2ttUiAQr2AwZHtdJ2o6UfdcoFRVnbG1brQIBOKNnV0ye367Z/d6PRd84ezap4CuvvpqVVVVNd2uqqpq2j0EAHCfawGQmJiow4cP68iRI2psbFRxcbH69u3r1vIAgC9xbRdQbGys5s2bpwcffFD19fVKSUnRoEGD3FoeAPAljgfA5s2bm75OTk7W+vXrnV4SABABjgQGAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhoooAF588UXV1NQ43QsAwEURBcBHH32kgQMH6pFHHtF7773ndE8AABdEdCDYrFmzVFNTow0bNuixxx6TZVnKyclRRkaGYmNjne4RAOCAiN8DiIuL06BBg5Senq6TJ09q5cqVGjRo0HlH+gIAokdEWwA7d+7UqlWrtHPnTg0cOFDPPPOMbr75Zh09elQjR47UgAEDnO4TAGCziALgscce08iRIzVz5kzFx//3QgXf/va3NWLECMeaAwA4J6IAWL9+vUpLSxUfH6+qqiq99tprysvLk9fr1fjx453uEYYKhsKOXBmpviGk06fqbK8LRJuIAmDmzJn67LPPNGTIEHm9Xu3Zs0fl5eWaNm2a0/3BYH6fVw8v3GZ73QUTUmyvCUSjiALg3XffVXFxsSQpISFBCxcuVGZmpqONAQCcFdGngILBoBoaGppuh0LOXFcWAOCeiLYA+vXrpzFjxigzM1Mej0fFxcVKSWEzGgCiWUQBkJ+frxUrVmjTpk3y+Xy64447lJ2d7XRvAAAHRRQA7dq1U15envLy8pzuBwDgkogC4G9/+5vmzJmjU6dOybKspvvfeecdxxoDADgrogBYsGCBpkyZop49e8rj8Vz2ouvWrdOzzz4rSerbt68mT5582TUBAM0TUQB06NBBqamptixYV1en2bNnq7S0VB06dFBOTo527Nih2267zZb6AIDIRPQx0MTERG3bZs8BOY2NjQqHw6qrq1MoFFIoFOKMogDQCiLaAti2bZuWL18uv98vv98vy7Lk8Xha9B5AXFycJkyYoDvvvFPt27fXT3/6U/3oRz9qdh0AwOWJKACef/552xb88MMP9corr2jLli2Kj4/XQw89pCVLlmjs2LERPT4hIc62Xto6vz+iX0+r14zGuk6cY8hu0dCjk0ye363ZI/rf1a1bN5WWlmr//v0aN26cNm3apPT09BYt+NZbbyk5OVkJCQmSpKysLK1cuTLiAKiurlE4bF36B6NcIBCvYND+I66dqOlE3XNP/E71W1V1xpG6dgkE4tt8j04yeX67Z/d6PRd84RzRewDPPvusXnrpJZWWlurs2bNatGiRnnnmmRY1c/PNN2vHjh2qra2VZVnavHmzfvCDH7SoFgCg5SIKgNdee01//vOf1b59e1111VUqKipqOjlcc/Xp00dpaWnKysrSkCFDFAqFdO+997aoFgCg5SLaBeTz+RQTE9N0u0OHDvL5Wr5v9t577+VJHwBaWUTP4l27dtXWrVvl8XjU0NCgJUuWqFu3bk73BgBwUEQB8Oijjyo/P18fffSRbrnlFiUmJurJJ590ujcAgIMiCoAuXbpo2bJlqqurU2Njo+LizPkoJgB8XUUUAEuXLv3K+3/1q1/Z2g
wAwD0RBcCBAweavm5oaNA//vEPJScnO9YUAMB5EQXA3Llzz7tdWVmpRx55xJGGAADuiOg4gC/r0qWLPv74Y7t7AQC4qNnvAViWpffff7/pVA4AgOjU7PcApM+PC8jPz3ekIQCAO1r0HgAAIPpFFAC5ubkXvRTkCy+8YFtDAAB3RBQA3//+93Xo0CGNGDFCfr9f69atUygUUlpamtP9AQAcElEAvPPOO1q5cqXatWsnSfr5z3+uESNGaODAgY42BwBwTkQfAz1x4oTq6+ubbn/22Wc6e/asY00BAJwX0RZAenq67r77bt1xxx2yLEslJSXKy8tzujcAgIMiCoAJEyaoZ8+e+vvf/67Y2Fg9/vjjuvXWW53uDQDgoIiPBO7SpYtuvPFGTZw4UX6/38meAAAuiCgAXnnlFU2dOlXPPfeczpw5o/vvv19FRUVO9wYAcFBEAbB8+XKtWrVKcXFxSkhI0Jo1a7Rs2TKnewMAOCiiAPB6veddBKZr165NHwkFAESniAKgY8eO2r9/f9PRwOvXr9eVV17Z4kU3b96srKws3XnnnZo1a1aL6wAAWi6iTwEVFBRowoQJOnr0qPr06aPY2FgtXry4RQseO3ZMM2bM0OrVq5WQkKB77rlH27ZtU0pKSovqAQBaJqIAOHv2rNatW6eysjI1NjbqO9/5Tos/CfTGG29o8ODBuvrqqyVJhYWFio2NbVEtoCWCobACgXhba9Y3hHT6VJ2tNQGnRRQADz30kEpKStS9e/fLXvDIkSPy+/0aN26cKioq1K9fP02cOPGy6wKR8vu8enjhNltrLpjAFiyiT0QBcNNNN2nDhg368Y9/rCuuuKLp/o4dOzZ7wcbGRr399tt68cUXdcUVV+jXv/611q5dq6ysrIgen5AQd+kf+prw+yP69bR6Tep+zu6tCrvrRRuT53dr9oj+F2zatEmlpaXn3efxeLR///5mL/jNb35TycnJ6tSpkyTpF7/4hfbt2xdxAFRX1ygctpq9brQJBOIVDIZsr+tETSfqnnuCjpZ+Jamq6oxttQKBeFvrRRuT57d7dq/Xc8EXzhEFwHvvvWdbM/3799fkyZN1+vRpfeMb39Cbb76p22+/3bb6AIDIXPRjoI8++mjT1ydOnLBlwcTERI0dO1YjR47U4MGD9a1vfUvDhg2zpTYAIHIX3QJ4//33m74eM2aM1q5da8uiw4cP1/Dhw22pBQBomYtuAViW9ZVfAwCiX8RnA73YNYEBANHnoruAwuGwTp06Jcuy1NjY2PT1OS35GCgAoG24aAAcOHBAvXv3bnrST0pKavpeSz8GCgBoGy4aAB9++KFbfQAAXBbxewAAgK8XAgAADEUAAIChCAAAMBQBAACGIgAAwFAEAAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQFz0dNIDIBENhBQLxttYMBOJV3xDS6VN1ttYFziEAABv4fV49vHCbffX8PgWDIS2YkGJbTeDLWnUX0BNPPKEpU6a0ZgsAYKxWC4CdO3dq7dq1rbU8ABivVXYBnTx5UoWFhRo3blzUX3ayw5XtFRvDnjQA0adVnrmmT5+uSZMmqaKiotmPTUiIc6Cjy1OweLvtNefc/zP5/fb/epyoSV1nap6rZ/eby9HC1Lkl92Z3PQBWr16trl27Kjk5WWvWrGn246uraxQOWw501jKBQLyCwZAjtZ2oGy29nnvyi5Z+7a557k1gSaqqOmNb3WgRCMQbObdk/+xer+eCL5xdD4CNGzeqqqpKmZmZOnXqlGprazVnzhwVFBS43QoAGM31AFi6dGnT12vWrNHu3bt58geAVsCRwABgqFb9+EpWVpaysrJaswUAMBZbAABgKAIAAAxFAACAoQgAADAUAQAAhiIAAMBQBAAAGIoAAABDEQAAYCgCAAAMRQAAgKEIAAAwFAEAAIYiAADAUFzNHGjDgqGwI9eHrW8I6fSpOtvrIroQAEAb5vd59fDCbbbXXTAhxf
aaiD7sAgIAQxEAAGAoAgAADEUAAIChWuVN4EWLFqmkpESSlJKSovz8/NZoAwCM5voWwI4dO/TWW29p7dq1evXVV/XBBx/ojTfecLsNADCe61sAgUBAU6ZMUUxMjCSpe/fuOn78uNttAIDxXA+AG2+8senrsrIylZSU6KWXXnK7DcBoHGAGqRUPBDt48KDuu+8+5efn6/rrr4/4cQkJcc411UJ+vzP/jE7UjaZeo62u3TXP1XOkV59XBYu32153zv0/sy1YnAioaOHW7K0SAHv27NH48eNVUFCgtLS0Zj22urpG4bDlUGfNFwjEKxgMOVLbibrR0uu5J71o6dfumn6/r6leNP0bSFJV1ZnLrhEIxNtSJxrZPbvX67ngC2fXA6CiokIPPPCACgsLlZyc7PbyAID/53oALFmyRPX19Zo3b17TfdnZ2crJyXG7FQA2s/O9hXN1eF/BOa4HwLRp0zRt2jS3lwXgArtOXvfFXWCcuM45HAkMAIYiAADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChuCg8ANiow5XtFRtzeU+tX3UwnRMHxBEAAGCj2BjfZR0M98WD4L7IiQPi2AUEAIYiAADAUAQAABiKAAAAQxEAAGAoAgAADEUAAIChjDkOwI6DMwC4z86rjJ1ft1F+Xzvb60YTY54RL/fgjAvhakWAs+y6ytiXLZiQYvxzAruAAMBQBAAAGIoAAABDtUoAbNiwQYMHD1ZqaqpWrFjRGi0AgPFcfxO4srJShYWFWrNmjWJiYpSdna2kpCTdcMMNbrcCAEZzPQB27Nih3r17q2PHjpKkgQMHqrS0VL/5zW8ierzX62nx2lfFx7b4sV+XutHSq8/vc6TuOW3939bn9ykUbGd73S9qy3W/OL9dNb9KW6z75dm/qCXPfxd7jMeyLKvZFS/Dn/70J9XW1mrSpEmSpNWrV2vfvn2aOXOmm20AgPFcfw8gHA7L4/lvIlmWdd5tAIA7XA+Aq6++WlVVVU23q6qq1LlzZ7fbAADjuR4At912m3bu3KkTJ06orq5Or7/+uvr27et2GwBgPNffBO7SpYsmTZqkvLw8BYNBDR8+XD/84Q/dbgMAjOf6m8AAgLaBI4EBwFAEAAAYigAAAEMRAABgKAKgjaipqVF6errKy8slfX7KjIyMDKWmpqqwsLCVu3PWokWLlJaWprS0NM2fP1+SWfMvXLhQgwcPVlpampYuXSrJrPnPeeKJJzRlyhRJZs2fm5urtLQ0ZWZmKjMzU3v37nVvfgut7p///KeVnp5ufe9737OOHTtm1dXVWSkpKdbRo0etYDBojR492tq6dWtrt+mI7du3W3fffbdVX19vNTQ0WHl5edaGDRuMmX/Xrl1Wdna2FQwGrbq6Oqt///7W/v37jZn/nB07dlhJSUnW5MmTjfr7D4fDVp8+faxgMNh0n5vzswXQBhQVFWnGjBlNR0Tv27dP1113na699lr5fD5lZGSotLS0lbt0RiAQ0JQpUxQTEyO/36/u3burrKzMmPlvvfVWvfDCC/L5fKqurlZjY6NOnz5tzPySdPLkSRUWFmrcuHGSzPr7//e//y1JGj16tIYMGaLly5e7Oj8B0AbMnj1bP/nJT5puf/LJJwoEAk23O3furMrKytZozXE33nijbrnlFklSWVmZSkpK5PF4jJlfkvx+v55++mmlpaUpOTnZqN+/JE2fPl2TJk1Shw4dJJn193/69GklJyfrmWee0fPPP6+XX35Zx48fd21+AqANMvGEeQcPHtTo0aOVn5+va6+91rj5x48fr507d6qiokJlZWXGzL969Wp17dpVycnJTfeZ9Pffq1cvzZ8/X/Hx8erUqZOGDx+up59+2rX5XT8VBC7NtBPm7dmzR+PHj1dBQYHS0tK0e/duY+Y/dOiQGhoa9N3vflft27dXamqqSktL1a7df88H/3Wef+PGjaqqqlJmZqZOnTql2tpaffzxx8bM//bbbysYDDYFoGVZ6tatm2
t//2wBtEGJiYk6fPiwjhw5osbGRhUXF39tT5hXUVGhBx54QE8++aTS0tIkmTV/eXm5pk2bpoaGBjU0NGjTpk3Kzs42Zv6lS5equLhY69at0/jx4zVgwAA999xzxsx/5swZzZ8/X/X19aqpqdHatWv129/+1rX52QJog2JjYzVv3jw9+OCDqq+vV0pKigYNGtTabTliyZIlqq+v17x585ruy87ONmb+lJQU7du3T0OHDlW7du2UmpqqtLQ0derUyYj5v4pJf//9+/fX3r17NXToUIXDYY0cOVK9evVybX5OBgcAhmIXEAAYigAAAEMRAABgKAIAAAxFAACAoQgAADAUAQAjjR49WidOnLjsn9m1a5fS09Mvud5NN930lbU2bdqkWbNmSfr8tMClpaUqLy9Xr169LlkTuFwcCAYjbd++3ZafuVy33367br/9dsfXAb4KWwAwztSpUyVJ99xzj3bv3q3c3FxlZGRoyJAhevXVV//nZyoqKrRlyxZlZ2crKytL/fr101NPPdXsdZ966indddddyszM1JYtWyRJa9as0X333WfLXEBzsQUA48ydO1dr1qzRsmXLNGLECOXn5ys1NVWVlZX65S9/qeuuu+68n7nqqquUn5+vefPm6frrr1dlZaX69++vvLy8Zq17zTXX6PHHH9eBAweUm5urkpIShyYEIkMAwFiHDh1SfX29UlNTJUldunRRamqq3nzzzfP2wXs8Hv3xj3/U1q1bVVxcrEOHDsmyLNXV1TVrvZycHElSjx491L17d7377rv2DQO0ALuAYCyPx/M/51m3LEuhUOi8+2pra3XXXXfpgw8+UM+ePZWfny+fz6fmnkbL6/3vf7dwOCyfj9dfaF0EAIzUrl07devWTT6fT6+//rokqbKyUn/961912223Nf1MKBTSkSNHVFNTo4kTJ2rAgAHatWuXGhoaFA6Hm7Xm2rVrJUkffPCBjh49qsTERHuHApqJlyAw0qBBgzRq1CgtXrxYs2bN0h/+8Ac1NjbqgQceUO/evZt+Jjc3VwsXLlS/fv105513KiYmRj169NANN9ygI0eOKCYmJuI1jx07pqFDh8rj8ej3v/+9Onbs6NB0QGQ4HTQAGIotAMAGzz33nDZs2PCV3xszZoyGDBnickfApbEFAACG4k1gADAUAQAAhiIAAMBQBAAAGIoAAABD/R+g2LBfFQqybgAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.histplot(data=pandas_tips, x=\"total_bill\", stat='frequency')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAEKCAYAAAD0Luk/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABA7ElEQVR4nO29eZhU5Zn//T1bbb1vNItBQjOIWUg0TBAnV1wYGbAjKMEJ0ZBMzO991Z+jwcmko8YxE18TtN+8cUkMZoxijBI1pCVuEAhKflGwDaJ2UBBpAw3SNL1X13q25/3j1Dld1V3VXdVdVaeqz/25Li851VXnPM+pU/d9P/dzLxxjjIEgCIJwJLzdAyAIgiDsg5QAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEg7FVCdxzzz245ZZb7BwCQRCEo7FNCezduxfPPvusXZcnCIIgAIh2XHRgYAD33nsvrrvuOhw6dCijz/b3B6Hrzshvq6kpRW9vwO5h2IKT5w7Q/Gn+2Zs/z3OoqipJ+XdblMAdd9yBm2++GZ2dnRl/dqzJTEVqakrtHoJtOHnuAM2f5p+f+eddCfzud7/DjBkzsGTJErS0tGT8+d7egGNWAnV1ZejuHrJ7GLbg5LkDNH+af/bmz/PcmAol70rgpZdeQnd3N1atWoXBwUGEQiH8+Mc/xm233ZbvoRAEQTievCuBTZs2Wf9uaWnBG2+8QQqAIAjCJihPgCAIwsHYsjFssnr1aqxevdrOIRAEQRQ0be092N7agZ7BCGorPFi+eDYWNtRm7fy2KgGCIAgiNW3tPXhy52EIAg+fR8RAUMaTOw8DQNYUAbmDCIIgCpTtrR0QBB5uSQDHcXBLAgSBx/bWjqxdg5QAQRBEgdIzGIFLTBTTLpFHz2Aka9cgJUAQBFGg1FZ4IKt6wmuyqqO2wpO1a5ASIAiCKFCWL54NTdMRVTQwxhBVNGiajuWLZ2ftGrQxTBAEUaCYm78UHUQQBOFQFjbUZlXoj4SUQBbJdTwvQRBEtiElkCXyEc9LEASRbWhjOEvkI56XIAgi25ASyBL5iOclCILINqQEskQ+4nkJgiCyDSmBLJGPeF6CIIhsQxvDWSIf8bwEQRDZhpRAFsl1PC9BEES2IXcQQRCEg6GVADHloSQ+gkgNKQFiSkNJfAQxNuQOIqY0lMRHEGNDSoCY0lASH0GMjS3uoPvvvx9//OMfwXEc1qxZg29+85t2DINwALUVHgwEZbglwXqNkviIfFLoe1J5Xwm88cYbeP311/Hcc8/h97//PX7zm9/gww8/zPcwCIdASXyEnZh7UgNBOWFPqq29x+6hWeRdCXz+85/H448/DlEU0dvbC03T4PP58j0MwiEsbKjF1ZfMR2WJC6GIisoSF66+ZH5BWWLE1KUY9qRscQdJkoQHHngAjz76KJYvX476+vq0P1tTU5rDkRUedXVldg/BNrI196V1ZVh63sezcq584uTvHpga8+8LyCjziuA4znpNFDj0B+Rx55ev+XOMMZaXKyUhHA7juuuuw6WXXoqvfOUraX2mtzcAXbdtyHmlrq4M3d1Ddg/DFpw8d4DmP1Xm37x5/6g9
qaiiobLEhaarzk35uWzOn+e5MY3nvLuD2tvbcfDgQQCA1+vFsmXL8P777+d7GARBEDmnGPak8q4ETpw4gdtvvx2yLEOWZezatQuf+9zn8j0MgiCInFMMe1J53xO44IIL0NbWhssvvxyCIGDZsmVobGzM9zAIgiDyQqEXlrRlY/jGG2/EjTfeaMelCYIgiDgoY5ggCMLBkBIgCIJwMFRFlCBySKGXDCAIUgIEkSOojDVRDJA7iCByRDGUDCAIWgkQRI7oGYzA50n8iVEZ60TIXWY/tBIgiBxRW+GBrOoJr1EZ62GKocKmEyAlQBA5ohhKBtgJucsKA3IHEUSOMN0a5O5IDrnLCgNSAgSRQwq9ZICdUNe3woCUAEGANijtYPni2Xhy52FEYawAZFUnd5kNkBIgHMFYQp7i+e2B3GWFASkBYsoznpCP36AEALckIApDOJFAyi3kLrMfig4ipjzjRaH0DEbgEhN/CrRBSTgFWgkQU57xolCcvEFJeyEErQSIKc94SVtOjeenZC0CICVAOIDxhHwxtADMBZSsRQDkDioYki3Ll9aV2T2sKUE6UShO3KCkZC0CICVQEKSKXqmo8OHMWp/dw5sSOFHIj4eT90KIYWxxB/385z9HY2MjGhsb0dzcbMcQCopUy/KW3UfsHhoxhXHqXgiRSN6VwJ49e/Dqq6/i2WefxdatW/Huu+9i586d+R5GQZEqRPF0X8imERFOwKl7IUQieXcH1dXV4ZZbboHL5QIANDQ04OTJk/keRkGRalk+rZpcQVOVQgnNJDcZwTHGmF0XP3r0KL761a/it7/9LebMmWPXMGxn38Eu/LKlDaJouIKiigZVZbh29UIsOrve7uHZzr6DXWjZfQRdfSHUV/uw+sJ5RX1f6PsmCgnblMAHH3yAa6+9FjfeeCOuuOKKtD/X2xuArtumt3JG0uig8z6O7u4hu4dmC3V1ZejuHkrYNI8vMlbMbovmzftHrfyiiobKEhearjoXwPD8nQrNP3vz53kONTWlKf9uS3TQm2++iZtuugm33XYbGhsb7RhCwUHL8uRMxbo+FJpJFBJ5VwKdnZ244YYbcO+992LJkiX5vjxRZExFgUmhmUQhkffooEceeQTRaBR33303Vq1ahVWrVuG3v/1tvodBFAlTsU8vhWYShUTeVwK33347br/99nxflihSMmk8UigRN+NBdfSJQoIyhomCJl2BWWyNYWgPiCgUSAkQGZNvizsdgTkVN5AzpVhWQkRhQUqAyIhCtbin4gZyJhTq90IUPqQEHE6m1mOhWtxOj7gp1O+FKHyon4CDmUhTkUJtxej0iJtC/V6IwodWAg5mItZjoVrcuY64KXR/e6F+L0ThQ0rAwUzEj55JyGa+yVXEjV3+9kwUTyF/L0RhQ0rAwUzEenRijLsd/vZ9B7syUjxO/F6I7EBKwMFM1Hp0Qox7vBU+GIiisswNxCnLXPvbW3YfyVjxOOF7IbIPKQEHQ9Zjcka6f/xBGX3+CDiOg9dt/GRy7W/v6gvBI9FGL5F7SAk4nMlYj4W+WTpRRrp/Kkpd6PNHMTAUhccl5MXfXl/tQ3d/iDZ6iZxDIaLEhDCt5a7+MIZCMj44MYAHWw7guVc/tHtok2ZkuKXPI6GqzAXGkLc2jKsvnOfokFcif9BKgJgQ21s7oGoMQyEZAAeB56HpDC++3oE5M8qxsKG2aFcKyTbMRVFAwyyv1fQl1yw6ux5XXzK/KO8fUVyQEihi7BSyPYMRhCIKAA48Z7zGc4CmM2xv7QCAoi1jkOmGea6+B9roJfIBKYEixe5aMbUVHvQPRSDww24TBkASjM3LiYRVxgvTGXWlWHrOTNuarwPpbZjb/T0QxGQhJVCk2F0rZvni2Wj/yA9NZ+A5QwEwAF63gNoKT0IiWiiiwB9SoKo6egciaGvvGbcUdL8/bKswTdcKt/t7IIjJQkqgSMk02zfbLouFDbVoXDIbL77eAU1nkAQeXrcASeSxfPFsbG/t
wEBQhqbp6BuKIuYxAschqXAfKUw9kgBVY7YJ03Tvl9OrlxLFDymBIiWTbN9cuSxWfmEu5swoTyksn9x5GIMBOaYAOHAcg9ctYDAg48GWv6FhVoX1/okI01z54jO5X9ms2VMo7jDCWZASKFIy2bzMtssiHeFrHj/Y8jcwxiCJxvUDERUcAMZYgnDNVJjm0hefyf3KVs2eQnOHEc6B8gSKlIUNtbj6kvmoLHGNG7uezTLDmZSfXthQi4ZZFair8qG+2oeIolurAkkU4JYECAKP7a0do0pBR2R1TGEaL6g5jks412TJ5H5l8j2Mxcj5eFxi1uZDEGNh20ogEAhg7dq1eOihh3DGGWfYNYyiJt3Ny2y6LDJdVcRbyqqqAwA4jqG8xA1gWLiOjMgx3SEA0Lx5/6hVRy598Zner2yEcprzCUdV+IMyVI1B4IFQWJnUeQliPGxRAu+88w5uv/12HD161I7LO45slhnOVPjGC/fegQg4DqgscyetwRMvTOvqyrDr9b9bLhKOAz486ccDW9ows8YHj2TMIxdlFewoy1xb4bGyr43kO0DVGHSmJY2mIohsYYs76JlnnsEPfvADTJs2zY7LO45suSwAQ1jJMYveZDzhe7TTj46uABRNh6LqCEWUtEohbG/tgKLq6B2MoHsgAiV23dMDEfjDKkJhJSdlFbJ5v9Jl+eLZCIQUMAZwYNB1ABxQ4hHJJUTkFI4xxuy6+MUXX4zHH3+c3EFFxL6DXfhlSxtE0fDDRxUNqspw7eqFWHR2/aj3/3bHITy98zDAAQIHKBoDY0CJV8LcWRVYfeG8pJ8DgK/9YBuCYQWqlviI8hwws64EoiCgrMSF030hTKv2jXmuYuBrP9iGqKxB1XRIIo/KUjd8HhGBsIpfff8Su4dHTFGKLjqotzcAXbdNb+WVuroydHcP2T2MBM6s9WHt0nmjooPOrPUlHevW3e0AAIEztoQlgYOmMzCd4eY1CwEg6efq6sqgKDpME8XMMzCT0niOQyAk445vLEr4XKHdr0yYUe2z9iIkkYei6ghGVFSVuop6XhOhEJ/9fJLN+fM8h5qa0pR/LzolQNhPJhuhEVmFYBYXisFzxuvjIQocogozQkrjXucwNcsqx+9FGHOnyqFE7iElQEyasfIGPC4RUUWDEKcHdGa8Ph4za0vQ1R9GIKxAUY3wUp6HUbF0CgrH+E30/oCMqlLXqByMYq3MShQupAQIi4kImPgkJ4DhgxMDONQxAIHnMKPGh880VKP10GlourEC0BnAwLDsH8ffBzIt45oKDzRNx2BAhqrpmFbpwaIF07C9tQNP7DicU2GYb6FrrrKSuQOoWF1xUuiK21Yl8PLLL9t5eSKOiQiYtvYe/M9z7yGiaOA5JGzgajpDZ28Q/pCCxQum4Z32PkRkFR6XiGX/eAZWfmHuuGMamTswd2a5Zf3nWhi2tfdgy+52nOwJQhR4lJdItgtdKlZXfBSD4qaVgEMZaZ0EQnJGAsZ8uKOKCoHjEhSA6cPnOB6RqIrjpwOYXV9qXWvOjPK0x5ls/6F58/6cCkNzboMBGRzHQWcM/QEZ1WVuK4vXjh8wFasrPopBcZMScCDJrJPTfSFUl7sTkq/GEjDmwy2JRrVPFrd1aygAw/2jqDpO9oYwjSFrllAmwnAiS3Fzbjpj1qa2zhj8IQX1VV7bhG66mcyF7n5wEsWguEkJFCht7T3YtaUNnd2BrP+QzSQss8a/KBoZuf6gghKvy3rfWBE45sNdXuJCnz8yKoJH4DnoDAAHiFm2hDIRhhNZiptzEwUeqmb0S+BglL2wMyopnUzmybgfnnv1Q+z464mM3XZEarJZsiVXkBIoQMwfstsl5MSP+FFPEKGoUc2T5wBN08FgPJxRRUsqYEZal0zXcao3ZDSV4Y0y0WZMv8Aj9m8jw7ei1JVw/claQumWdZjoUtz84ZoKTmdcbF6crVFJ6XQ8m+icn3v1Qzy35yg4cBB4Izz1uT1HAYAUwSSwowRJ
ppASKEDMH7LHJUKJ1cfJph9R0xjAjCQSIObDZ0Yilz8gJ1iCZsP4eOvyVF8Ig0EZgJG0xRgDx3HwShx8Hgn+oAKAob66BACgjkjuG2kJJXNfLK0rSzn+dNs/TnQpbv5wBYFHVZk7FpXEML3GizUXzbPVtTJejsZE57zjrycsBQAY2d2abrxOSmDiZNKq1C5ICRQgufYjmklYOuOssE2dMeiMoabUA58qwB9U8NxrR7Hv/W4ASLAuw7IGnuPA84bQUDUdAs+hpsKLO7+1OOFa1gYykltCqdwXFRU+nFnrSzmHdBLWJroUTxWVVEg/3FRMdM6TSeojxiYbVWZzCSmBAsT8Ibty5Ec0k7DCURWqpkMUeDBwEDjD3dEf6wbGmOE6YsxQHNXlHnjdIlRVN/oKs2FrnzGGUGRYYMRb9x6JBzgOoYg6yhJK5b5o2X3EKisxUSazFC/0H24qJjrnyST1EcUNfcMFiPlDjsgqeI6btB9xpLtlwexK9PojqCxzW4LidF8I5eUu+EOK1fmLAdZur6oxnO4Pg+cNF5DOACmu8Uq8khpp3cuqDk3V8LVloytxplr1nO4LTWiu8RTDUjzbTHTOy/7xDDy35+iEkvqI4oaUQAFi/mB3vXUyaXRQJiGApkBWVB3hqIa+oSjaT/qxaH4t+gOydQ6RA1QGqKpsbBanqNGn64Ae0wwe3tgPGKmkRlr3ZrbvyL7CQGr3xbTqRFfQRMMei9WinwwTmbPp96foIOdhaynpieD0KqLxVnb8cj9Vvfvmzftxqi+EobBh4XMwrDye43DD6k8lKBYzQUpnbFT55pEIPAee4+BxC6gocQGMIaIYq4GPeoKoKnOD4ziEIgr6hqLW6qKuypcw3lTz+d9rPmvtCZjvUTWGUESBqungOR6NS2bnRUjZEXdvdxVNu3MN7J6/3eSziij1GC4yMu2t2zMYQTiqxcJBOXAcF4vh1xM+YzZSmVbpga4zcFzS0yVQU+lBRYnL6CkQlwwWiWrwx6KHTPdSsr7C8dcd2cAlvi/A9tYOqBrDUEiGpscKyOk6/vDaUax/4C9o3rw/aY/jbJBJT+WpghPn7GTSdgcNDg5CEASUlqbWKIRBLq2o8SKHRl7b4xJiljiDorNY5ypAELhR0UamG6GtvQdbXjmCEz3J/fIcAFE0LPfOniBqKr3QNB1dQ1Goqm4knoUMJRCVtdhnGEpjiWgjI53SCXsMRRQAZjQTg7kYjCpaTuuxFEPaf7Zx4pydzLhK4MMPP8R3v/tdHDx4EBzH4ZxzzkFzczNmzpyZj/EVHbkuGDVWCGC822QoGE0ZUspgFHjzuISkfzeF8nOvfojn9xyDNtL9xgHlPinWZpKDqmpWRBHPGefWGTAQkK2P8DyHQFiBSxLA81xGkU4el4CeQbNJPazNai52rVwKqUJO+8+VsVHIcyayz7hK4NZbb8WVV16JL3/5y2CM4emnn8b3v/99bNq0KR/jKzpybUWNFQJouk0GA1GMtW0i8ADAAeNsB638wlzMmVGOLbvb0RnLDhZ4oCpWSE3TdNRXeXB6IGK5mzRdT7i2wBtJR8bWE4eBoSgqYnXy06GtvcdyLQGJQ+ZglKQAhoVUtgVjJiUqtrxyBF39ERiJcj6subAhZ5ZzLo2NYih1QGSPcZVAOBzG2rVrreN169bhmWeeyemgiplcW1FjhQA+seNwrIm78d74lozx8DyPihIJEUXHeMS7auIFbGXJsCB/YEsbOM4osKOPOCUXM911BkuCp9rEjj//jLpSLD1nJra3dsDnlazaRvFz4XkO5SWGi0lWdXhcAh598SAisgZNZ/AHZTz64kFc03h22oIxWTjtawdOjVuv59EXDyIYUcHFdGtnbxCPvnQI11y6ICeKIJfGRjGUOiCyx7hKYO7cudi/fz/OPfdcAMDhw4epMfwYpGNFTdZajffdDzdW6YDHJaB/SE8QlCMVgChwVvTP9GpvRnNL5bufWVuC0/1h6LHcAoE3lAED
RkUZaTrDb7YfwrrlicJxpGV7snsIDz57AKqmwyUKKC+RUFclwh+UEZU1MABlPhc8LsFqw6jIuiWIzb2DYETFlleOjBp3su8AGN2n4LUDp/BPn5qOQx0DY9bricha7LoxZcg4RKJqRkI5mRJM9dlcGhtOzK9wMuMqgZMnT2LdunU466yzIIoi3nvvPdTV1eGyyy4DADz//PM5H2QxMZ4Vla1lfLLzhMLG5qkp+kcqgGGXjSEcF8yuRPPm/ZP+oa+5sMEaS78/ClXTwfEAS+GT6h2S8bPf/w0lHhEza0ssV5Zp2fb7IxgKDVv9sqKhb0hHdZkb9dU+RBUNIgeU+lwJY3+w5cCwII7NVweLuWjGvnePvnQIUdnIoJZEAeUlLnjdIqIADnUMoOmqc1POv2cwYhTSi4uoMvZG9LSF8sgx9fvDYz4XuXbZODG/wqmMqwT+8z//Mx/jmDKMZ0VteeUIBgMyNJ1BFHmU+6QJNSpJ5g4AjCze/kB0lFuG52DVyBcFI0z0tQOnrNj7/qEI2j/yTyj2Pn7OwbACXWco9UkYCERTbjtoOkuI7InIGqrK3AhFFPhDSsJ7GQBdY/AHFWsvYm1Sl1KskmmcMGYM4LjEQYy8d7rOEAwb4aeSYDTI6fNHUF3uiW1Kjy3Iays88AeN/Arz0jozQlnTFcojx+SRjD4NqZ4LctkQ2SKlEmhvb0dDQwNKSkqS/v2Tn/xkzgZV7KSyotrae3CyNwSe56wSzn1DUVSVujJexqdyB6iqjpu+vNDapFQ1HbzAoSZW9wcwwir9scqYQyEZABeLvWd48fUOzJlRnrEVmGzvIBCSocTcQcn2J8zIHr+sIRhWEYyoMHMXh7uTDZcxUFTN2otINr76ah86e4MJhfHiq5mmunfGxrNxRQbzehz8QRk87x5XkC9fPNvaE9BNRQQGjyf9DfBM3TvksiGyRUol0NzcjF/+8pe48sorMWPGDMQnFofDYezdu3fCF33++eexceNGqKqKb3zjG7j66qsnfK5iYntrB0SBh84M4WZk7xoW7tyZwy0XTSF6qi8MWdEgCBxmxdwmqcothCIKBgMyGDOsR7Pkselm4EeUeBAELiH2HhgO75zo5uJIP3vjkjPx4t4OKJqe1DUlCjzCURX+YBQMAItbvZjvF3jOUgjzz6gY0y2z5sIGPPrSIUSiKjRdh8Dz8HhcWHNhQ8L7Rt47VTNyG0SBM1YdMYteSdO6XthQi2saz7YUL8cZiieT6KCJuHfIZUNkg5RK4J577sHAwAAaGhrwm9/8xqoZrygKvva1r034gl1dXbj33nvR0tICl8uFtWvXYvHixZg3b96Ez1ks9AxGUFHqQv9Q1LJWGQNUffSegaox+IPDoZ6HOgZwqGMAZ9QZwiXeHaCqGvqHZAAM1eUey8dd7hURUfSkVTy3t3bggxMDEPjhpHEGQBImtrnY1t6TIID9QRkne0NYdFYt3jjUPSrXgItF9viDsmE5s1jTlrj38dzwHobAc+kJ40sXjGsdj3SlMMaswmmCEDtmgEcSU0YyJbv2ZATyyDFFZLVg3Dt2l5AgckvKshHf+c53cN555+HIkSNYsmQJzj//fCxZsgQXXXQRPvWpT034gnv27MF5552HyspK+Hw+/Mu//Au2b9+e9udfffXPAABVVdHUtB4vv7wTABCJRNDUtB5//vPLAIBgMICmpvV47bX/A8DIeG5qWo/XX98DAOjr60NT03rs2/cGAKC7+zSamtbjrbfeBAB0dp5EU9N6tLW9DQA4caIDTU3r8d57BwAAR4/+HU1N6/H++4cAAO3tR9DUtB7t7UcAAO+/fwhNTetx9OjfAQDvvXcAx1sfhRLqRXW5B8rgMZx68zHo0QHMrC2B5j+Gpqb12PryOxAEHt0nDqL7rcehyQFjfj2H0fv24zjR2Y0ndx7GgbdbEXj3SZRKGgYDCiLd76K/7Qm4RQZdZzj9931o2/UQPC4OKgM62/+K4HtPoumqc7GwoRZV
6mF0v/0ENJ2BMYbB42/g9NtPwusWUFvhwdatW/Df//19675v2fI07rrrDuv4mWc2Y8OGO63jB37xPzj+5tOWL7y//RUc378Fx7uDuPHLnwY79Rf4P3jRELQ8h/CxXej82/NQVA06AwaP/BGDR/4IMVbLePCDbRg4stNYuegMQ0dexLY/PGld76c/vQePP/6oddzc/CNs3vw4FjbUoumqc8GdeBHHDuzCEzsOo3nzfnz3lluwZcvTAAyBrbS3YKjjdfQPRQFw6PvbZgRP7jOqpzKgv+0JfKbmpCXsmprWY+fO7Tl79p745V34/MeM0NuB3tM48pdfYckcBQsbarPy7DU1rceJE0apjra2t9HUtB6dnScBAG+99Saamtaju/s0AGDfvjfQ1LQefX19aGvvwcbf/AF/3fZzSFwUA0EZv/j1s7jhxhsQDBrP5p///DKamtYjEjGMh5df3ommpvVQVaO0+M6d29HUtN76rrZtewG33vod6/iFF7biv/7re9bx1q1bcPPNN6f97G3e/Diam39kHT/++KP46U/vsY43bXoY99//E+v44Yc34sEH77OOH3ro53jooZ9bxw8+eB8efnijdXz//T/Bpk0PW8epnj2TDRvuxDPPbLaO77rrDuvZA4D//u/vY+vWLdbxf/3X9/DCC1ut41tv/Q6effZZ63iyz178vUpGypXAI488EhvQrdiwYcOYJ8mE06dPo66uzjqeNm0a2tra0v58SYkbdXVlUFUVkiSgrMyDuroyRCISJElAebkXdXVl8Hq5hGNJ0iBJAioqjGOOiyYca1ow4TgaLYEkCais9KGurgzBYOLx4KAPkiSgqso47utLPD59OvG4stKHuiofwACXxKOu0oOgxKOy1I1vrfo0NP9RSJKAgaCCmhoRmpY8hl9nQJ8/ip3HTqBMFPD/XP8FfPv+vfBGPTh1moMk8ugdUmLx+YBLEuDhBQyIHPoDMupiHbvOnluLfdU+cPywpS2JRjezryxbgA/ePgmXS7DeX1rqhsslWsclJW643cPHgZhryWxMYtYe6uoPY+l5H8fBfXMwODiI739/FfYd7MKGe/6CYDAMj1tCKKxYXc4EngeL8wu5JQFV5W6c6OBx8Fg/jvWEsOjseng8kvUsAEg43newCx1dAZRWl2NmqQuBiILj3QHMD8o41hNCy+4j+KgnhBln8qiu8KKiVINfEmKrIg6iYNyLs+fWWuePf9Zy9eydc/Z0XLd4MU6cOIE773wF535iRtaePeO4JOG4uto4rqjwJj2uqSnBr19+H5LIQ+A5uEQBkkeCX+TRF5BRW1uG0tJSlJcb76+rK4PH40FZmcc6FkUx4RgAyso8Cc9SaenoYwBpP3slJW54PFLCcSAwfOzzuaAoroRjQWAJx/HX83pdcLtdCcc+nyvps5bs2O0WE45dLhGlpfHHAkpLPSP+nngcP55sPHtjkfcqohs3bkQ0GsX69esBAM888wwOHDiAO+8cW1uZFHsV0fGW1s2b92MgKKOrNzTKj25iZuHynBGjDxgtHE1/8kfdhoUmCDymx0oym01fmq8/P6PxJCNZQ/IX93aAgcV6DbDYCsPw/X/7yoVjlrp+sOUAdGaEWMY8Q1Y464za4U1df1BGKKLEvn8O9VWepO0ezXsY71+PKhrEWO/c+Iqlp/tCqKnwwOeRrPemulf5pFCqaDZt3AOfR7SMCiA/96dQ5m8X+awimvd+AtOnT8e+ffus4+7ubkybNi3fw7CN8XzHyxfPxqMvHUqpAABDAXAwsnFP94eNGkCm5S/ylh+93Dcs2FJtMmbqyzYbkoMZq5JQVMXWV4/CLfFQNUBjOuIXMTyPMePdFzbUonHJbLz4egc0nUESeHjdAvwhBVVlww3qw1HVCjkVYo3tO/vCSTOCU0XamMXu4sNqRYHHYEBOUAJUImEYKiEx9cl7Kenzzz8fe/fuRV9fH8LhMHbs2IEvfvGL+R5GYROL5R8LnkesJDSDzytBEjj0
+yPo6ApAVnXoupEVzBizMmqzscm4468nLAUADId+RhUdLonHSC9WqVcas9Q1YNQouuGKT2H+GRUo80mYXu3D7PoyCMLw42luIHMY7mXAcUBE1kadu7bCEytuN4xZ7M4lJj7y5SUSVE1HVNGyfq+mAssXz4ZG92dKk/eVQH19PW6++WZ8/etfh6IoWLNmDRYunFwv2bEopsiGtvYe/M9z7yGiaJBEHmVeASFZg6xoCUlXxuaqEWoqxpKnev1RCLzhz2bMCPXsH4piIBCFN+ayyca8I7I6SgGY+NwiwtFY6ehYCGwwosIljZ9wNXJF8qe3PsLv/vQBdGb0QDaFelwwEzgY8xx57lSJVPVVhnKIt2pFUcDMGt+o7ONCfUbyTSHlIxTTb7mYsKW95GWXXWaVncgluS7rPNZ1M31YzbFGFdVq+D4U1lFV5obX7cXAUBRul2B1/jITocpL3BgYihpC1wypjNtYZXEum22tHVixOHVGcDrj9rhEhKLqKAVgFHiT4RL5mH/feIfOgMGAnJAHkc79e/mvx1HiNZSKohmFiAwXWGJIq5CkLHUqwQUgqXJInn1MmBRCPoJdv2UnMKV7DNvRHCN5T98Do3r6JitCJgg8JFGIFV0z3Dk9AxFIIo9pVV6subABW145gpO9IYgCh4pSN3ieM5KdMGyZjywZYRJVdGx99Si6+kL4v1Ymhvmm+yNb9o9nYOurR0ftWXhdAhSVobxEQn8gsYSCqrGM3AfbW40EM6MaqA6XyEMUOISjGhjToTHOqtbpcQtpnzubVi1ZpfmFGt3kjimtBOxojrG9tQOKqls9fYVYeYi9751GiUeEqulJ6/SYYy0vMdwSZgQUg+HPPtEdROu7p3Dn/zrPis7pHYzA4xJRVebGUEixBO94sVN73zuNDz7ag68tm58gGFP9yMy/W+WVP1aBQ8cHrfOZ2b/VZRJUBlSXueEPKVBVHQLPYXqNUa003WJ1J3uCsWqghu9f1RgUVYNbElBd7kFXXwgAh+nVyaODxlNokxUaZJXmH2p0kzumtBKwI7JhZE9fwKxIA4QiKgSBT1qnxxyr1y0aG75JwmD3vncawAEcOelHeakLtaLh4w6FFQgCB01hGF2gIfU47/tdG1wij0vPm53yR3ayJ2gJPIDhw5N+yKoOnjOyfsGYEV/PcVi0YBpeO3AKgsCjvspruVsWLZiWkdBUNcP3k1DOghlrnTu/tTjpfOIt81BEhUvirYifbFuNZJXmH4pSyh1TutG8HZENtRUeKDH3jIm5qWsWJwMS6/TEj9UflKGoKfw5MBTByEbzPq+EmjI3ZlR7x2sWNgpZNVxEA0MRq4NXOKqiqy+Ej7oDCIRVq4aO6eYBDF8/B6CmwosZtSXweUQc6hhI2jT+UMfAqDGPFTEkCJxVV4kxZlyTM15PxsjG6FFFxVBYidVGMsim1dgzGBkVZURWafZoa+/BbRtfQ9PGPWjevB9t7T0UpZRDprQSWNhQm1Qo5dJaW754dix00zg2hSUwnEULjK7Ts7ChFv/0qekIhBPLKCdDVbWEY5fII6LoWHPRPFSXu1FV5h61cTvuOXWjJ/CxU0M43R82FBFnFJ0bCivoH4omrG5iM7IUhykEzbINX1s2HwDwxI7DaP9ocFQG9FhCc1ZtCSpKXbHS10bSW5lXwqza5BVt4y1zjuMgiQLAkFCSOptWY6oQVLJKJ4+p0Pv94VGrxon8ltvae9C8eX+CQiESmdLuICD/kQ0LG2rReN5svLi3A5puhDd6XCICYRVgRralmRVr1ukBjId1x19PQNd1SCI/5mrAH1RQ4h1OpDIFkCkMfR7B6CswFB3zPGPBGENNuQdDIQWKqkPVGSRhuFmLuapRY8I9XgiO9Jn7gzL6/BFwHGeVsx5LaC5fPBtP7TqCqjIhrVr5I11Z5SUu9A4aiiy+cmq2rEaq5Z87zGfY4xKhxMJ5TVebWfcqXWjvJj2mvBLIFplEg5gN2uPfX1Xqwr7DPQlZ
sZLIY/ni2dbDGlE0CBwXa4SSvA+8zy0gImuIKtooAfTEjsOWMPS6RXjdIkIRBT0DkbR2CuI3lRkAn0eCrGiIyMbKQ9UYOI5Z4aiabiS1jVyaj/SZV5S60OePYmAoCo9LGFdoLmyoRUWFD0/vOJTW/R7pL/a6RZSXuBGVtYTKqSPbWU40uqeQYuenGtncAKa9m/QgJZAGE7Eokq1A6uNq7gg8hws/a/SQbd68PxYeaiR+8RwA3vh3QmN1zhDE1eUeVJYMJzctmF2J7a0dGAxE4Q/KqCh1WZuigsDjrNmVWL54thVemqr0UkJvYgYc7xqy3Fnm6oUxoNwngec5BCMq3JIwqtHLyB+yzyMZlUoDSkqhPJJFZ9fjzFpfyr/Hk8wyFwUO31j5iZTNfSZrIRZC7PxUxFTorixsAFNEUXqQEkiDbFgUbe09eO3AqYSontcOnMKcGeXD4aE+CX1D0YQYe8sFzwwBLKtG39pgWMaZ08uxYHalFZFTWeZGnz+CPn8UjDGIomBZ3PFC67lXP8S21g5EldGuouGOXIkb2pWlLkSiGiKKZvnaF3ysAk1Xf27UOZJFcoiigIZZXquXwRM7DqO2oiMrFnSmlnk2LUTKF8gupkKPyCp4jpuUq40iitKDlEAaZMOiGEvwmA8rYFjdaqwlo9sloNQjYiAQhTbCeg/LOjq6hvD+8QFwMMrNlvsk1FQY2cWDAcUSuiOF0sovzMXKL8xFW3sPfvPH99HrjwIYrk4KJCoDgediPXQTx3Do+CCu//9ewfWXfzrhGql85gtmV+bMR5uJZZ4tCzFZI51HXzqEay5dQIpggpj3bddbJ9HZHZiUYqW9m/QgJZAG2bAoxhI8X1s23+pRy8WargAMPrexOSYKAjhOTyjPzACEYnV6GIb7FVeXuTG9xodQRE1oxZjKYv1//3dtQmloDkCpT0R1uRen+kIx9xQHWU3uQ4oqDA/8vg0rz59jJb6lsswnaoFn29rOloW4ZXc7gmEZHMdbtZyCYRlbdreTEpgECxtqsfS8j0+6lDLt3aQHKYE0SNeiGEtYjSV4FjbUorzEhYisQWcMksijvMQFl8SjbzAKRdMhxDaKR2YEx28gayPKTMSPaywL3FwZAMO1+AFY7qmRrSFHouvAi3sTG9Qns8zjN65NxrPAcxHhkS0L0cxcHpnUZrxOFAK0dzM+UzpPIFukk28wMmHJFFZmXPJ4yS4RWcP0Gh9m1ZWizCdhMCjjVG8IUUUDwKzm9AkKAMNx+6YLKb7MxB2PtFqKKd1krfhxet0iyryS1fVrLHSmj1kuGphYfH0mY0+X7OWPcAm5H4C5h5NplgZB2AetBNJkPItiPFfHeEtTc6VgunUsMcIB0AEtLl403l9vuo90nVkKgueGG848ufMworIGl2TkDaiqDlHkUeYVk1rgI8c5vdqH5Ytno/XdU7GyFaMxaweN51OfiAWeqwiPbFiI9VUedPaFoWO4ZhNjwPRq2ngkigdSAlkiHWE1luAxBeRgQIaZkmUmlnEcB4k3/q9oRjIZxwGaBpT6JPiDUQiCUWiN5wyBDBhlFwSBR0SWrf0GPlbQrn9Ixoxqb9KxJBvnwoZa9A+9mVA4zoTjjDDQkRZ9MvfY1ZfMz8hHW8gRHmsumodHXzwYq3Zq9Gn2uAWsuWie3UMjiLQhJZAlJiusTEH4YMvfwGL7Ajoz/O1czB10Rl0JooqGyhIXmq461xKygZAMjuPAgVmN3s2GMy6Rj/n0GQA+bg9Bxyhfxjg0Xf05tLX34Ikdhy3lJvJAWYkbosAlWPSpfPlXXzI/YcN6PAo5wmNhQy2uaTybNh6JooaUQJbIhrBa2FCLhlkVljL5qDtouX7EWMGy+NWFabGbAtdsOMPiGs6YFT8ry4wSEKpmlLIo83msTOBMWNhQi+bra8eN2MlWLP7Chloc7fSPamxfKIJ2PLdSseQRFMs4iexDSiBLZCscLV6ZiAIXK+QGq2l8stWFeY1kDWc0TUd9
tQ+qzlBfPZyBa64oxmIswTCe8MtmLH6qJLvJCKl8CL1iqV2Ty3GScil8SAlkkWxsNsYrk0hUg6YrKPEYdYDGKp8bvyowf3RmOQcgeVvFsVYpkxUM2fLl56L+S76Ec7HUrsnVOItFCTodUgIFiCnQ6+rKsOv1v2dkSY2liDI5z/bWDqgaw1AoarmQvG4xbcGQLV9+LqKD8iWcxxp7IVnIuYrAKhYlmEsK6XtOhW1K4L777oMgCLjxxhvtGkJRkK1kl0zPY7R4VMBxvNXi0R+MjuoLMNb1gMwUT/wPZkZdKZaeMzMn0UH5KiyWauwelzCmhdzW3oNdW9omXTYhk3F29YcRjqoJCr++Knn0WLo4vYBbsayE8p4sNjQ0hNtuuw2bNm3K96WJDDCSzxKzYQHOSkrLNiOT7U52B/BgywEcPeVH74DR9SxbHaXy1RQmVYIgYqG7yRLgUjVVyWUzlAWzKzEYNHpPcAAUVcdgMIoFsysndV6nN9/JRaJjLsi7Eti1axfmzJmDb37zm/m+NJEBgsABnJFroGq60WJSNwRZOgJpvAzqkcT/YCKyFhfpxKHEKyIQVjAwFM1Kd7h8tSpMlZkcUfSU7Snjm6qkIziy0TnrUMcAyn0uiLGwZFHkUe5z4VDHwESmbeH0lpDF0oY07+6gyy+/HADws5/9bEKfr6kpzeJoCp+6ujJbrjtnZgVOdg8lVA/leSP34KldR1BR4cOis+tTfn7Xlja4XQI8LuMRc0kCIrKKXW+dxNLzPj7q/X0BGWVeMZbprMQS2zhouo7aylKUyiqqyr348fX/NOm5La0rQ0WFDy27j+B0XwjTqn1YfeG8MeczmWuNnO+ut06i3x+GJ85NFJFVzKgrRVdfCGVe455JMQEiChz6A/KoZ2HfwS48tesIRJFDRakLgYiS1nczkr6AjJoKD7i4vBHGWNJrZsJk77Ndz362mFFXmvJ7Tmdu+Zp/zpTAtm3bsGHDhoTX5s6di8cee2xS5+3tDUAfp6DZVKGurmzSlRQnytJzZuLJnYfBgYMoAGbpuopSF8ABT+84NGbTl87uAHweMaG9Jc9x6OwOJJ1TdanL8p8rqgaB54wsXMFotTnWZ5Mx3obcmbU+3LxmYcJn8nWvzXuraixh03zpOTOxvbUDA0EZpV7JundRRUNVqWvU+J7ecQjgAIHnoWrM+D+njfvdjCT+3pukumamTPQ+2/nsZ4uxvufx5pbN+fM8N6bxnDMlsGLFCqxYsSJXpyfSIBstFOMzmMtL3PC6RTDGxl3SZrqhGx9NJPAc1JiiHys/IhWFviE33qZ5uk1VsrXxWshZ2cVMsZSyphDRKUq2WijGZzCbpCOQMxUs8T+YUFgBkzX40siPSEYxhCamitbKpKlKtiKnikVYFSPFUMqalMAUpK29B//z3HuIKJphwfsk+DzShAThRK3EiQiW+B/MsZ5Q2o3mR1LsoYnpNlUZ77vJZCVYDMKKyA22KQHKD8gN5gogqqgQOM4qTQ0AXnfy8tFjMRkrcTKCJZNG8yPxSDxO9Yag6SwW6SJBEPgpF5o41ndT6C4xonCglcAUw3SFSKJglZbWGYM/pExYEBaTldjW3gN/2Oj5C8SUoD+KEo+ItRdPvRLPqb6bYnCJEYUBKYEphukKKS9xoc8fgc44KwHICZt921s74POI8LgE+IMyVE2HwHMoL3HZKvzyXT6g2F1iRP4gJTDFMDcLvW4R1eUe+IMyFFWHRxInnWRVDJjCj+M4eN3G480YQyii2jYmO1wzhdyMhygsSAlMMeI3Cz0uATzvhqbpBakAcmEdpyv8zGuf7AkaMfYCh1m1JTmx0O1wzVDYJ5EupASmGMUS7pcr6zgd4WdeW9UYghEFAAeowKm+UE4sdDtcM8XyHBD2Q0pgClIMG7m5so7TEX7mtYdCUXAcD4BB1RgGAzIkkceW3e1ZvX92uWaK4Tkg7IeUAJFXTDfM4eMDkEQB5SVGDgOQPes43a5naqwstlkZ
lQHQdIaTPUG0tfdkTYCSa4YoZPJeRZRwLvGVRSWRhxrLYQhFFAD527g0SxyLAg8trjQ2xxn/iVku95uqmihZ6UQhQCsBIm/Eu4AqSt3o80fAGOAPGjkM+bKOTcvc6xYRkTXrdaNEHlBeImXdX0+uGaJQISVA5I34DVIzhHUwEIWialY/5HwIyvh9A6NUNgMYIEmClV1cWeLK+TgIohAgJUDkjZEbpF63CJ7nUFniQtNV5+Z1LKZlHh+lRP56womQEiAmTbrx/oW4QUqhlITTISVATIpM4v0LVeCSv55wMqQEiEmRabz/VBW4+a4NRBDZgpQAMSmoUFnhdzIjiLGgPAFiUpgx9/E4rVBZ/GqI4zi4JQFClnMNCCJX0EqAmBSFuNmbb9JdDZHLiChESAkQk6JQN3vzSTq1gZzoMiKlVxyQEiAmzVTd7E2XdFZDTuv05USlV6zkXQm8+eab2LBhAxRFQWVlJX784x9j1qxZ+R4GQWSNdFZDTttAT6X0trxyhFYHBUbelcB3v/td/OIXv8CCBQuwZcsW3HXXXdi4cWO+h0EQWWW81ZDTOn0lU3qqquG0P4ppDLQ6KCDyGh0kyzK+/e1vY8GCBQCAs846C52dnfkcAkHYwvLFs6FpOqKKBsYYooqWcgO9rb0Ht218DU0b96B58360tffYMOLJkSxqzB9UIFIUVcHBMcbY+G/LPrqu4/rrr8enP/1p/Pu//7sdQyCIvLLvYBdadh/B6b4QplX7sPrCeVh0dv2o9/yypQ2iaAjJqKJBVRmuXb1w1HsLmWTzONkTwrQqL0q9kvU+xhgCYRW/+v4lNo7W2eRMCWzbtg0bNmxIeG3u3Ll47LHHIMsybrnlFgwODuKhhx6CJEkpzjKa3t4AdN0WvZV36urK0N09ZPcwbMGpc2/evB8DQRmlXgn+oAx/UIaianBLIv7vlZ8oKrfJyOigQFiBqrMEl1hU0ZIWEHTq92+SzfnzPIeamtKUf8/ZnsCKFSuwYsWKUa8Hg0Fcf/31qKysxMaNGzNSAAQx1TF96cGIij5/BAAHnuMQUbSi85+P3CcxI4bSzSmhENP8YMvG8Jlnnokf/vCH4HlKWCaKj1wKJ3MDeTAgw1AAgM4ASeQt/3mxCsJMckr2HeyyNcTUSQoor0rgvffew65duzBv3jxcccUVAIBp06bh4YcfzucwCGLC5Dr+3cw5kBXNUgAMQLlPmhIhpenmlLTsPmJbXoXTchzyqgQ+8YlP4P3338/nJQkiq+Q66cs8x8MvHEQ4qkISeZT7JPg8EqKKNmVDSkfS1ReCR0r0FORLCTotsY/8MQSRAT2DEbjE3AqnhQ21+M+rP4eacjeqytzwusUxQ0qnIvXVPtsKE+bjOy4kSAkQRAbkq2rqorPrcfUl81FZ4kIooqKyxIWrL5k/JS3RZKy+cF7aeRXZxmmVcal2EEFkQD6rpjq5JpOpBO3YnHVaZVxSAgSRAVQ1NX/YpQSd9h2TEiCIDHGyhe4UnPQd054AQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTiYvCuBffv2YfXq1bjssstw3XXXYXBwMN9DIAiCIGLkXQnceuutaG5uxvPPP4958+bhkUceyfcQCIIgiBh57yz20ksvQZIkKIqCrq4unHXWWfkeAkEQBNraexzTQnIsOMYYy/dF33//fXzzm9+EKIp4+umnMWPGjHwPgSAIB7PvYBd+2dIGUeTglgREFQ2qynDt6oVYdHa93cPLKzlTAtu2bcOGDRsSXps7dy4ee+wx6/ipp57C1q1b8dRTT6V93t7eAHQ973rLFurqytDdPWT3MGzByXMHaP65nn/z5v0YCMpwS4L1WlTRUFniQtNV5+bsuumSzfnzPIeamtKUf8+ZO2jFihVYsWJFwmvR
aBR/+tOf8M///M8AgJUrV+Kee+7J1RAIwrGQq2NsegYj8HkSxZ9L5NEzGLFpRPaR141hURTxwx/+EAcOHABgrBbOPdd+rUsQU4m29h48ufMwBoIyfB4RA0EZT+48jLb2HruHVjDUVnggq3rCa7Kqo7bCY9OI7COvG8OCIODee+/FHXfcAU3TUF9fjx/96Ef5HAJBTHm2t3ZAEHjL1eGWBERjr9NqwGD54tl4cudhRGGsAGRVh6bpWL54tt1Dyzt5jw5atGgRWlpa8n1ZgnAM5OoYH1MZksvMBiVAEERuqa3wjNr0dKqrYywWNtQ6UuiPhMpGEMQUY/ni2dA0HVFFA2MMUUVzrKuDGB9aCRDEFINcHUQmkBIgiCkIuTqIdCF3EEEQhIMhJUAQBOFgSAkQBEE4GFICBEEQDqboNoZ5nrN7CHnFafONx8lzB2j+NP/szH+889hSSpogCIIoDMgdRBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIFRCAQwJe+9CWcOHECALBnzx5cdtllWLZsGe69916bR5dbfv7zn6OxsRGNjY1obm4G4Jz533///bj00kvR2NiITZs2AXDO3OO55557cMsttwBw1vzXrVuHxsZGrFq1CqtWrcI777yT3/kzoiB4++232Ze+9CX2yU9+kh0/fpyFw2F2wQUXsI6ODqYoCrvmmmvY7t277R5mTnjttdfYV77yFRaNRpksy+zrX/86e/755x0x/9bWVrZ27VqmKAoLh8PsoosuYgcPHnTE3OPZs2cPW7x4Mfve977nqGdf13X2hS98gSmKYr2W7/nTSqBAeOaZZ/CDH/wA06ZNAwC0tbXhzDPPxMc+9jGIoojLLrsM27dvt3mUuaGurg633HILXC4XJElCQ0MDjh496oj5f/7zn8fjjz8OURTR29sLTdPg9/sdMXeTgYEB3HvvvbjuuusAOOvZ//DDDwEA11xzDVauXIknnngi7/MnJVAg/OhHP8KiRYus49OnT6Ours46njZtGrq6uuwYWs75h3/4B3z2s58FABw9ehTbtm0Dx3GOmb8kSXjggQfQ2NiIJUuWOOq7B4A77rgDN998M8rLywE469n3+/1YsmQJHnzwQTz22GN46qmncPLkybzOn5RAgaLrOjhuuA44YyzheCrywQcf4JprrkFTUxM+9rGPOWr+N910E/bu3YvOzk4cPXrUMXP/3e9+hxkzZmDJkiXWa0569s855xw0NzejrKwM1dXVWLNmDR544IG8zr/omso4henTp6O7u9s67u7utlxFU5E333wTN910E2677TY0NjbijTfecMT829vbIcsyzj77bHi9Xixbtgzbt2+HIAjWe6bq3AHgpZdeQnd3N1atWoXBwUGEQiF89NFHjpn/vn37oCiKpQQZY5g1a1Zen31aCRQon/nMZ/D3v/8dx44dg6ZpeOGFF/DFL37R7mHlhM7OTtxwww34yU9+gsbGRgDOmf+JEydw++23Q5ZlyLKMXbt2Ye3atY6YOwBs2rQJL7zwAv7whz/gpptuwsUXX4xf/epXjpn/0NAQmpubEY1GEQgE8Oyzz+I//uM/8jp/WgkUKG63G3fffTduvPFGRKNRXHDBBVi+fLndw8oJjzzyCKLRKO6++27rtbVr1zpi/hdccAHa2tpw+eWXQxAELFu2DI2Njaiurp7yc0+Fk579iy66CO+88w4uv/xy6LqOq666Cuecc05e50/tJQmCIBwMuYMIgiAcDCkBgiAIB0NKgCAIwsGQEiAIgnAwpAQIgiAcDCkBwpFcc8016Ovrm/R7Wltb8aUvfWnc65111llJz7Vr1y7cddddAIxqktu3b8eJEydwzjnnjHtOgsgGlCdAOJLXXnstK++ZLEuXLsXSpUtzfh2CSAWtBAjHceuttwIAvvGNb+CNN97AunXrcNlll2HlypXYunXrqPd0
dnbilVdewdq1a7F69WpceOGFuO+++zK+7n333YcrrrgCq1atwiuvvAIAaGlpwbXXXpuVeRHERKCVAOE4NmzYgJaWFvz617/Gv/7rv6KpqQnLli1DV1cXrrzySpx55pkJ76mqqkJTUxPuvvtuzJkzB11dXbjooovw9a9/PaPrnnHGGbjzzjtx+PBhrFu3Dtu2bcvRDAkifUgJEI6lvb0d0WgUy5YtAwDU19dj2bJl+Mtf/pLgk+c4Dg899BB2796NF154Ae3t7WCMIRwOZ3S9r371qwCA+fPno6GhAW+99Vb2JkMQE4TcQYRj4ThuVIlexhhUVU14LRQK4YorrsC7776LT3ziE2hqaoIoisi04grPD//cdF2HKJINRtgPKQHCkQiCgFmzZkEURezYsQMA0NXVhT/+8Y84//zzrfeoqopjx44hEAhg/fr1uPjii9Ha2gpZlqHrekbXfPbZZwEA7777Ljo6OvCZz3wmu5MiiAlApgjhSJYvX45/+7d/wy9+8Qvcdddd+NnPfgZN03DDDTfgvPPOs96zbt063H///bjwwguxYsUKuFwuzJ8/H/PmzcOxY8fgcrnSvubx48dx+eWXg+M4/PSnP0VlZWWOZkcQ6UNVRAmCIBwMrQQIIgv86le/wvPPP5/0b9/61rewcuXKPI+IINKDVgIEQRAOhjaGCYIgHAwpAYIgCAdDSoAgCMLBkBIgCIJwMKQECIIgHMz/D7RrkTIqFAnbAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df\n", "sns.residplot(data=modin_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAEKCAYAAAD0Luk/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABA7ElEQVR4nO29eZhU5Zn//T1bbb1vNItBQjOIWUg0TBAnV1wYGbAjKMEJ0ZBMzO991Z+jwcmko8YxE18TtN+8cUkMZoxijBI1pCVuEAhKflGwDaJ2UBBpAw3SNL1X13q25/3j1Dld1V3VXdVdVaeqz/25Li851VXnPM+pU/d9P/dzLxxjjIEgCIJwJLzdAyAIgiDsg5QAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEg7FVCdxzzz245ZZb7BwCQRCEo7FNCezduxfPPvusXZcnCIIgAIh2XHRgYAD33nsvrrvuOhw6dCijz/b3B6Hrzshvq6kpRW9vwO5h2IKT5w7Q/Gn+2Zs/z3OoqipJ+XdblMAdd9yBm2++GZ2dnRl/dqzJTEVqakrtHoJtOHnuAM2f5p+f+eddCfzud7/DjBkzsGTJErS0tGT8+d7egGNWAnV1ZejuHrJ7GLbg5LkDNH+af/bmz/PcmAol70rgpZdeQnd3N1atWoXBwUGEQiH8+Mc/xm233ZbvoRAEQTievCuBTZs2Wf9uaWnBG2+8QQqAIAjCJihPgCAIwsHYsjFssnr1aqxevdrOIRAEQRQ0be092N7agZ7BCGorPFi+eDYWNtRm7fy2KgGCIAgiNW3tPXhy52EIAg+fR8RAUMaTOw8DQNYUAbmDCIIgCpTtrR0QBB5uSQDHcXBLAgSBx/bWjqxdg5QAQRBEgdIzGIFLTBTTLpFHz2Aka9cgJUAQBFGg1FZ4IKt6wmuyqqO2wpO1a5ASIAiCKFCWL54NTdMRVTQwxhBVNGiajuWLZ2ftGrQxTBAEUaCYm78UHUQQBOFQFjbUZlXoj4SUQBbJdTwvQRBEtiElkCXyEc9LEASRbWhjOEvkI56XIAgi25ASyBL5iOclCILINqQEskQ+4nkJgiCyDSmBLJGPeF6CIIhsQxvDWSIf8bwEQRDZhpRAFsl1PC9BEES2IXcQQRCEg6GVADHloSQ+gkgNKQFiSkNJfAQxNuQOIqY0lMRHEGNDSoCY0lASH0GMjS3uoPvvvx9//OMfwXEc1qxZg29+85t2DINwALUVHgwEZbglwXqNkviIfFLoe1J5Xwm88cYbeP311/Hcc8/h97//PX7zm9/gww8/zPcwCIdASXyEnZh7UgNBOWFPqq29x+6hWeRdCXz+85/H448/DlEU0dvbC03T4PP58j0MwiEsbKjF1ZfMR2WJC6GIisoSF66+ZH5BWWLE1KUY9qRscQdJkoQHHngAjz76KJYvX476+vq0P1tTU5rDkRUedXVldg/BNrI196V1ZVh63sezcq584uTvHpga8+8LyCjziuA4znpNFDj0B+Rx55ev+XOMMZaXKyUhHA7juuuuw6WXXoqvfOUraX2mtzcAXbdtyHmlrq4M3d1Ddg/DFpw8d4DmP1Xm37x5/6g9qaiiobLE
haarzk35uWzOn+e5MY3nvLuD2tvbcfDgQQCA1+vFsmXL8P777+d7GARBEDmnGPak8q4ETpw4gdtvvx2yLEOWZezatQuf+9zn8j0MgiCInFMMe1J53xO44IIL0NbWhssvvxyCIGDZsmVobGzM9zAIgiDyQqEXlrRlY/jGG2/EjTfeaMelCYIgiDgoY5ggCMLBkBIgCIJwMFRFlCBySKGXDCAIUgIEkSOojDVRDJA7iCByRDGUDCAIWgkQRI7oGYzA50n8iVEZ60TIXWY/tBIgiBxRW+GBrOoJr1EZ62GKocKmEyAlQBA5ohhKBtgJucsKA3IHEUSOMN0a5O5IDrnLCgNSAgSRQwq9ZICdUNe3woCUAEGANijtYPni2Xhy52FEYawAZFUnd5kNkBIgHMFYQp7i+e2B3GWFASkBYsoznpCP36AEALckIApDOJFAyi3kLrMfig4ipjzjRaH0DEbgEhN/CrRBSTgFWgkQU57xolCcvEFJeyEErQSIKc94SVtOjeenZC0CICVAOIDxhHwxtADMBZSsRQDkDioYki3Ll9aV2T2sKUE6UShO3KCkZC0CICVQEKSKXqmo8OHMWp/dw5sSOFHIj4eT90KIYWxxB/385z9HY2MjGhsb0dzcbMcQCopUy/KW3UfsHhoxhXHqXgiRSN6VwJ49e/Dqq6/i2WefxdatW/Huu+9i586d+R5GQZEqRPF0X8imERFOwKl7IUQieXcH1dXV4ZZbboHL5QIANDQ04OTJk/keRkGRalk+rZpcQVOVQgnNJDcZwTHGmF0XP3r0KL761a/it7/9LebMmWPXMGxn38Eu/LKlDaJouIKiigZVZbh29UIsOrve7uHZzr6DXWjZfQRdfSHUV/uw+sJ5RX1f6PsmCgnblMAHH3yAa6+9FjfeeCOuuOKKtD/X2xuArtumt3JG0uig8z6O7u4hu4dmC3V1ZejuHkrYNI8vMlbMbovmzftHrfyiiobKEhearjoXwPD8nQrNP3vz53kONTWlKf9uS3TQm2++iZtuugm33XYbGhsb7RhCwUHL8uRMxbo+FJpJFBJ5VwKdnZ244YYbcO+992LJkiX5vjxRZExFgUmhmUQhkffooEceeQTRaBR33303Vq1ahVWrVuG3v/1tvodBFAlTsU8vhWYShUTeVwK33347br/99nxflihSMmk8UigRN+NBdfSJQoIyhomCJl2BWWyNYWgPiCgUSAkQGZNvizsdgTkVN5AzpVhWQkRhQUqAyIhCtbin4gZyJhTq90IUPqQEHE6m1mOhWtxOj7gp1O+FKHyon4CDmUhTkUJtxej0iJtC/V6IwodWAg5mItZjoVrcuY64KXR/e6F+L0ThQ0rAwUzEj55JyGa+yVXEjV3+9kwUTyF/L0RhQ0rAwUzEenRijLsd/vZ9B7syUjxO/F6I7EBKwMFM1Hp0Qox7vBU+GIiisswNxCnLXPvbW3YfyVjxOOF7IbIPKQEHQ9Zjcka6f/xBGX3+CDiOg9dt/GRy7W/v6gvBI9FGL5F7SAk4nMlYj4W+WTpRRrp/Kkpd6PNHMTAUhccl5MXfXl/tQ3d/iDZ6iZxDIaLEhDCt5a7+MIZCMj44MYAHWw7guVc/tHtok2ZkuKXPI6GqzAXGkLc2jKsvnOfokFcif9BKgJgQ21s7oGoMQyEZAAeB56HpDC++3oE5M8qxsKG2aFcKyTbMRVFAwyyv1fQl1yw6ux5XXzK/KO8fUVyQEihi7BSyPYMRhCIKAA48Z7zGc4CmM2xv7QCAoi1jkOmGea6+B9roJfIBKYEixe5aMbUVHvQPRSDww24TBkASjM3LiYRVxgvTGXWlWHrOTNuarwPpbZjb/T0QxGQhJVCk2F0rZvni2Wj/yA9NZ+A5QwEwAF63gNoKT0IiWiiiwB9SoKo6egciaGvvGbcUdL8/bKswTdcKt/t7IIjJQkqgSMk02zfbLouFDbVoXDIbL77eAU1nkAQeXrcASeSxfPFsbG/twEBQhqbp
6BuKIuYxAschqXAfKUw9kgBVY7YJ03Tvl9OrlxLFDymBIiWTbN9cuSxWfmEu5swoTyksn9x5GIMBOaYAOHAcg9ctYDAg48GWv6FhVoX1/okI01z54jO5X9ms2VMo7jDCWZASKFIy2bzMtssiHeFrHj/Y8jcwxiCJxvUDERUcAMZYgnDNVJjm0hefyf3KVs2eQnOHEc6B8gSKlIUNtbj6kvmoLHGNG7uezTLDmZSfXthQi4ZZFair8qG+2oeIolurAkkU4JYECAKP7a0do0pBR2R1TGEaL6g5jks412TJ5H5l8j2Mxcj5eFxi1uZDEGNh20ogEAhg7dq1eOihh3DGGWfYNYyiJt3Ny2y6LDJdVcRbyqqqAwA4jqG8xA1gWLiOjMgx3SEA0Lx5/6hVRy598Zner2yEcprzCUdV+IMyVI1B4IFQWJnUeQliPGxRAu+88w5uv/12HD161I7LO45slhnOVPjGC/fegQg4DqgscyetwRMvTOvqyrDr9b9bLhKOAz486ccDW9ows8YHj2TMIxdlFewoy1xb4bGyr43kO0DVGHSmJY2mIohsYYs76JlnnsEPfvADTJs2zY7LO45suSwAQ1jJMYveZDzhe7TTj46uABRNh6LqCEWUtEohbG/tgKLq6B2MoHsgAiV23dMDEfjDKkJhJSdlFbJ5v9Jl+eLZCIQUMAZwYNB1ABxQ4hHJJUTkFI4xxuy6+MUXX4zHH3+c3EFFxL6DXfhlSxtE0fDDRxUNqspw7eqFWHR2/aj3/3bHITy98zDAAQIHKBoDY0CJV8LcWRVYfeG8pJ8DgK/9YBuCYQWqlviI8hwws64EoiCgrMSF030hTKv2jXmuYuBrP9iGqKxB1XRIIo/KUjd8HhGBsIpfff8Su4dHTFGKLjqotzcAXbdNb+WVuroydHcP2T2MBM6s9WHt0nmjooPOrPUlHevW3e0AAIEztoQlgYOmMzCd4eY1CwEg6efq6sqgKDpME8XMMzCT0niOQyAk445vLEr4XKHdr0yYUe2z9iIkkYei6ghGVFSVuop6XhOhEJ/9fJLN+fM8h5qa0pR/LzolQNhPJhuhEVmFYBYXisFzxuvjIQocogozQkrjXucwNcsqx+9FGHOnyqFE7iElQEyasfIGPC4RUUWDEKcHdGa8Ph4za0vQ1R9GIKxAUY3wUp6HUbF0CgrH+E30/oCMqlLXqByMYq3MShQupAQIi4kImPgkJ4DhgxMDONQxAIHnMKPGh880VKP10GlourEC0BnAwLDsH8ffBzIt45oKDzRNx2BAhqrpmFbpwaIF07C9tQNP7DicU2GYb6FrrrKSuQOoWF1xUuiK21Yl8PLLL9t5eSKOiQiYtvYe/M9z7yGiaOA5JGzgajpDZ28Q/pCCxQum4Z32PkRkFR6XiGX/eAZWfmHuuGMamTswd2a5Zf3nWhi2tfdgy+52nOwJQhR4lJdItgtdKlZXfBSD4qaVgEMZaZ0EQnJGAsZ8uKOKCoHjEhSA6cPnOB6RqIrjpwOYXV9qXWvOjPK0x5ls/6F58/6cCkNzboMBGRzHQWcM/QEZ1WVuK4vXjh8wFasrPopBcZMScCDJrJPTfSFUl7sTkq/GEjDmwy2JRrVPFrd1aygAw/2jqDpO9oYwjSFrllAmwnAiS3Fzbjpj1qa2zhj8IQX1VV7bhG66mcyF7n5wEsWguEkJFCht7T3YtaUNnd2BrP+QzSQss8a/KBoZuf6gghKvy3rfWBE45sNdXuJCnz8yKoJH4DnoDAAHiFm2hDIRhhNZiptzEwUeqmb0S+BglL2wMyopnUzmybgfnnv1Q+z464mM3XZEarJZsiVXkBIoQMwfstsl5MSP+FFPEKGoUc2T5wBN08FgPJxRRUsqYEZal0zXcao3ZDSV4Y0y0WZMv8Aj9m8jw7ei1JVw/claQumWdZjoUtz84ZoKTmdcbF6crVFJ6XQ8m+icn3v1Qzy35yg4cBB4Izz1uT1HAYAUwSSwowRJppASKEDM
H7LHJUKJ1cfJph9R0xjAjCQSIObDZ0Yilz8gJ1iCZsP4eOvyVF8Ig0EZgJG0xRgDx3HwShx8Hgn+oAKAob66BACgjkjuG2kJJXNfLK0rSzn+dNs/TnQpbv5wBYFHVZk7FpXEML3GizUXzbPVtTJejsZE57zjrycsBQAY2d2abrxOSmDiZNKq1C5ICRQgufYjmklYOuOssE2dMeiMoabUA58qwB9U8NxrR7Hv/W4ASLAuw7IGnuPA84bQUDUdAs+hpsKLO7+1OOFa1gYykltCqdwXFRU+nFnrSzmHdBLWJroUTxWVVEg/3FRMdM6TSeojxiYbVWZzCSmBAsT8Ibty5Ec0k7DCURWqpkMUeDBwEDjD3dEf6wbGmOE6YsxQHNXlHnjdIlRVN/oKs2FrnzGGUGRYYMRb9x6JBzgOoYg6yhJK5b5o2X3EKisxUSazFC/0H24qJjrnyST1EcUNfcMFiPlDjsgqeI6btB9xpLtlwexK9PojqCxzW4LidF8I5eUu+EOK1fmLAdZur6oxnO4Pg+cNF5DOACmu8Uq8khpp3cuqDk3V8LVloytxplr1nO4LTWiu8RTDUjzbTHTOy/7xDDy35+iEkvqI4oaUQAFi/mB3vXUyaXRQJiGApkBWVB3hqIa+oSjaT/qxaH4t+gOydQ6RA1QGqKpsbBanqNGn64Ae0wwe3tgPGKmkRlr3ZrbvyL7CQGr3xbTqRFfQRMMei9WinwwTmbPp96foIOdhaynpieD0KqLxVnb8cj9Vvfvmzftxqi+EobBh4XMwrDye43DD6k8lKBYzQUpnbFT55pEIPAee4+BxC6gocQGMIaIYq4GPeoKoKnOD4ziEIgr6hqLW6qKuypcw3lTz+d9rPmvtCZjvUTWGUESBqungOR6NS2bnRUjZEXdvdxVNu3MN7J6/3eSziij1GC4yMu2t2zMYQTiqxcJBOXAcF4vh1xM+YzZSmVbpga4zcFzS0yVQU+lBRYnL6CkQlwwWiWrwx6KHTPdSsr7C8dcd2cAlvi/A9tYOqBrDUEiGpscKyOk6/vDaUax/4C9o3rw/aY/jbJBJT+WpghPn7GTSdgcNDg5CEASUlqbWKIRBLq2o8SKHRl7b4xJiljiDorNY5ypAELhR0UamG6GtvQdbXjmCEz3J/fIcAFE0LPfOniBqKr3QNB1dQ1Goqm4knoUMJRCVtdhnGEpjiWgjI53SCXsMRRQAZjQTg7kYjCpaTuuxFEPaf7Zx4pydzLhK4MMPP8R3v/tdHDx4EBzH4ZxzzkFzczNmzpyZj/EVHbkuGDVWCGC822QoGE0ZUspgFHjzuISkfzeF8nOvfojn9xyDNtL9xgHlPinWZpKDqmpWRBHPGefWGTAQkK2P8DyHQFiBSxLA81xGkU4el4CeQbNJPazNai52rVwKqUJO+8+VsVHIcyayz7hK4NZbb8WVV16JL3/5y2CM4emnn8b3v/99bNq0KR/jKzpybUWNFQJouk0GA1GMtW0i8ADAAeNsB638wlzMmVGOLbvb0RnLDhZ4oCpWSE3TdNRXeXB6IGK5mzRdT7i2wBtJR8bWE4eBoSgqYnXy06GtvcdyLQGJQ+ZglKQAhoVUtgVjJiUqtrxyBF39ERiJcj6subAhZ5ZzLo2NYih1QGSPcZVAOBzG2rVrreN169bhmWeeyemgiplcW1FjhQA+seNwrIm78d74lozx8DyPihIJEUXHeMS7auIFbGXJsCB/YEsbOM4osKOPOCUXM911BkuCp9rEjj//jLpSLD1nJra3dsDnlazaRvFz4XkO5SWGi0lWdXhcAh598SAisgZNZ/AHZTz64kFc03h22oIxWTjtawdOjVuv59EXDyIYUcHFdGtnbxCPvnQI11y6ICeKIJfGRjGUOiCyx7hKYO7cudi/fz/OPfdcAMDhw4epMfwYpGNFTdZajffdDzdW6YDHJaB/SE8QlCMVgChwVvTP9GpvRnNL5bufWVuC0/1h6LHcAoE3lAEDRkUZaTrD
b7YfwrrlicJxpGV7snsIDz57AKqmwyUKKC+RUFclwh+UEZU1MABlPhc8LsFqw6jIuiWIzb2DYETFlleOjBp3su8AGN2n4LUDp/BPn5qOQx0DY9bricha7LoxZcg4RKJqRkI5mRJM9dlcGhtOzK9wMuMqgZMnT2LdunU466yzIIoi3nvvPdTV1eGyyy4DADz//PM5H2QxMZ4Vla1lfLLzhMLG5qkp+kcqgGGXjSEcF8yuRPPm/ZP+oa+5sMEaS78/ClXTwfEAS+GT6h2S8bPf/w0lHhEza0ssV5Zp2fb7IxgKDVv9sqKhb0hHdZkb9dU+RBUNIgeU+lwJY3+w5cCwII7NVweLuWjGvnePvnQIUdnIoJZEAeUlLnjdIqIADnUMoOmqc1POv2cwYhTSi4uoMvZG9LSF8sgx9fvDYz4XuXbZODG/wqmMqwT+8z//Mx/jmDKMZ0VteeUIBgMyNJ1BFHmU+6QJNSpJ5g4AjCze/kB0lFuG52DVyBcFI0z0tQOnrNj7/qEI2j/yTyj2Pn7OwbACXWco9UkYCERTbjtoOkuI7InIGqrK3AhFFPhDSsJ7GQBdY/AHFWsvYm1Sl1KskmmcMGYM4LjEQYy8d7rOEAwb4aeSYDTI6fNHUF3uiW1Kjy3Iays88AeN/Arz0jozQlnTFcojx+SRjD4NqZ4LctkQ2SKlEmhvb0dDQwNKSkqS/v2Tn/xkzgZV7KSyotrae3CyNwSe56wSzn1DUVSVujJexqdyB6iqjpu+vNDapFQ1HbzAoSZW9wcwwir9scqYQyEZABeLvWd48fUOzJlRnrEVmGzvIBCSocTcQcn2J8zIHr+sIRhWEYyoMHMXh7uTDZcxUFTN2otINr76ah86e4MJhfHiq5mmunfGxrNxRQbzehz8QRk87x5XkC9fPNvaE9BNRQQGjyf9DfBM3TvksiGyRUol0NzcjF/+8pe48sorMWPGDMQnFofDYezdu3fCF33++eexceNGqKqKb3zjG7j66qsnfK5iYntrB0SBh84M4WZk7xoW7tyZwy0XTSF6qi8MWdEgCBxmxdwmqcothCIKBgMyGDOsR7Pkselm4EeUeBAELiH2HhgO75zo5uJIP3vjkjPx4t4OKJqe1DUlCjzCURX+YBQMAItbvZjvF3jOUgjzz6gY0y2z5sIGPPrSIUSiKjRdh8Dz8HhcWHNhQ8L7Rt47VTNyG0SBM1YdMYteSdO6XthQi2saz7YUL8cZiieT6KCJuHfIZUNkg5RK4J577sHAwAAaGhrwm9/8xqoZrygKvva1r034gl1dXbj33nvR0tICl8uFtWvXYvHixZg3b96Ez1ks9AxGUFHqQv9Q1LJWGQNUffSegaox+IPDoZ6HOgZwqGMAZ9QZwiXeHaCqGvqHZAAM1eUey8dd7hURUfSkVTy3t3bggxMDEPjhpHEGQBImtrnY1t6TIID9QRkne0NYdFYt3jjUPSrXgItF9viDsmE5s1jTlrj38dzwHobAc+kJ40sXjGsdj3SlMMaswmmCEDtmgEcSU0YyJbv2ZATyyDFFZLVg3Dt2l5AgckvKshHf+c53cN555+HIkSNYsmQJzj//fCxZsgQXXXQRPvWpT034gnv27MF5552HyspK+Hw+/Mu//Au2b9+e9udfffXPAABVVdHUtB4vv7wTABCJRNDUtB5//vPLAIBgMICmpvV47bX/A8DIeG5qWo/XX98DAOjr60NT03rs2/cGAKC7+zSamtbjrbfeBAB0dp5EU9N6tLW9DQA4caIDTU3r8d57BwAAR4/+HU1N6/H++4cAAO3tR9DUtB7t7UcAAO+/fwhNTetx9OjfAQDvvXcAx1sfhRLqRXW5B8rgMZx68zHo0QHMrC2B5j+Gpqb12PryOxAEHt0nDqL7rcehyQFjfj2H0fv24zjR2Y0ndx7GgbdbEXj3SZRKGgYDCiLd76K/7Qm4RQZdZzj9931o2/UQPC4OKgM62/+K4HtPoumqc7GwoRZV6mF0v/0E
NJ2BMYbB42/g9NtPwusWUFvhwdatW/Df//19675v2fI07rrrDuv4mWc2Y8OGO63jB37xPzj+5tOWL7y//RUc378Fx7uDuPHLnwY79Rf4P3jRELQ8h/CxXej82/NQVA06AwaP/BGDR/4IMVbLePCDbRg4stNYuegMQ0dexLY/PGld76c/vQePP/6oddzc/CNs3vw4FjbUoumqc8GdeBHHDuzCEzsOo3nzfnz3lluwZcvTAAyBrbS3YKjjdfQPRQFw6PvbZgRP7jOqpzKgv+0JfKbmpCXsmprWY+fO7Tl79p745V34/MeM0NuB3tM48pdfYckcBQsbarPy7DU1rceJE0apjra2t9HUtB6dnScBAG+99Saamtaju/s0AGDfvjfQ1LQefX19aGvvwcbf/AF/3fZzSFwUA0EZv/j1s7jhxhsQDBrP5p///DKamtYjEjGMh5df3ommpvVQVaO0+M6d29HUtN76rrZtewG33vod6/iFF7biv/7re9bx1q1bcPPNN6f97G3e/Diam39kHT/++KP46U/vsY43bXoY99//E+v44Yc34sEH77OOH3ro53jooZ9bxw8+eB8efnijdXz//T/Bpk0PW8epnj2TDRvuxDPPbLaO77rrDuvZA4D//u/vY+vWLdbxf/3X9/DCC1ut41tv/Q6effZZ63iyz178vUpGypXAI488EhvQrdiwYcOYJ8mE06dPo66uzjqeNm0a2tra0v58SYkbdXVlUFUVkiSgrMyDuroyRCISJElAebkXdXVl8Hq5hGNJ0iBJAioqjGOOiyYca1ow4TgaLYEkCais9KGurgzBYOLx4KAPkiSgqso47utLPD59OvG4stKHuiofwACXxKOu0oOgxKOy1I1vrfo0NP9RSJKAgaCCmhoRmpY8hl9nQJ8/ip3HTqBMFPD/XP8FfPv+vfBGPTh1moMk8ugdUmLx+YBLEuDhBQyIHPoDMupiHbvOnluLfdU+cPywpS2JRjezryxbgA/ePgmXS7DeX1rqhsslWsclJW643cPHgZhryWxMYtYe6uoPY+l5H8fBfXMwODiI739/FfYd7MKGe/6CYDAMj1tCKKxYXc4EngeL8wu5JQFV5W6c6OBx8Fg/jvWEsOjseng8kvUsAEg43newCx1dAZRWl2NmqQuBiILj3QHMD8o41hNCy+4j+KgnhBln8qiu8KKiVINfEmKrIg6iYNyLs+fWWuePf9Zy9eydc/Z0XLd4MU6cOIE773wF535iRtaePeO4JOG4uto4rqjwJj2uqSnBr19+H5LIQ+A5uEQBkkeCX+TRF5BRW1uG0tJSlJcb76+rK4PH40FZmcc6FkUx4RgAyso8Cc9SaenoYwBpP3slJW54PFLCcSAwfOzzuaAoroRjQWAJx/HX83pdcLtdCcc+nyvps5bs2O0WE45dLhGlpfHHAkpLPSP+nngcP55sPHtjkfcqohs3bkQ0GsX69esBAM888wwOHDiAO+8cW1uZFHsV0fGW1s2b92MgKKOrNzTKj25iZuHynBGjDxgtHE1/8kfdhoUmCDymx0oym01fmq8/P6PxJCNZQ/IX93aAgcV6DbDYCsPw/X/7yoVjlrp+sOUAdGaEWMY8Q1Y464za4U1df1BGKKLEvn8O9VWepO0ezXsY71+PKhrEWO/c+Iqlp/tCqKnwwOeRrPemulf5pFCqaDZt3AOfR7SMCiA/96dQ5m8X+awimvd+AtOnT8e+ffus4+7ubkybNi3fw7CN8XzHyxfPxqMvHUqpAABDAXAwsnFP94eNGkCm5S/ylh+93Dcs2FJtMmbqyzYbkoMZq5JQVMXWV4/CLfFQNUBjOuIXMTyPMePdFzbUonHJbLz4egc0nUESeHjdAvwhBVVlww3qw1HVCjkVYo3tO/vCSTOCU0XamMXu4sNqRYHHYEBOUAJUImEYKiEx9cl7Kenzzz8fe/fuRV9fH8LhMHbs2IEvfvGL+R5GYROL5R8LnkesJDSDzytBEjj0+yPo6ApA
VnXoupEVzBizMmqzscm4468nLAUADId+RhUdLonHSC9WqVcas9Q1YNQouuGKT2H+GRUo80mYXu3D7PoyCMLw42luIHMY7mXAcUBE1kadu7bCEytuN4xZ7M4lJj7y5SUSVE1HVNGyfq+mAssXz4ZG92dKk/eVQH19PW6++WZ8/etfh6IoWLNmDRYunFwv2bEopsiGtvYe/M9z7yGiaJBEHmVeASFZg6xoCUlXxuaqEWoqxpKnev1RCLzhz2bMCPXsH4piIBCFN+ayyca8I7I6SgGY+NwiwtFY6ehYCGwwosIljZ9wNXJF8qe3PsLv/vQBdGb0QDaFelwwEzgY8xx57lSJVPVVhnKIt2pFUcDMGt+o7ONCfUbyTSHlIxTTb7mYsKW95GWXXWaVncgluS7rPNZ1M31YzbFGFdVq+D4U1lFV5obX7cXAUBRul2B1/jITocpL3BgYihpC1wypjNtYZXEum22tHVixOHVGcDrj9rhEhKLqKAVgFHiT4RL5mH/feIfOgMGAnJAHkc79e/mvx1HiNZSKohmFiAwXWGJIq5CkLHUqwQUgqXJInn1MmBRCPoJdv2UnMKV7DNvRHCN5T98Do3r6JitCJgg8JFGIFV0z3Dk9AxFIIo9pVV6subABW145gpO9IYgCh4pSN3ieM5KdMGyZjywZYRJVdGx99Si6+kL4v1Ymhvmm+yNb9o9nYOurR0ftWXhdAhSVobxEQn8gsYSCqrGM3AfbW40EM6MaqA6XyEMUOISjGhjToTHOqtbpcQtpnzubVi1ZpfmFGt3kjimtBOxojrG9tQOKqls9fYVYeYi9751GiUeEqulJ6/SYYy0vMdwSZgQUg+HPPtEdROu7p3Dn/zrPis7pHYzA4xJRVebGUEixBO94sVN73zuNDz7ag68tm58gGFP9yMy/W+WVP1aBQ8cHrfOZ2b/VZRJUBlSXueEPKVBVHQLPYXqNUa003WJ1J3uCsWqghu9f1RgUVYNbElBd7kFXXwgAh+nVyaODxlNokxUaZJXmH2p0kzumtBKwI7JhZE9fwKxIA4QiKgSBT1qnxxyr1y0aG75JwmD3vncawAEcOelHeakLtaLh4w6FFQgCB01hGF2gIfU47/tdG1wij0vPm53yR3ayJ2gJPIDhw5N+yKoOnjOyfsGYEV/PcVi0YBpeO3AKgsCjvspruVsWLZiWkdBUNcP3k1DOghlrnTu/tTjpfOIt81BEhUvirYifbFuNZJXmH4pSyh1TutG8HZENtRUeKDH3jIm5qWsWJwMS6/TEj9UflKGoKfw5MBTByEbzPq+EmjI3ZlR7x2sWNgpZNVxEA0MRq4NXOKqiqy+Ej7oDCIRVq4aO6eYBDF8/B6CmwosZtSXweUQc6hhI2jT+UMfAqDGPFTEkCJxVV4kxZlyTM15PxsjG6FFFxVBYidVGMsim1dgzGBkVZURWafZoa+/BbRtfQ9PGPWjevB9t7T0UpZRDprQSWNhQm1Qo5dJaW754dix00zg2hSUwnEULjK7Ts7ChFv/0qekIhBPLKCdDVbWEY5fII6LoWHPRPFSXu1FV5h61cTvuOXWjJ/CxU0M43R82FBFnFJ0bCivoH4omrG5iM7IUhykEzbINX1s2HwDwxI7DaP9ocFQG9FhCc1ZtCSpKXbHS10bSW5lXwqza5BVt4y1zjuMgiQLAkFCSOptWY6oQVLJKJ4+p0Pv94VGrxon8ltvae9C8eX+CQiESmdLuICD/kQ0LG2rReN5svLi3A5puhDd6XCICYRVgRralmRVr1ukBjId1x19PQNd1SCI/5mrAH1RQ4h1OpDIFkCkMfR7B6CswFB3zPGPBGENNuQdDIQWKqkPVGSRhuFmLuapRY8I9XgiO9Jn7gzL6/BFwHGeVsx5LaC5fPBtP7TqCqjIhrVr5I11Z5SUu9A4aiiy+cmq2rEaq5Z87zGfY4xKhxMJ5TVebWfcqXWjvJj2mvBLIFplEg5gN2uPfX1Xqwr7DPQlZsZLIY/ni
2dbDGlE0CBwXa4SSvA+8zy0gImuIKtooAfTEjsOWMPS6RXjdIkIRBT0DkbR2CuI3lRkAn0eCrGiIyMbKQ9UYOI5Z4aiabiS1jVyaj/SZV5S60OePYmAoCo9LGFdoLmyoRUWFD0/vOJTW/R7pL/a6RZSXuBGVtYTKqSPbWU40uqeQYuenGtncAKa9m/QgJZAGE7Eokq1A6uNq7gg8hws/a/SQbd68PxYeaiR+8RwA3vh3QmN1zhDE1eUeVJYMJzctmF2J7a0dGAxE4Q/KqCh1WZuigsDjrNmVWL54thVemqr0UkJvYgYc7xqy3Fnm6oUxoNwngec5BCMq3JIwqtHLyB+yzyMZlUoDSkqhPJJFZ9fjzFpfyr/Hk8wyFwUO31j5iZTNfSZrIRZC7PxUxFTorixsAFNEUXqQEkiDbFgUbe09eO3AqYSontcOnMKcGeXD4aE+CX1D0YQYe8sFzwwBLKtG39pgWMaZ08uxYHalFZFTWeZGnz+CPn8UjDGIomBZ3PFC67lXP8S21g5EldGuouGOXIkb2pWlLkSiGiKKZvnaF3ysAk1Xf27UOZJFcoiigIZZXquXwRM7DqO2oiMrFnSmlnk2LUTKF8gupkKPyCp4jpuUq40iitKDlEAaZMOiGEvwmA8rYFjdaqwlo9sloNQjYiAQhTbCeg/LOjq6hvD+8QFwMMrNlvsk1FQY2cWDAcUSuiOF0sovzMXKL8xFW3sPfvPH99HrjwIYrk4KJCoDgediPXQTx3Do+CCu//9ewfWXfzrhGql85gtmV+bMR5uJZZ4tCzFZI51HXzqEay5dQIpggpj3bddbJ9HZHZiUYqW9m/QgJZAG2bAoxhI8X1s23+pRy8WargAMPrexOSYKAjhOTyjPzACEYnV6GIb7FVeXuTG9xodQRE1oxZjKYv1//3dtQmloDkCpT0R1uRen+kIx9xQHWU3uQ4oqDA/8vg0rz59jJb6lsswnaoFn29rOloW4ZXc7gmEZHMdbtZyCYRlbdreTEpgECxtqsfS8j0+6lDLt3aQHKYE0SNeiGEtYjSV4FjbUorzEhYisQWcMksijvMQFl8SjbzAKRdMhxDaKR2YEx28gayPKTMSPaywL3FwZAMO1+AFY7qmRrSFHouvAi3sTG9Qns8zjN65NxrPAcxHhkS0L0cxcHpnUZrxOFAK0dzM+UzpPIFukk28wMmHJFFZmXPJ4yS4RWcP0Gh9m1ZWizCdhMCjjVG8IUUUDwKzm9AkKAMNx+6YLKb7MxB2PtFqKKd1krfhxet0iyryS1fVrLHSmj1kuGphYfH0mY0+X7OWPcAm5H4C5h5NplgZB2AetBNJkPItiPFfHeEtTc6VgunUsMcIB0AEtLl403l9vuo90nVkKgueGG848ufMworIGl2TkDaiqDlHkUeYVk1rgI8c5vdqH5Ytno/XdU7GyFaMxaweN51OfiAWeqwiPbFiI9VUedPaFoWO4ZhNjwPRq2ngkigdSAlkiHWE1luAxBeRgQIaZkmUmlnEcB4k3/q9oRjIZxwGaBpT6JPiDUQiCUWiN5wyBDBhlFwSBR0SWrf0GPlbQrn9Ixoxqb9KxJBvnwoZa9A+9mVA4zoTjjDDQkRZ9MvfY1ZfMz8hHW8gRHmsumodHXzwYq3Zq9Gn2uAWsuWie3UMjiLQhJZAlJiusTEH4YMvfwGL7Ajoz/O1czB10Rl0JooqGyhIXmq461xKygZAMjuPAgVmN3s2GMy6Rj/n0GQA+bg9Bxyhfxjg0Xf05tLX34Ikdhy3lJvJAWYkbosAlWPSpfPlXXzI/YcN6PAo5wmNhQy2uaTybNh6JooaUQJbIhrBa2FCLhlkVljL5qDtouX7EWMGy+NWFabGbAtdsOMPiGs6YFT8ry4wSEKpmlLIo83msTOBMWNhQi+bra8eN2MlWLP7Chloc7fSPamxfKIJ2PLdSseQRFMs4iexDSiBLZCscLV6ZiAIXK+QGq2l8stWFeY1kDWc0TUd9tQ+qzlBf
PZyBa64oxmIswTCe8MtmLH6qJLvJCKl8CL1iqV2Ty3GScil8SAlkkWxsNsYrk0hUg6YrKPEYdYDGKp8bvyowf3RmOQcgeVvFsVYpkxUM2fLl56L+S76Ec7HUrsnVOItFCTodUgIFiCnQ6+rKsOv1v2dkSY2liDI5z/bWDqgaw1AoarmQvG4xbcGQLV9+LqKD8iWcxxp7IVnIuYrAKhYlmEsK6XtOhW1K4L777oMgCLjxxhvtGkJRkK1kl0zPY7R4VMBxvNXi0R+MjuoLMNb1gMwUT/wPZkZdKZaeMzMn0UH5KiyWauwelzCmhdzW3oNdW9omXTYhk3F29YcRjqoJCr++Knn0WLo4vYBbsayE8p4sNjQ0hNtuuw2bNm3K96WJDDCSzxKzYQHOSkrLNiOT7U52B/BgywEcPeVH74DR9SxbHaXy1RQmVYIgYqG7yRLgUjVVyWUzlAWzKzEYNHpPcAAUVcdgMIoFsysndV6nN9/JRaJjLsi7Eti1axfmzJmDb37zm/m+NJEBgsABnJFroGq60WJSNwRZOgJpvAzqkcT/YCKyFhfpxKHEKyIQVjAwFM1Kd7h8tSpMlZkcUfSU7Snjm6qkIziy0TnrUMcAyn0uiLGwZFHkUe5z4VDHwESmbeH0lpDF0oY07+6gyy+/HADws5/9bEKfr6kpzeJoCp+6ujJbrjtnZgVOdg8lVA/leSP34KldR1BR4cOis+tTfn7Xlja4XQI8LuMRc0kCIrKKXW+dxNLzPj7q/X0BGWVeMZbprMQS2zhouo7aylKUyiqqyr348fX/NOm5La0rQ0WFDy27j+B0XwjTqn1YfeG8MeczmWuNnO+ut06i3x+GJ85NFJFVzKgrRVdfCGVe455JMQEiChz6A/KoZ2HfwS48tesIRJFDRakLgYiS1nczkr6AjJoKD7i4vBHGWNJrZsJk77Ndz362mFFXmvJ7Tmdu+Zp/zpTAtm3bsGHDhoTX5s6di8cee2xS5+3tDUAfp6DZVKGurmzSlRQnytJzZuLJnYfBgYMoAGbpuopSF8ABT+84NGbTl87uAHweMaG9Jc9x6OwOJJ1TdanL8p8rqgaB54wsXMFotTnWZ5Mx3obcmbU+3LxmYcJn8nWvzXuraixh03zpOTOxvbUDA0EZpV7JundRRUNVqWvU+J7ecQjgAIHnoWrM+D+njfvdjCT+3pukumamTPQ+2/nsZ4uxvufx5pbN+fM8N6bxnDMlsGLFCqxYsSJXpyfSIBstFOMzmMtL3PC6RTDGxl3SZrqhGx9NJPAc1JiiHys/IhWFviE33qZ5uk1VsrXxWshZ2cVMsZSyphDRKUq2WijGZzCbpCOQMxUs8T+YUFgBkzX40siPSEYxhCamitbKpKlKtiKnikVYFSPFUMqalMAUpK29B//z3HuIKJphwfsk+DzShAThRK3EiQiW+B/MsZ5Q2o3mR1LsoYnpNlUZ77vJZCVYDMKKyA22KQHKD8gN5gogqqgQOM4qTQ0AXnfy8tFjMRkrcTKCJZNG8yPxSDxO9Yag6SwW6SJBEPgpF5o41ndT6C4xonCglcAUw3SFSKJglZbWGYM/pExYEBaTldjW3gN/2Oj5C8SUoD+KEo+ItRdPvRLPqb6bYnCJEYUBKYEphukKKS9xoc8fgc44KwHICZt921s74POI8LgE+IMyVE2HwHMoL3HZKvzyXT6g2F1iRP4gJTDFMDcLvW4R1eUe+IMyFFWHRxInnWRVDJjCj+M4eN3G480YQyii2jYmO1wzhdyMhygsSAlMMeI3Cz0uATzvhqbpBakAcmEdpyv8zGuf7AkaMfYCh1m1JTmx0O1wzVDYJ5EupASmGMUS7pcr6zgd4WdeW9UYghEFAAeowKm+UE4sdDtcM8XyHBD2Q0pgClIMG7m5so7TEX7mtYdCUXAcD4BB1RgGAzIkkceW3e1ZvX92uWaK4Tkg7IeUAJFXTDfM4eMDkEQB5SVGDgOQPes43a5naqwstlkZlQHQdIaT
PUG0tfdkTYCSa4YoZPJeRZRwLvGVRSWRhxrLYQhFFAD527g0SxyLAg8trjQ2xxn/iVku95uqmihZ6UQhQCsBIm/Eu4AqSt3o80fAGOAPGjkM+bKOTcvc6xYRkTXrdaNEHlBeImXdX0+uGaJQISVA5I34DVIzhHUwEIWialY/5HwIyvh9A6NUNgMYIEmClV1cWeLK+TgIohAgJUDkjZEbpF63CJ7nUFniQtNV5+Z1LKZlHh+lRP56womQEiAmTbrx/oW4QUqhlITTISVATIpM4v0LVeCSv55wMqQEiEmRabz/VBW4+a4NRBDZgpQAMSmoUFnhdzIjiLGgPAFiUpgx9/E4rVBZ/GqI4zi4JQFClnMNCCJX0EqAmBSFuNmbb9JdDZHLiChESAkQk6JQN3vzSTq1gZzoMiKlVxyQEiAmzVTd7E2XdFZDTuv05USlV6zkXQm8+eab2LBhAxRFQWVlJX784x9j1qxZ+R4GQWSNdFZDTttAT6X0trxyhFYHBUbelcB3v/td/OIXv8CCBQuwZcsW3HXXXdi4cWO+h0EQWWW81ZDTOn0lU3qqquG0P4ppDLQ6KCDyGh0kyzK+/e1vY8GCBQCAs846C52dnfkcAkHYwvLFs6FpOqKKBsYYooqWcgO9rb0Ht218DU0b96B58360tffYMOLJkSxqzB9UIFIUVcHBMcbY+G/LPrqu4/rrr8enP/1p/Pu//7sdQyCIvLLvYBdadh/B6b4QplX7sPrCeVh0dv2o9/yypQ2iaAjJqKJBVRmuXb1w1HsLmWTzONkTwrQqL0q9kvU+xhgCYRW/+v4lNo7W2eRMCWzbtg0bNmxIeG3u3Ll47LHHIMsybrnlFgwODuKhhx6CJEkpzjKa3t4AdN0WvZV36urK0N09ZPcwbMGpc2/evB8DQRmlXgn+oAx/UIaianBLIv7vlZ8oKrfJyOigQFiBqrMEl1hU0ZIWEHTq92+SzfnzPIeamtKUf8/ZnsCKFSuwYsWKUa8Hg0Fcf/31qKysxMaNGzNSAAQx1TF96cGIij5/BAAHnuMQUbSi85+P3CcxI4bSzSmhENP8YMvG8Jlnnokf/vCH4HlKWCaKj1wKJ3MDeTAgw1AAgM4ASeQt/3mxCsJMckr2HeyyNcTUSQoor0rgvffew65duzBv3jxcccUVAIBp06bh4YcfzucwCGLC5Dr+3cw5kBXNUgAMQLlPmhIhpenmlLTsPmJbXoXTchzyqgQ+8YlP4P3338/nJQkiq+Q66cs8x8MvHEQ4qkISeZT7JPg8EqKKNmVDSkfS1ReCR0r0FORLCTotsY/8MQSRAT2DEbjE3AqnhQ21+M+rP4eacjeqytzwusUxQ0qnIvXVPtsKE+bjOy4kSAkQRAbkq2rqorPrcfUl81FZ4kIooqKyxIWrL5k/JS3RZKy+cF7aeRXZxmmVcal2EEFkQD6rpjq5JpOpBO3YnHVaZVxSAgSRAVQ1NX/YpQSd9h2TEiCIDHGyhe4UnPQd054AQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTiYvCuBffv2YfXq1bjssstw3XXXYXBwMN9DIAiCIGLkXQnceuutaG5uxvPPP4958+bhkUceyfcQCIIgiBh57yz20ksvQZIkKIqCrq4unHXWWfkeAkEQBNraexzTQnIsOMYYy/dF33//fXzzm9+EKIp4+umnMWPGjHwPgSAIB7PvYBd+2dIGUeTglgREFQ2qynDt6oVYdHa93cPLKzlTAtu2bcOGDRsSXps7dy4ee+wx6/ipp57C1q1b8dRTT6V93t7eAHQ973rLFurqytDdPWT3MGzByXMHaP65nn/z5v0YCMpwS4L1WlTRUFniQtNV5+bsuumSzfnzPIeamtKUf8+ZO2jFihVYsWJFwmvRaBR/+tOf
8M///M8AgJUrV+Kee+7J1RAIwrGQq2NsegYj8HkSxZ9L5NEzGLFpRPaR141hURTxwx/+EAcOHABgrBbOPdd+rUsQU4m29h48ufMwBoIyfB4RA0EZT+48jLb2HruHVjDUVnggq3rCa7Kqo7bCY9OI7COvG8OCIODee+/FHXfcAU3TUF9fjx/96Ef5HAJBTHm2t3ZAEHjL1eGWBERjr9NqwGD54tl4cudhRGGsAGRVh6bpWL54tt1Dyzt5jw5atGgRWlpa8n1ZgnAM5OoYH1MZksvMBiVAEERuqa3wjNr0dKqrYywWNtQ6UuiPhMpGEMQUY/ni2dA0HVFFA2MMUUVzrKuDGB9aCRDEFINcHUQmkBIgiCkIuTqIdCF3EEEQhIMhJUAQBOFgSAkQBEE4GFICBEEQDqboNoZ5nrN7CHnFafONx8lzB2j+NP/szH+889hSSpogCIIoDMgdRBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIEQRAOhpQAQRCEgyElQBAE4WBICRAEQTgYUgIFRCAQwJe+9CWcOHECALBnzx5cdtllWLZsGe69916bR5dbfv7zn6OxsRGNjY1obm4G4Jz533///bj00kvR2NiITZs2AXDO3OO55557cMsttwBw1vzXrVuHxsZGrFq1CqtWrcI777yT3/kzoiB4++232Ze+9CX2yU9+kh0/fpyFw2F2wQUXsI6ODqYoCrvmmmvY7t277R5mTnjttdfYV77yFRaNRpksy+zrX/86e/755x0x/9bWVrZ27VqmKAoLh8PsoosuYgcPHnTE3OPZs2cPW7x4Mfve977nqGdf13X2hS98gSmKYr2W7/nTSqBAeOaZZ/CDH/wA06ZNAwC0tbXhzDPPxMc+9jGIoojLLrsM27dvt3mUuaGurg633HILXC4XJElCQ0MDjh496oj5f/7zn8fjjz8OURTR29sLTdPg9/sdMXeTgYEB3HvvvbjuuusAOOvZ//DDDwEA11xzDVauXIknnngi7/MnJVAg/OhHP8KiRYus49OnT6Ours46njZtGrq6uuwYWs75h3/4B3z2s58FABw9ehTbtm0Dx3GOmb8kSXjggQfQ2NiIJUuWOOq7B4A77rgDN998M8rLywE469n3+/1YsmQJHnzwQTz22GN46qmncPLkybzOn5RAgaLrOjhuuA44YyzheCrywQcf4JprrkFTUxM+9rGPOWr+N910E/bu3YvOzk4cPXrUMXP/3e9+hxkzZmDJkiXWa0569s855xw0NzejrKwM1dXVWLNmDR544IG8zr/omso4henTp6O7u9s67u7utlxFU5E333wTN910E2677TY0NjbijTfecMT829vbIcsyzj77bHi9Xixbtgzbt2+HIAjWe6bq3AHgpZdeQnd3N1atWoXBwUGEQiF89NFHjpn/vn37oCiKpQQZY5g1a1Zen31aCRQon/nMZ/D3v/8dx44dg6ZpeOGFF/DFL37R7mHlhM7OTtxwww34yU9+gsbGRgDOmf+JEydw++23Q5ZlyLKMXbt2Ye3atY6YOwBs2rQJL7zwAv7whz/gpptuwsUXX4xf/epXjpn/0NAQmpubEY1GEQgE8Oyzz+I//uM/8jp/WgkUKG63G3fffTduvPFGRKNRXHDBBVi+fLndw8oJjzzyCKLRKO6++27rtbVr1zpi/hdccAHa2tpw+eWXQxAELFu2DI2Njaiurp7yc0+Fk579iy66CO+88w4uv/xy6LqOq666Cuecc05e50/tJQmCIBwMuYMIgiAcDCkBgiAIB0NKgCAIwsGQEiAIgnAwpAQIgiAcDCkBwpFcc8016Ovrm/R7Wltb8aUvfWnc65111llJz7Vr1y7cddddAIxqktu3b8eJEydwzjnnjHtOgsgGlCdAOJLXXnstK++ZLEuXLsXSpUtzfh2CSAWtBAjHceuttwIAvvGNb+CNN97AunXrcNlll2HlypXYunXrqPd0dnbilVde
wdq1a7F69WpceOGFuO+++zK+7n333YcrrrgCq1atwiuvvAIAaGlpwbXXXpuVeRHERKCVAOE4NmzYgJaWFvz617/Gv/7rv6KpqQnLli1DV1cXrrzySpx55pkJ76mqqkJTUxPuvvtuzJkzB11dXbjooovw9a9/PaPrnnHGGbjzzjtx+PBhrFu3Dtu2bcvRDAkifUgJEI6lvb0d0WgUy5YtAwDU19dj2bJl+Mtf/pLgk+c4Dg899BB2796NF154Ae3t7WCMIRwOZ3S9r371qwCA+fPno6GhAW+99Vb2JkMQE4TcQYRj4ThuVIlexhhUVU14LRQK4YorrsC7776LT3ziE2hqaoIoisi04grPD//cdF2HKJINRtgPKQHCkQiCgFmzZkEURezYsQMA0NXVhT/+8Y84//zzrfeoqopjx44hEAhg/fr1uPjii9Ha2gpZlqHrekbXfPbZZwEA7777Ljo6OvCZz3wmu5MiiAlApgjhSJYvX45/+7d/wy9+8Qvcdddd+NnPfgZN03DDDTfgvPPOs96zbt063H///bjwwguxYsUKuFwuzJ8/H/PmzcOxY8fgcrnSvubx48dx+eWXg+M4/PSnP0VlZWWOZkcQ6UNVRAmCIBwMrQQIIgv86le/wvPPP5/0b9/61rewcuXKPI+IINKDVgIEQRAOhjaGCYIgHAwpAYIgCAdDSoAgCMLBkBIgCIJwMKQECIIgHMz/D7RrkTIqFAnbAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.residplot(data=pandas_tips, x=\"total_bill\", y=\"tip\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaEAAAGkCAYAAACYZZpxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACHGElEQVR4nOzddXxcx7nw8d8cWl6tmM3sGGKmxGFmTpqUb/EWb2+5TUopv7ntLd5yUwo3zA6ZYrZjZpQsptXigfePtWUrK9uSLGkF8/180lqj3XOelVb7nJkz84xwHMdBkiRJkjJAyXQAkiRJ0tAlk5AkSZKUMTIJSZIkSRkjk5AkSZKUMTIJSZIkSRkjk5AkSZKUMVqmA+iKmpoWsrO9NDREMh3KKfX3+EDG2FNkjD1jqMSYnx/ooWgGlwHXE9I0NdMhnFZ/jw9kjD1FxtgzZIxD24DqCUn9n5OIYh3diVWzH7vpKE60GWwTVB3hyULJLkHNH4laOBqhuTIdriRJGSaTkHTWHNvE3LeW5PY3sap2o2SXoGQVIrzZKNmloKhgmzixMHbtAczdK7Gbq1FLJqCPPx9t+DSEIt+KkjQUyb98qdscK0li6+skNz6D8Gajlp2DPuUyhGac+bmJGFbVThLr/k182YPoU6/EmHRhp54rSdLgIZOQ1C3mgQ3Elv0N4QthzLgBJVTUpecLw41WPhWtfCp2YyXm7pW0bnoe15xb0cYu6KWoJUnqb2QSkrrEiYWJLf0LVvUe9MmXoOaPPOtjKqFijFk3YjUcIbHhWRLbXiNxw6eArLMPWJKkfm3AzY6TMses3EHro9/AAVznvb9HEtDJ1OxSjIXvQc0bScVfvkZ843M4jt2j55AkqX+RPSHpjBzHIbH5JRLrnsaYdgVqweheO5cQCtrIGYTGTKJu6RNYBzfhvvhjKN5Qr51TkqTMkT0h6bQc2yL21p9JbnkV18L39GoCOpkWyMGYezsikEfksW9iVmzrk/NKktS3ZE9IOiXHTBB95Zc4sTCu+Xch9L5d1yMUBX3cQpTsUmKv/Ap92pUYU69ECNGncUiS1HtkT0jqkJOIEnnuJ2BZGLNu7PMEdDI1fwTGwrtJ7niL2JLf4JiJjMUiSVLPkklISuMkokSe/THC5UOffhVCyXzJEsUTxDX/TpxYmMhT92NHGjMdkiRJPUAmIakdJxkn8txPEN4Q+jmX9quhL6Hq6NOvRsktJ/LEt7DqDmY6JEmSzpJMQlIbx0oSffF/Uj2gcy7pVwnoOCEE+tgFaOPPI/LMjzAPbsp0SJIknQWZhCQAHNsm+upvwHHQp17eLxPQybSSiRgzryf2+u9IbF2S6XAkSeommYQkHMchvvzvOOFa9OlXI8TAeFuoOWUY8+8kseFZYisfkgtbJWkAGhifNlKvSmx6AfPwOxgzb0SoA2vWvuLLxrXgPViHNxN75Vdy5pwkDTAyCQ1xyf1rSW56HmP2zRmdhn02hOHBmHsbTiJC5JkfYEebMx2SJEmdNLAue6UeZdUeIPbGH3HNvgXFE+zUc0zbobLR5GiTSXPMJmk5GKog6FEoDGqUhDRUpe/vJwlVQ59+DeaOt4g88W08V30eNVTS53
FIktQ1MgkNUXakieiL/4Mx+ZIzbsOQtBzWHYjx9r4Iu6qSBNwKuX4Vn6GgKqnE1Bp3qG+1aI7ZjC3QmTncw4zhbrxG33W2hRDoE85H+LKJPHk/nos/jlY2uc/OL0lS18kkNAQ5ZpLoSz9DLZ2EWjLhlI+LJmxe2dbKku2tFAQ0JhQZnD/Wi+c0iSWatDlQm2Tl3igPr2lm1gg3V0z2UxDsu7eaVj4F4c0ituTXGDNvxJh8cZ+dW5KkrpFJaAiqfeF3CEVHG7uww+/bjsOyXRGeWB9meK7GbbOC5Pg6VzXBoytMKHYxodhFa9xm4+EY9z9Xy7QyNzecGyC7k8c5W2ruMMT8u0iseQKr7gDuhe8dcJMuJGkokBMThpjE1iVED2xGn95xIdDasMlPXqzj1e0RbjjXzxXn+DudgN7N51JYMNrLBxZm4QD3PV3DMxtbSFrOWb6KzlF82bgWvgen8SiRp76H3drQJ+eVJKnzZBIaQqyq3cRXP0b2+bcjtPSZcGv2R/neM7WUhjRunx2gsIeG0Ny6wsIxHt4zN8jWyjjfeqqG3dV9M5VaaC70mTeg5JQTefxezCNb++S8kiR1jhyfGCLsSCPRl3+BMfUKtGAuNEbavmfZDo+uaWbdwRg3zui55PNuWR6V66cH2FmV4NevNTBnpJsbZwQxtN6dTZcq9TMfJbuY2Ku/RpuwGNesG/tFYVZJGupkT2gIcGyT6Mu/QC2bjFo4pt33ogmbn79az57aJHfNDfZaAjrZuEKDe+YHOdJo8p1najhYl+z1cwKoeSNwLXov1pGtRJ78HnZzdZ+cV5KkU5NJaAiIr/gXQNpEhMaIxQ9fqMOlCW6c7set993bwWMoXDXFx8zhbh54uY4XNoexnd6/VyTcfow5t6Dmj6T18fuIb35ZlvuRpAySw3GDXHL3Csz963AtvKfdRISqZpMHXq5jcomL2SPcGSlYKoRgYrGLkpDGC5tb2VIR50OLQoS8vTtMJoRAGzULpWAkyU0vYu5eifv8D6LmlPbqeSVJSid7QoOYVXeQ2LK/Ycy8HmG429oP1Mb58Qt1zBzuZs5IT8YrZmd5VG6dGSDXp/Ltp2vZcCjWJ+dV/LkY8+9ELRhN5On7iS3/B068tU/OLUlSiuwJDVJOLEz0xZ+jT7oIJVjQ1r6/NsH/Lmng/HEeJhT1n1pxiiKYP9rDsByNf7zdxI5qkxumenH18hChEAJtxLmoxeNI7lhK60NfRj/3WoxJFyJUvVfPLUmS7AkNSo5tEX3ll6gFo9BKJ7W1761J8PNX67lqela/SkAnK83WuXtukLqwybefrmVPX03ldvkwpl6OMecWzL2raP3Xl0hsex3H6ptJE5I0VMkkNAjF334IJxlDm7C4rW1PdYL/XVLPpZN8TCjxZDC6M3PpCtfPzGb+aA+/fK2Bh1Y3ETf7aIFrsADX7JvRp11JcsdbtP7zC8Q3PCeH6SSpl8gkNMgkdizF3Lsa49xrEUrq17u7OsEvXqvn8sk+RuUbGY6w88Yem8pd0Why35M1bDkS77NzqzlluObcgjHzRqyKLYT/+QVib/0Fq/5wn8UgSUOBvCc0iFhVu4mv/CeuebcjjFRvZ1dVgl+9Xs8Vk/2MyBt49zi8hsJVU/zsq0nw1xWNDMvVuXVWkIJA37x1laxCjOnX4MRaMA9uIvrMDxHBfIyJF2JnXdgnMUjSYCaT0CBht9QSfennGFOvRAnkA7DjaJzfvN7AlVP8DM8deAnoZCPzDcpzdNYejHH/s7XMGenh6ql+sjx9U/VAuAPo4xaijZmPXb2H5I63OLDyn6jDpqOPPw+1ePyA2RZdkvoTmYQGASfeSvS5n6CNmoNaOBqArRVxfvdmA1dN9TMsZ2AnoOM0VTB3pIcpJS7e3hfjm/+uYd4oD5dN9pHr75u3slAU1KKxqEVjCbhs6retJfbWn8FMoI1ZgD5uAWq2XG8kSZ0lk9AA55gJoi/+DJFThjZyJgAbDsb4y/JGrp
nmpyx7cCSgk3ldChdO8DJ7hJt1B2N8++laxhUaXDDex8QSA6WP1j2pHj/6qNnoo2ZjN1djHdmaGq7zZKGNW4g+Zj6KN6tPYpGkgUomoQHMsW2iS34Lioo+KXV/YvnuCI+ubeaGcwMUZQ3uX6/frXD+OC/zRnnYVhnnoTVNRBMOc0a4mT3Sw/Bcvc8W4irBApRgAdqE87FrD2Id2UJi7b9RC8agj1+ENuLcDiuXS9JQN7g/pQYxx3GIvflHnNZ6jFk3AYIXNod5ZVsrt8wMkusfOhWiDU0wrdzNtHI3tS0mO6oS/PaNRkzbYUqpiyllbsYXGX2y1bgQCmr+CNT8ETiTE1hHd5HY/DKxt/6CNuJc9HGLUIsntM1clKShTiahAchxHOLLHsSu2Y8x91ZsofLQqma2VMS5fXaAoHvoJKB3ywto5AU0Fo6BulaLfTUJXtwc5g9LTYqDGhOKDcYXGYwpMHq9YKvQDLSyyWhlk3FiYcyKbcSWPQiJCNro+antJXKHZbxskiRlkkxCA4zj2MSXPoh1dCfGnFuJ2hr/91o9kaTDbbMCfVoJu7/L9ank+jzMGgGm5VDRZHKoPsm/17dwtMmkIKgxvtBgfJGLsYUGPlfv/eyE+6T7Ry01WEe2E33xf0DV0UbPQR81ByWnXCYkaciRSWgAcWyT2Bt/xK4/jDHnVo6EFX79Wi3lOTqXT/agKPID7FQ0VTAsR2+bKWjaDkebTA43mDx/rKeU61eZUGQwqcTFuMLe6ykpgXyUCflo4xfhNB3FqtxB9IUHQChow89FG56qZSdr10lDgUxCA4STiBJ95Zc4yRj6rJt4c6/JE+tbWDzOy6QSecO7qzRFUJatt80etGyHqmaTg/Umz2wMU9FkUhrSmFziYnKJixF5OmoPJ3khBCJUjBIqRpuwGKelpm3Bsd1cg1o4BrV0ElrxeJS8EQhV/rlKg498Vw8AdksN0Rf+BxHIJzLxWh58M0xti8lts4bWBITepCqCkpBOSSiVlJKWw5FGk4N1SVbvj9IctRlbaDCl1MXEEhf5PfxzF0Igjs2wY+wCnEQUu+4gVs1ekjvewgnXo+SUohaMQs0fiZI7DCVULHtL0oAnk1A/Zx7cSOz136OMmsvS+HiefqaeqeVuLpkY7PErc+kEXRWMyNUZcazSRGvc5kBdkg2HYjy1IYymwrThPkblKowtcPX4xYAwPKjF41GLxwPgmHHsxqPYTUdJ7l6BvfZJnEgDwp+Lkl2Kkl2Gml2CEipGCRXJ6eDSgCGTUD/lJOPE336Y5N417Cq9jsfecaGpUW6ZFSCvj6oDSCf4XAqTSlxMKnHhOA51rRbVrYJlu6I8tKoZXRWMyjcYXaAzPDc1zNeTU8KF5kLNG46aN7ytzbFMnNZ67JZanHAtyapd2OE6nNZ6hDuIklVEbekIEu4ClOwS1FAJwu3vsZgkqSfIT7N+yDy4kchbf2WbOoGX4jfRsg0WjHYzpqDvFl9KpyaEIM+vMaLIzaRCFcdxaIjYVDaa7K5OsHx3lJoWE59LoSSkUZylUZilke/XyA+oZHtVNPXsf49C1U4M4Z3EcWycSBNOuA7HbMY8sB5nyyvYLTUI1Uj1lLJLUbPLUEJFqWE9X7asfSdlhExC/YhVu5+apU/wdqXBssSVaLrOzBFuxhX2XSkaqeuEEOT4VHJ8KpNJDYPZjkNTxKau1aKu1WLT4TjNkQhNUZuWmI3PpZDtVcn2KW3PTX2d+neWR+n2cKsQCsKXDb5s/CEvZmMESK0vI9aS6i211GEe2YKzc2kqWSWjqaG9QD4ikIfiz0XxhhCeLITbn+pBGZ7UMJ8qL4akniOTUIY5jk319k2sX72BdbUeDltTGFPo5pJxLkpCmvxjH6AUIcj2pZLKmHd9z7YdwnGblngqIYVjNvtqkryTiBOO2TRHbSIJm6AnlaDy/Cr5AZW8QKo3ledXyfIqXb4wEUKAJ4jqCUL+yHbfc8xEqvcUacSJNmPXH8Gq3I
GTiOAkYpCM4SRjYMbBtkFRQdVAKCCU1LEVBRCpNkUBRUMoGmg6QjNAdyMM77GkFkB4gijeLOJmKXbSnWqT7/chRyahPmbbDpV1Yfbs3M/O3YfYUW0RdXRGZBUycUI2V+W70HtgqEbqvxRFEPSoBE+zDYVlO7QcS0hNMZvasMW+2iRNUZvGiE0saZPlSfWacv0qOb5UzyroUQm6FfxuBa8h8Fud25FWaAYimA/B/DM+1nFssK3Uf46dSko44Dip3pbjpNodCywLxzbBSoKZxDFjOIkYTmMlTvUenHgr1asjJMONYCUQvhyUQB5KsBAlq/DYcGMhSjA/lcikQUcmoV5g2w5NrQnqm2PUNsWorm2i4mgdFbURqsIOXmIUGq0UBTWuOCebgvwQiqwlJp1EVQQhr0rI23GiSlrHklTMpjlq0dBqcaTBJJKwiSQdogmbaMIhblajiNRsP00VaErq2KoCiqCt5yEAIU60KYK2x+pq6j9DE7h1gUcXeAwFjyHw6gpeQ8fnEnhdCr5j7V3ppYVCXhobI6neWLQp1SNrbcSs2o2zby1OpAGntRHh8qWGCoMFiGABaiAvNYToy0nd09LljMCBaEAloeMVAbpaGSAcTZJMWhy/JnRIjY87zvF/g2M72I6DZaeu5izbwbJsTBvMSDPxRAIzaZMwLZJJi1jSIhY3U3/scYvWhE0k4dCaFCQsUIVDQE3gI4JfRMlz2YwpVMkZ58UI5CGMYT36s+kK23b6/U1oGePpGRrk+lVyzzDZze3RCUcSWFaqSoR1rKNiO6m/AfvYH4XT9j/HvwcWDo4Nlk3q78BOlT9KWA6tUUi2OCQsk4TpEDchZjrEkw4OqWTm1gQe46TEpQtcxxKZoQlcaqr4bKDWxkya6IpAVUJoWjZKNqg5AlUAAhRAmBGIhxHxME5zK6JuJ068FScRwW82ogsLXH4Utw9cPjB8CMOTGgLU3QjdANVI/b9iINTjQ4oqQlFTQ4xCtA0xpv6d+jrZFINI5FgwSupemRw67BHCcZzO9dclSZIkqYf170tNSZIkaVCTSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKmAFVOw7ANC0aGiKZDuOUsrO9/To+kDH2FBljzxgqMebnB7r0+Lq6MLY9OKqqne61D7iekKaduvx9f9Df4wMZY0+RMfYMGePQ1qtJKBwOc80113D48GEAHnroIa655hquvfZavvKVr5BIJHrz9JIkSVI/12tJaOPGjdx5553s378fgH379vGHP/yBf/3rXzz11FPYts0//vGP3jq9JEmSNAD0WhJ6+OGHuffeeykoKADAMAzuvfde/H4/QgjGjRtHRUVFb51ekiRJGgB6fT+hiy66iL/+9a+UlZW1tdXX13PLLbfw/e9/n7lz557V8W3bpra2lvr6BizLOttwBz2Px015eTm6rmc6FEmSpL6fHVdVVcWHP/xhbr755m4noJqalrZ/19dXI4QgKysfVdUyvtuhpimYpp3RGE7FcRxaW5s5dOgQgUB+psM5rfz8QLvfc38kY+wZQyVGOTuuY306O27Pnj3ccccd3HjjjXzyk5/skWMmEjFCoVw0Tc94AurvhBD4fEGi0VimQ5EkSQL6sCcUDof50Ic+xGc/+1luuOGGHjyygxADbqZ5xshELUlSf9Jnn96PPvootbW1/OlPf+L666/n+uuv52c/+1lfnV6SJEnqh3q9J7RkyRIA3v/+9/P+97+/t08nSZIkDSByHEuSJEnKmAFXO66/iEQi3H//tzh8+BCKIhg/fiL//d9f5a233uKPf/w9ppnE7XbzyU9+lnPOmcr993+LaDTKd77zA/bu3cOnP/0xfvGL/2PEiJGZfimSJEkZI5NQN7355mtEIhH+/Od/YFkWP/nJ9zly5DC/+c0v+fnPf0NWVoi9e/fwuc99gn/969987nNf5IMffA
/PP/8M//jHX/n0pz8vE5AkSUOeTELdNHXqdP7v/37Ff/7nR5g9ey633nonq1e/TW1tLZ/5zCfaHieEwuHDhxg7dhzf+tb9fOQj7+fyy6/issuuzGD0kiRJ/YNMQt1UUlLKv/71BOvXr2Xt2tV87nOf4J57PsCsWbP51re+3/a4qqqj5OWlFoYePHiArKwsdu3aQTKZlFULJEka8uTEhG564olHuf/+bzFnzjw+8YlPM2fOfJqbm1m1aiUHDuwHYMWKpbzvfXcSj8eprKzgZz/7KQ888EuGDRvBr3/988y+AEmSpH5A9oS66Yorrmb9+rXcffetuFxuCguLuOWWOxg9ejT33vtVHMdBVVV++MP/h2EY3Hff17jzznsYNWoMn//8l3jf++5g1qy5LFiwKNMvRZIkKWN6vYBpbzi5htPRowcoKhqewWja68+1446rrj5EQUF5psM4raFST6y3yRh7hqwdd3b6Te04SZIkSTqZTEKSJElSxsgkJEmSJGWMTEKSJElSxsgkJEmSJGWMTEKSJElSxsh1Qr2gsrKCO++8iREjRrVr/+EP/x+FhUU9fq5PfeqjPPro0z16XEmSpL4gk1AvycvL589//kemw5AkSerXhmwSWrHlKI+/sYe65ji5QRc3LR7N/Mk920t5t/r6On784/upqqpCURQ++tFPMnv2XP7wh99SVXWUQ4cO0tjYwHvf+0HWrl3N1q2bGTMmVfjUsix++tPUNhD19fWMGTOG++77XqeOL0mS1F8NySS0YstR/vL8dhLHKhvUNcf5y/PbAXosEdXW1vD+99/V9vVll13Bjh3buPrq61i0aDG1tbV84hMfaust7d27h9/85o+8885GPvOZj/OXv/yL8vJh3H33rezevYvW1jCapvPb3/4J27b59Kc/xooVyxg/fmLbOX72s590eHyv19cjr0mSJKmnDckk9Pgbe9oS0HEJ0+bxN/b0WBLqaDju6qsv5sCBA/z+978FwDRNjhw5DMDs2XPRNI2iomJyc/MYOXJU23FaWpqZMWMWwWAWjz32MAcP7ufw4UNEo9F2x1+zZlWHxx87dnyPvCZJkqSeNiSTUF1zvEvtPcWybH7+818TDGYBUFtbS3Z2Nm+++TqaduJXoapq2nOXLn2D3//+t9x66x1cddV1NDY28u6yf6c6viRJUn81JKdo5wZdXWrvKTNnzuLxxx8BYN++vbz3vbcTj8c69dw1a1Zx0UWXcPXV1+H3+1m/fi22bfXY8SVJkjJhSPaEblo8ut09IQBDU7hp8ehePe/nPvdFfvSj7/G+992B4zh84xvf7vT9mmuvvZFvfetrvPLKi2iazpQpU6moqGDmzJ45viRJUiYM2a0cemt2nNzKoWcMlfL+vU3G2DPkVg5n53SvfUj2hCA1C663p2RLkiR1l2PbgMh0GL1uSN4TkiRJ6vfsZKYj6BMyCUmSJPVDjimTkCRJkpQplkxCkiRJUqaYiUxH0CdkEpIkSeqPLJmEJEmSpAxxkjIJSd1UWVnBokWz+NGP2le53rVrB4sWzeK55069988tt1xLZWVFb4coSVJ/J2fHSWcjKyuLt99egWWdKK3z6qsvEwrJWm6SJJ3ZUOkJDdnFqoldy0msfgwnXIfw52LMvhlj7IIeO77H42Xs2HFs3LieGTNmAbBq1UpmzZoDwGOPPcQLLzxHLBZF13Xuu+97DBs2ou35lmXxq1/9jPXr12JZNldddQ233/6eHotPkqR+Ts6OG7wSu5YTf+vPOOE6AJxwHfG3/kxi1/IePc+FF17Ka6+9CsC2bVsYM2Ysuq7T2trKm2++wS9+8VsefPBhFiw4j8cee7jdc59++gkA/vjHv/O73/2Ft956g40b1/dofJIk9V/OEElCQ7InlFj9WPr0RzNBYvVjPdobWrTofH73u19j2zavvvoyF110Ka+++hI+n4/77vsur7zyEocOHeTtt5en7fmzZs0qdu3aydq1awCIRiPs2bObadPO7b
H4JEnqx4bI7LghmYSO94A6295dXq+XMWPGsmnTBtatW83HPvafvPrqS1RXV/HRj36Am2++jXnzFpCTk8uuXTvaPdeybD7xiU+zePFFADQ2NuLxeHo0PkmS+i/HNDMdQp/o1eG4cDjMNddcw+HDqd1Dly9fzrXXXstll13GAw880JunPi3hz+1S+9m46KJL+M1vfsH48ZPaNq5zu92UlZVz++3vYeLESbz55msd7g301FP/xjRNIpEIn/jEh9iy5Z0ej0+S+gudJIYTR4jBX7SzMxx7aPSEei0Jbdy4kTvvvJP9+/cDEIvF+OpXv8qvfvUrnnvuOTZv3swbb7zRW6c/LWP2zaAZ7Rs1I9XewxYuPJ9du3Zw8cWXtrXpuo5t29x996188IN3M3z4CCoq2k/LvuGGWygvL+cDH7iLD3/4Hq666tq2CQ6SNJg4VhJX/U7CT/+Qpke+ibLnDXSrNdNhZd4Q6Qn12nDcww8/zL333ssXv/hFADZt2sTw4cMpL0/tY3PttdfywgsvsHjx4t4K4ZSO3/fprdlxxcUlPPpoai2Q1+vl1VeXtX3va1+7D4Cbb769w+cefx7AZz/73z0SjyT1Z/GKPdQ++n0gtXdO4yt/JHQpiFGL07awH1JsmYTOyve+136hZnV1Nfn5+W1fFxQUUFVV1VunPyNj7IIenYQgSVLXCQHRQ1s5noCOC695jqyRc0ngykxg/YCcHdfDbNtuN9brOE63x35P3qWvulpB0/rXTPP+Fk9HurrLYybIGHtGf4+x+UD6FvSqN4g/y49quDMQUcf6+ufo0hXy+vnvrif0WRIqKiqipqam7euamhoKCgq6dayTt9m1bbtfbac9ELb3BobEdsq9TcbYM7KGTULxBrEjzcdaBMEFt1LflAT6R28gE9t7x6Pxfv+766x+sb33tGnT2LdvHwcOHKCsrIxnnnmGm2/uiYkAAsexEaL/9z76gyE9xi71S0Z+Obm3fhPz6A7seASjZDyJQNm7R+iGHsc682MGgT5LQi6Xix/84Ad86lOfIh6Ps3jxYq644oqzPq5huGlsrCUQyEZVNTm98zQcx6G1tRmPp/8McUgSQMyVB8PzEAJiDjIBAQyRC8ZeT0JLlixp+/f8+fN56qmnevT42dn5hMNN1NdXpa21yQRFUbDt/jscp2kGY8aMpLExlulQJCnNEPnc7RTH6b+fIz1pwFdMEEIQCIQIBEKZDgUYGGPwuq4DMglJUr/Wjy9me5K8kSJJktQvDY1uoUxCkiRJ/dEQGZuUSUiSJEnKGJmEJEmS+qOh0RGSSUiSJEnKHJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJKk/GiIVyGQSkiRJkjJGJiFJkiQpY2QSkiRJ6o+GyI4AMglJkiRJGSOTkCRJvUZRBC47jMsOoyhD48q+xwyRjToH/FYOkiT1T5odw9n7NvXLHgUcggtuQR85l6TiyXRoA8JQ2S16aLxKSZL6nFK9g8ZX/oQdbcGOhml89c+Iqu2ZDmvgkElIkiSpezRNIbL1rbT2yOY30DT5sdMpQ2T0Ur4bJEnqcbbtoOWWpLVruaXY9hApD322hJrpCPqETEKSJPU423Zwj1uA4va1tSkuL+6Ji2QS6ixlaCQhOTFBkqReEfcWkXP7t7DrDgKg5g4j7s4bMvvknDVlaPQRZBKSJKlXOA7EXXlQkgdAEmQC6gIxRHpCQyPVSpIkDTTynpAkSZKUMapMQpIkSVKmKEPjbolMQpIkSf2QkElIkiRJyhg5MUGSJEnKFDFEpmgPjVcpSZI00MiekCRJkpQxsoCpJEmSlDEyCUmSJEkZI7f3liRJkjJFDJG9HGQSkiRJ6o9kT0iSJEnKGGdoVHuVSUiSJKkfcoZIyXGZhCRJkvoj2850BH0iI0noySef5Oqrr+bqq6/mhz/8YSZCkCRJ6udkEuoV0WiU733vezz44IM8+eSTrFmzhuXLl/d1GJIkSf2bZWU6gj7R50nIsi
xs2yYajWKaJqZp4nK5+joMSZKkfs2xh0YS6vNa4X6/n8985jNceeWVeDweZs+ezYwZM/o6DEmSpP7NTmY6gj4hHKdv5wFu376dL3/5y/zhD38gEAjwhS98galTp/LhD3+4L8OQJEnq1xqWP0b2gpszHUav6/Oe0NKlS5k/fz65ubkA3HTTTfzjH//oUhKqqWnprfDOWn5+oF/HBzLGniJj7BlDJcb8/ECXHh9tacXs5z+Xzjrda+/ze0ITJkxg+fLlRCIRHMdhyZIlTJkypa/DkCRJ6tccM5HpEPpEn/eEFi1axNatW7npppvQdZ0pU6bwkY98pK/DkCRJ6t+S8UxH0Ccyson5Rz7yEZl4JEmSTsMxY5kOoU/IigmSJEn9UUL2hCRJGkBUTLRwJXZTNYovhJVVCnTtZrjUfwyVnpBMQpI0CAjhIPa/Te2Lv2tr8597OdZFd2cwKumsJKOZjqBPyOE4SRoEjEQDja/+pV1beP2LJGsPZSgi6Ww5iaHRE5JJSJIGg0Sswym9ViycgWCkHuHYONbgr5ogk5AkDQKONxs9r7Rdm9Bd6KGiDEUknS2he3CGwEWETEKSNAgkhIfsqz6Nq3wiAHpuCXk3fxnjXYlJGkB015BIQnJigiQNElF3Ib4rP08wGcbW3MSER86NG8BSPaHBUbbndGQSkqRBJIlOUs/OdBhSTzA8ONHmTEfR6+RwnCRJUn9kuGUSkiRJkjJD6B7sSGOmw+h1MglJkiT1Q8Lw4sgkJEmSJGWCcPlwWhsyHUavk0lIkiSpH3IMD3ZrY6bD6HUyCUmSJPVDlu7HiciekCRJkpQBCXSwLZzE4C5kKpOQJElSP5SwQHizsMN1mQ6lV8kkJEmS1A8lLAfhycKRSUiSJEnqa4mkg+IJyp6QJEmS1PfipgOeAHZTdaZD6VUyCUmS1GsUxcFltWAwNDZo60lx004Nx7XUZDqUXiULmEpSBgghMBJ1OC21CHcA01uAhZrpsHqUy2wktv55mt95DTWQS+ii95HMG4ftyGvfzkiYoHizSB7cmOlQepVMQpKUAa6mPdQ+/qNjWzgLshbeijbpEkxhZDq0HqEqDrH1zxNe/yIAZkMltY//iPy7vkPMJ/c46oyE6SC8IeyW2kyH0qvkJYkk9TGXE6Hh+d8cS0AADk3LHkYNV2Y0rp6kJcO0vvNa+0bHxqo/kpmABqCE5YDhBdvEibdmOpxeI5OQJPW1RASzg5vN9iCqE+YoOqo/J61dGJ4MRDMwJUwbIQTCl409iO8LySQkDQqKIlDVnn07q6qCEKJHjwnguPzoHWy7rQTyevxcmZIQbkIXvx848fPTi0YickdkKqQBJ2Gl/l/xhrCbB28SkveEpAFNCHBFKontWoXVUodn/ALs3FFndW8llrTYfrCJNTuqGFceYvrYfILunvtTSeAmdPknqH/qp1gt9QhVJ3Tx+zB9RT12jv4gmTeO/Lu+g9VwBGF4EbnDiWtyw/HOSloOAMKThd1cleFoeo9MQtKA5opWUfvQt9vqa0W2vEnOtZ9BlJyL43TjgAKeXr6f51ccAGD5pkqGFR7my/fMxK31XE8r5isl+/ZvQ2s9wuUj4crF7k68/ZjtKMT8ZeAvy3QoA5J17A0hfCHsxsGbhORwnDSgmVV70go8tix7GN2Jd+t4ja1JXlx5oF3bwaoWKusj3Y7xVOKKn3hgGDFj8CUg6eyZx3tC3mzspqMZjqb3yCQkDWy2ndbk2DbQvU915xTPdGSWkPqYeeytLXzZOM2Dt2qCTELSgKYVjUFo7e//BOfdSFK4u3W8kFdn8bnth48KcjwU53q7HaMkdcfx6x7hCeIkWnGSg7PqhLwnJA1ocW8Rebd/k8jGl7Fa6vBNuwS7cEL37geRmst1ywWjGV2axbJ3Kpg0IodF00rw6IOrmoHU/1nHe0JCIHy52E1VqHnDMxtUL5BJSBrQHAdivjL08z6IwbEFfmfJa6gsOqeI86cW4Thgy6E4KQPsk6
6kFH8OdmOlTEKS1F9ZPZB8TuY4DpbVo4eUpC45+f0nfNlYDRXomQun18h7QpIkSf2QdVJPSPhzsesPZzCa3iOTkCRJUj9kntQTUgJ52I2Ds+6eTEKSJEn9UNI6sfxA+HJwwnU4ZiKDEfWOjCShJUuWcNNNN3HllVfy3e9+NxMhSFKvM+wIRvMBXOHDaN1cPCsNXXHzpOE4VUvNkGuoyGBEvaPPJyYcOnSIe++9l0ceeYTc3Fze97738cYbb7B48eK+DkWSeo07UUvjsz8nWXMQAM/4eXgXvYeEKmunSZ1zchICULIKsGr3o+aPyExAvaTPe0Ivv/wyV111FUVFRei6zgMPPMC0adP6OgxJ6jSXFcao3Y5etQl3oo4zFdZWFIi9s6QtAQFEd6zEqdrZy5FKg0nCdNpK9wAowQKsmn0ZjKh39HlP6MCBA+i6zsc+9jEqKyu54IIL+OxnP9ulY+Tn9++ryf4eHwydGB3HwUnGUYzuVVBINhzl6L9/TLI6VU9OGB6K33Mv7pKxp4zRirVSsT99S2azai/50y7oVhxnY6j8rntbX8foc6tgGIQCqYnZibKRtKx/eUD8rLqiz5OQZVmsWbOGBx98EK/Xy8c//nGeeOIJbrrppk4fo6ampRcjPDv5+YF+HR8MnRjdiTpiW98gvn8j7jGzcI1fSExP32jtdLSDm9sSEICTiNKw9HHcF32M7LysDmNUFHCPOpdkXfvZTFrh6D7/uQ+V33Vv64kYu5o8fIZgb0UYrdAFgKNkkairoLqiFqG7ziqWvna6197nw3F5eXnMnz+fnJwc3G43l1xyCZs2berrMKRBzmVHaHzmZ7SseopE9QGalz9G88v/16Xq2ooiMBvTqxcnaw6gOMlTPs+2wT35QozCkW1t3knnIQrGde1FSENa0KNS1XxinrZQNZSsIqyavRmMquf1eU/owgsv5Etf+hLNzc34fD7eeustLr744r4OQxrknJYqkrUH27XFD28n0FrT6f1tbNvBVTqed1//+iafj6mcfngvZuQSuO6LiHANKBq2L5/EoFzvLvWWkEdwuKH9lGwluwSrYjtaycQMRdXz+rwnNG3aND784Q9z1113cdVVV1FSUsLNN9/c12FIg5xQTlFw9FTtp2DljCR08fsRhhuEgm/KhejjF3WqnlxCeIgHhhH3lZCUCUjqoly/xoHa9j1uJaccq2JrhiLqHRmpHXfLLbdwyy23ZOLU0iDlsiMQrgbNheXNx/YX4B45ndi+DW2P8U5chOXN69JxTeFCjLmA3OHnIhyLpBEi7sg13lLvyw+oHG5IYloOmpqakqnklpFY/xSOGUdoA+u+0KnIAqbSgOeJV1P/9AOY9ZUA+M+9DGPGDfgv/ACe8dtJVOzAVT4JUTS+W0NijgNxLevYFyfarXgEV7wGNBdJPUtW25Z6lEsV5PpV9tUmGVuY2jNLaK7UfaHKnWjlUzIcYc+QSUga0FRhE179VFsCAgivf4nckdOJ502CYXPRR84j2cNVtt2JWqoe/hOxg1tQ3H5CF38AUTodC7nvkNRzynN0Nh+JtSUhACV3OOahTYMmCXV6XKGpqYlwONybsUhSl2lWjNj+9NmVZt2RtkWlPb3NgyZMwkv/SezgFgDsWJj6Z/8XPVx5hmdKUteMzNPZcKj9jE61YBTmwfR1aAPVGZPQ3r17ufnmm5k/fz5z587l7rvvpqJi8NUvkgYmU3XjHp5+RajllHR7d9Uz0ZJhorvXprVbjTIJST2rJKTRErOobjbb2kRWIU4iit2UvnxgIDpjEvrKV77CrbfeysaNG1m/fj2XX345X/va1/oiNkk6I8tR8M25Hi2rsK3NN+1inNyRp3nW2bFVF1p2cVq78Gb12jmloUkRgjEFBqv3R9vahBCohaNJ7l+fwch6zhmTUDQa5Y477kDXdQzD4J577qG2trYvYpOkTom5Csi69V5yb7+XvPfcjzH3DpKKt9fOlxAesi/9ECgnbql6x8+DUO
fWH0lSV4wvMnh7bxTnpK69WjAGc396b3wgOuPEhFGjRrFu3TpmzJgBwM6dOykrk39sUv+SULyQ1Xu9n3eLZ4+m7EM/IlJ9GMXlww6WkFA8fXZ+aegoDWlEkw6H6k2G5aZmdyp5w7E3PIMdaUIZ4D3wMyahiooK7rnnHsaPH4+maWzdupX8/HyuvfZaAJ5++uleD1KS+hvHERgFw2kSXatFJ0ldJYRgYrHBij0RhuWmEo5QtdQEhQPrMSZekNkAz9IZk9AXvvCFvohDkiRJOoWJxS4eWdPCLbOCqEpq2qdaOBZzz9uDNwnt2bOH0aNH4/P5Ovz+5MmTey0oSRpKFAWiSRtdVTKz1bHU7+X4VIIehe2VCSaXpiolKAWjSLzzIk4sjHD7Mxxh950yCf3oRz/it7/9LbfeeivFxcXtbopFo1FWrFjRJwFKZ6aqAgHtNsCSOiYE6HYUYSWw9ABmhkvwtCYs9lY0U9MYxePSGFUapDDL3a4ygyQBjCs0eHtftC0JCc1AzRuBeWA9+vjzMhxd950yCf3whz+ksbGR0aNH8+CDD+I4DkIIkskkd999d1/GKJ2CwMHVcpDW9S9ix8L4zr0MO388piyW2SEhHIy6nTS+/EfM5hq8E+bjm3dzl/YYips2h2rCHK2LUF4UoTjkxtC6l8iEAodqwvzr5Z1U1rUCsGh6CTecN4ocn3GGZ0tDzbhCg7+tbMK0HbRjQ3JK0ViSu1cOziT0X//1XyxbtgwhBPPnz29rV1WVyy+/vE+Ck07P1XqEmoe+DXZqz5HY/k3kXvc5KJLbpXfE1XqUmsd+CI4NQGTbMhzbxH3BRzCdM5fbsRyHx9/cy8urTmwRcd15o7h+4QjOsON3h6JJm9fXHWlLQABLN1Qwa0KhTEJSmoBbIeRV2Xk0waSSVG9ILRxNcvPLOPFWhKvjWyf93Skv4f7whz+wfft2brjhBrZt29b23+bNm/npT3/alzFKHRBCkDjwTlsCOq5l9VPowjzFs4Y2q7GyLQEdF92xCi3RuR0za5vj7RIQwNNL99IQ7vxGeSezbdh5oCGtvao+0lZySJJONiJX553DsbavheZCzRuOuX9dBqM6O2ccR/j+97/fF3FI3aGk//qEokG3rsv7H8NuxdWwG6N+Jy7rRKLQSOIKH8Ko2YorXoMQp7+BIgS0xEwSHWxEp/pDJIXG/upW9lWFiSbtDo6QkkhaaW2OA4loBM2J42o5gFG7FXeiDnEsi5zqNQD4XAqTR6UPBZYV+DF7uCK3EOAyGzFqt+Nq2ofuRNO+3xhJsrOimcqGWI+cX3MSuMKHO/17ks5sRK7O1sp3bXRXNI7knrczFNHZk1W0ByjHcXANm4LQHsMxT7wpA3NvIN6JoaX+LlFfSfNTPyFZvR8ALVRA9vVfxDQCmBuepmH1MwAIVSf3pv8mkTPulLXiqprifOePb3Pp1BAXlU7GObLl2HcEWRd/gO89sovtx3okRblevnj3TEKe9PtqhUGNvJCb2sYTV6IleR7yRCOJ1c/RuuHl1FF1F3k3fwXHk0XTsz9Lew0xV2pPI8eGmy4Yw54jTRytiwBwwYwy3t58lIraVi6cVozSQ10iV2sldY99DzuaKkLsHj0D3wUfJKH6EQL2VbXy/b+uJmmmkvC1i0ZyzYIR6Er3zq85CcxNz9Lw9pOpBlUj74YvkMib0Gs1/YaCgqBKXatFa9zG50pdhA70ITk5I3QAi3lLyLv9XgKzr8E35QLybvs6Zt7YTId11oSAyK41bR/eAGZjNbHtb6FFamk5loAAHCtJwwu/QbdaOzgS2MDfX9xOa8zk36tqec1zOeGFn8R/2cfJf893Wd1S2JaAAI7WRVi2sRKlgw9fnwFfvCKXOeNC+Dw68ydm85nzXHjjNbRueOVETMk4zW/+Hevghg5fw8nHzvbqfOW9s7j7igncdfl46ppivLnhCH9/YTu1Le2veLtLExYtKx5pS0AAsT3roHZf6t+mza8e39iWgA
CeXrqPqoZo2rE6S22ppOV4AgKwTBpe/A2GKSvxnw1VERQFVfbXndhxdaAPyckkNMDFfKUoM29FX/RB4qExWIOgc6soCrEju9LaE4e3gZV+/8VqqUckO/7ATJg2e440tX39xKpavvZ0CyuiIzGzylmzoz7tOVv213V4TyaJi9IgvNf7Bt9bUM979Jfxb3oY1Rfi3XOqHTNBoqKD13BoK8q7hqWicZO/vbCdf7y4g3f2nKjL2NTNe03vplgxEpW709rNpiqEgGjcate7O67xLM5vR5vS2qxwIyQj3T6mlJIf0DhU/65tv4vGkdy9MkMRnR2ZhAYB23awrFPfyxhoLMvGN2ZGWrtn/Hww0guTGgUjsV2BDo/l1lVmTyxMay/N82GaNjMmFKR9b8GUEuxT/DiTuWPJOe9WcvLzyJ11BaEbvoStpd9rUvy5uEed2+FrsOz2GS7oNcgPta87p6mCvFDP1KKzNC/esXPS2rW8chwH/B6N0aXp9ccKsrtfBFYJ5PPue5N6XjmOO9jtY0opOT6Fisb2k4/UwjFY1Xuwo80Ziqr7ZBKS+iXPyKn4z70stZgG8E5ciDZiJgl3PjlX/yfCSH1A67klhC7/CElcHR/Icbjx/NFMGJ6derym8N4rJ1Cam/qAnTgsm8vnDuf4CNn500uYMTav3eLsk9mOQsxXhjliIf7JC4nr2Zj+ErIv/w+EnopBzx9G8Lw7oGhi+msYOTPt2G5N4XN3ntuWiPwenf+6aybZ3p5Z72XZAs+5V+Eqn5RqUDWyzrsDOzQCAE0IPnbjFErzU/cTPC6NT982jfxg96eJJ70F5F77KYQr9XPWsosJXflxEqQnbKlrsr0qR5vaJyGhGagFo0nuWZWhqLpPOKf6a+vHamo6N6U2E/LzA/06Phg4MdbXNqHF6sBxsNw5mMeGGhVFoMcbIBnB8WSTEGfuMZgONLUmMDSFoEdPSwSNkSS245DtMzo9t/Dkn6MQYCQaIBk7FlPqw1YVNlq0Dmj/GjoSt2yaWhP43Dp+l3bKRNhZLieCYbaQwCBhZKM5CZRoPULVSbhysJ32rzRpOTS2JvC4VIIeHfssZ8gpikBPNEDi9L+ngfJ+PNsY8/M77q2fyso3VhGLpQ+JNkctHlrdwk9ua9/Dt6r3YO5dg++m+84iyt5xutc+8G8gSIOW5ShYrvy0dtt2iOsh0EOdOo5GElfDPpQdy1F82bjGziHuLW43Syt0lr0Ox4G4ns27i1VYjoLlTn8NHXGpCgXBVPJSFIED2N0sxeSOVNDwzM8wG6sQupvsSz+EVTaDpKfoWMDpz9FVQX4w1Zs72wR0/BhxLQRa6KyPJZ3gdymE4zaW7bQVMwVQ8kbibHoRq+EIanZpBiPsGjkcJw16ytGt1D56P63vvE7LyieofehbuKJVmQ6rQ5btsKuymQce3shvn9rK4fpIl5d9GU6Mxhd/g9mYeo1OMkb9c7/CiFT3QsRSX1MUgddQaI62v3EpFAW17ByS217PTGDdJJOQNKjpxGle/mi7NicRw6xMn7nWH+yubOH7f1nDhl01rHinkvt+t5LKhvSZa6cj4i0kaw6+q9XBapZJaLAIuAWN0fTF0+qwaSR3LcdJ9szMyr4gk5A0qAnAsdLLGDl2+h9wximCp97a267JdmDt9uoO1y2dimN4UQPplRgUX+hsI5T6Cb9LoTGSPoVT8Wah5pSR3Lk0A1F1j0xC0qCWFC6C825s36hq6MW9t6hXF0l04l2u/yYc0LX0J2lq1w6UVH1kX/FxhHZidltg3o1Y/uKuBST1Wz6XQkOk4wspdeQsEhuf658XWh2QExOkQc1xwCmdRu51n6V1w0so/hx80y8n7ivp8T17VEy02p00L3sEJxknMOd6nLJpmB3UrOs4VofrzxvNpt11bW2aqjBjfEGXJgo4DiRyxpJ79/dRI3VYug/LVyC3+BhEAm6FupaOCxWrOWWYbj/m7hXo4xb1cWRdJ5OQNOiZiguKpuO5ej
oOgpjl9MqmcVrjAWof/1Hb1/Uv/Jqcq/8TSmd1+hjDC3zc9+G5LNtUgcelMf+cYgqzXF2ut+Y4EDdyyS8d0e+nP0tdF/K0L93zbtrYhcRXP442ei5C7d8XHzIJSUOGaUFvbVmqKIL4nrVp7eF1L+Avn0HS7tzItyIEw/J8jLx0PI7jYNuOLPgppcnxK6zce+otW9TccqxALol3XsI1/eo+jKzr5D0hacjQieMKH8LVegSdU19FdofjgOJNL0mj+rJwurG1hmXZPbJWRxqccrwqjVGL2Gm2HtEmXEBi47PYrel7VvUnMglJPS5pO1Q0RNlXFSbSwR483aU7MVzNBzDqd6XtzXMmXrMe9q0kvuU16h76FtE3/oBh9lydLcdxMEZMbytTA4Ci4p91LabdM9sxSNJxiiIoDGrsrz31xZTiz0EbNp3Y0r/2YWRdJ4fjpB4VTVr84+VdLNtUAUAo4OJr75tNrv/stqs2rDCRpX8juiNVKVj1h8i96ctE3UWnfZ4Q4G45RN3zv8JsqEQLFZJz4d00vPUw7pHbYNjcs4rrZHFvEXm330eycgckE+il44n7Bs7KdWlgKQlpbD8aZ0LxKeomAtqY+cSX/pXE7pUYY+b1YXSdJ3tCUo86WN3aloAAGlviPLJkF51d9q8oAkXt4G1Zt68tAUFqW4CWlU+gidP3tAyzmdp//xizoRIAs7GKhrceJjD9YuKHtqB2dK7TEAISlkNrwiJhtj+340DMXYA18jzs8RcT85V1ayhOkjpjeI7O5iOnX5QqVA1j2pXEl/8NO1x32sdmiuwJST3qaF365nLbDzQQMy3c2uk/8JujJsveqWDT7jrmTylmxrh8/C4VIcBsOJr2+MSRHfisOKZy6i0HnHAddqT9sJsdCyNUDVfZRKxT7dnQ0bGAXRUt/P7JzdQ2RVk0rZRbLhxNwJX+ZyQnE0i9rTRbo2aTRX2rRY7v1LspK6FitJGziL7yK7zXfQWh9K+PfdkTknpUeUF6tdyZEwpw66ffcjxu2vz0X+t4ZMludhxs4M/PbuXPz2/DIvWBruWWpz3HPXoGpnr6CtrC7QflXecWCmogB7V0UpeSRW1znB/+dTU1jVEcB97acIRHX9vd5dpuktQTVEUwpsBg1b4z74CrjZoDQhBf+VAfRNY1MglJPaos38ctF45pKzMzpiyL6xaNBMdBCHCZzRgNu3FFK1E5McW0qiHKoar2Wz+v215NfXOMysYoB5Ry3Fd+rm1vHqN4NN4ZV2M5p88ASXcuoQvubtcWWnQbYthM4lr6Rm6nU1HbyrsnrC3bWEFrfGCsTJcGn0klBkt3Rc647YcQAmPaVZj71pLYuayPouuc/tUvkwY8QxVcOXcYC6YUE09a5AbdHK9E4wofpu7xH2JHWwBBYN4NaFOuwBSuU9dGq9qJO1rHy5XZHGqCT7zn54ScJhxfHrFTbWR3EttRUEcvIq9oDHZLHUogB9NfTLwb1QN8nvQ/l9wsD/oZhhklqbeUhjQcB7YfTTDxNBMUAIThwZh1A/EV/0ANFqAW9V7pqq7IaBL64Q9/SENDAz/4wQ8yGcbgJ1JDSUfrIvg8Orr77FZQt8RMDteEURRBWZ4fn6v9cJcAClwJlOgB7MMtqIFs8GbRvOwR7GgLgXMvRcsqQM8uwj66GUVzMzJUxuSROWzZVw/A4skhrp+skWtWIdwO141o5puvaazb18qic0rbhtGEACNeh11/GKHqiJwy4mr79TqW0LH85di+MqqaYtTuD5MdcFGU7UFTBIqw8UQqSdYdQagqSt4Ionpu2usuy/MzZUwu7xwrq6MI+PD1k3Gpos/vAbnMJpyGwzi2hZpdRtyVI+9DDUFCCKYPc/HC5vAZkxCktl03pl1J9KX/xXv911CyCs/4nN6WsSS0YsUKnnjiCS644IJMhTAkCHFse4C/rmlb/Dh3chHvv3ICrm5cwde3Jvj2H1fR3JoAIC/LzdfeP5ssz4nEZlhhwq/8hviBza
kGRSP34nvwjZuFkV9G7NBW9Oxial/4LXY0NQSnZuXzxeu+yNL9JVjRMHNjS4m9soTGY8fMXnwH188ezTt7alk8rRjTTE0ocLVWUPvwd3ASqXFxPbeMrOv+K7XBXLufg+DtbdX87snNbW03XTCGq+YNw9u4h+qn/7dtAoMayCH3pi8RdbX/A3XrCh+/fgqHa8O0Rk2GFQcJebQ+//B3J+po+PePMRtTkzUUt4/cW79BzHP66erS4DSpxMXbe2Psr00wIu/MSyHUgtE4Y+cTee7HeG/4JoonfZF1X8rIOEJjYyMPPPAAH/vYxzJx+iElbjr83783t1t9//aWoxypi3T5WKoqeG3dkbYEBFDbFGPN9hrEySWjGw6eSEAAtknzupcwm2txFY/CMU3iR/e2JSAAq6kG59BGFk4u5NIRcWKbl7Q7d+PKpxiTrzF9hA/l0FpcZiOq4hBe83RbAgJI1h3GqtieFntjJMmfn93aru3x13fT0NxK67Zl7WbQWS31xHev7nCI0K0rjCkOMm1UDuOGZff5nAQhIHnonbYEBGDHWolseJEuzjaXBglNEcwa4eaJ9Z1fwK0Nn45aPJ7ocz9u9/eTCRnpCX3zm9/kc5/7HJWVld16flf3au9r/Sm+ytpWahrT32TRuNnlOC3LZu+RprT2g1XN5OWNb/u6pSJ9mnaysQoUDYSKFsxt9yHadvy6QxTk+glXpa8Cd+IRHDPB+MRu6p5+BKNwJAU3/zetaZu3gd10NO211R+oJ2mmT8dujZkEG9J3WU3WHKQo15/W/m6Z+F3XrDqS1pas2U9+0EDR04dk+tP78VRkjOm8PhdaB1t7dGTBBBe/fqWaQy0wpfzUSxZO5sy+lObVz2G++nOK7vxGh++dvtDnSeiRRx6huLiY+fPn8/jjj3frGP25KnB+fqBfxacAsycWsnrbiQ9aISA/y93lOIWAC2aUsm1/fbv2OROL2h3LFUwfFvKOngGqhuLLJV6xi8D0S4hX7G73GNfImdTUtOAOliBUHcc6kYz0gmEUmocJr3sEgETVPmK1lXgnn0/Tm/9sdxy9dGLaa/O5VPKzPdQ0nEjIHpdGTpYH94ipxA5ta/d4z9jZZ/z5ZOp3bQyfButfbtfmnbSY+qYkjpNo197f3o8dGSoxdjWJRVrjxGKd3yF1wWgPv19SwzevzUPt5CaIztjFJDc8y6F//gDP5Z/utTVEp3vtfd6Bf+6551i2bBnXX389P//5z1myZAn3339/X4cxZCjAey4bz4zxBUCqjM6X3zubgqzO7XFzMseBKaNyuXHxaDRVwaWr3HXZeMaWth9TNv0l5F776WMFPQWeUdPxjp2FMWYecX8R2Zd8EC1URODcy0DREJpB1vl34hSkZuvE3XkU3vYVtGA+AK7S8QSmXkT4XcnGjrWij5mPf/plIBSE4SZ08fuxckamxe41NL58z0xGl6amZZfk+/jq+2YTcLtwjZqRikU9FsuCm6B4cpd/Pn3FzhtD6IL3IHQXKBqBWVejjph5xmm60uA2tkDHpQmWbE8fiTgVIRT0aVdBMkrs1d/idGHxdk8RTgbfuY8//jirVq3q8uy4/nzV1F+v6hxSs9oMTaG8OOusYhQCWmIWQoDfreF0UO1Z0wR6sgWRjIDqIqH5MR0VjTjKoXU0LX8cNbuYrFlXoPjziLoLOPn9n58foLm6CpGMgctHZNk/ad38+okYDDe5d32XuJGHKhy0RCMIlaSR1e7+V8y02XmoiTXbjzKyJItzx+WjAB5DQz9px1JDmGixOlA04u5crE4s/cnk71oIMJJN4DiYriysUxRJ7a/vx5MNlRi72hNa+caqLvWEAOpbLR5a3cw3rskn13/6BeIncyyTxOrHULJLcV/wIYTo2f7J6V67XCc0RAgg6O6ZX7fjgP/YtOx3JyAFG6NpP+F1zxNNxvHPuBIzbwyWk3q8Vr2Dmud/C4DZVEP1/k3kXvYhGFmQdp648ILhBQc8c25ADeQQ2f
IWWv4wAvNvJO7KAwcsR2Adnw13UjxCgVfWHOLx1/cAsHRjJS+vOsTX3z+rXQICSDgaieOz4fpw7akiQE80gG1hubMxnc59cDgOJxbb9v3Fq9RP5fhUzh3m5q8rGvnsJTntJwydhlA1jFk3klj1KPFlf8O18J5OP/dsZTQJ3XTTTdx0002ZDEHqYUbzQWoe+g7HN4+L7d9E3k1fwsqbiKoqRLavSHtO6463CY6aTdQ59RBhXAshpl5H1jmXYasGMVs54/50zRGTp97a267taF0rFbURxhRn/ka4bkcxty2hduUTYFl4Jy3CO/cW4nrXKjlI0slmj3Dzz1XNLN0V4bxxvk4/T2gGxuybib/9EKx6FPfcW3sxyhPkpE6pxyiKILrrbd6dHcJrn0VTU3vuqIH0RaCpjd/O/FZ0HEgIN+YZdil1WS24mvZCMr3MDoDdT+6diNo9NC97BCwTcIhsfYvk7hWnrh4hSZ2gKoIrJvt4fF0LtS2n3n21I0J34ZpzC+beVcQ3PNtLEbYnk5DUozraz16oqQV0tu3gGT8XYZwoOip0F/5zLiDunN1+Q8d5YlU0PHwftQ99G/W1/+WyGe1n6uUEXJTmdf7qsLcoiiB+aEtae2TbMjSnZ3d9lYaevIDGrBFufv9WI1YXd+gVhhfXnFtJbn6ZxLbXeyfAk8h7QlKPsW0H99jZtKx5DuzUFZiaXQoL7iFqQjxhkXSXkXv3d2luimJYYXxeD/FAOSoWWrQGx0ri+POB1HCZEBBPJNGw8LoUSMZJaj4sR0EngWrFMTVvatKDsGhZ8ShWS6qsjnV0N5dN3UbJVbNZvrWWEUVBLp0zDJ/R+Ru2vcW2HfTc9A3vjKJRWEI741BjpigCdDOMo6gkFY8sFdSPzRzu5kBdmOc2hbl2eteGn4UngDHnVuIr/4XiyUIbcW4vRSmTkNQJ0aTFoepWmlrjlOT5Kc52o5zipmXcX0b+nfcR27mS1pxxrG7MpbBBYfOavby+7jB3XDqeHQfqeXtLFeUFfj5w7TBG2nESa56gYf3LgINRNArP9Z8hZvp5c/1hnlx6ALehcvf5RYw9/DQevw/3rOtYt7eFRNJkWLZKeUkutuoifmy9j1E0Ct+E+TjJGBcEjhCcUsDOWsgPutpK/mSanpWPnl9OsuYQAIo3iG/iAmL99IPdsJpJbH6FunUvongChC58L1bhZCwyn9SldEIILp/s4+9vNzGpxMXogq6NNij+HFyzbiT6xu/xej+PWjC6d+LM5BTt7urP0zkH23TTuGnz639vZtPu2ra2T906jZlj89pNhX43TVN4cfUhsoNuDlWFefLNPcycUEA8YbF574kdHnVN4UcfPAfl0S+0e35w7vUs1+fxu6faLyL92nWFFCz9MVr5Ofw9eQnLt9ajqQr33jqMUSNLaX3zL8QPbCZr9lU0vPGvtueJ4glYi/6DrOz296R0EgjHwlS9p309HTmb37WmKcRe/BlaVi6aPxvHscEyiR/dh+fK/yKZ7Jkpej31fhQC2Pxs6h7WSfLu+Bbx4PCzOvZg+5s53TG6ojtTtDuyuzrBmzsj3HtdPl6j63dgrKO7SG55JVVnLpDXrRj61WJVqf8RAlyJWvSjGzCq3sFlNrZ9r6Iu0i4BAfzxma2E46e/4elEm5hp7CHL5bDmWLWGMWWhdgkIIGnaVNSm17GLxhK8uOpwWvs7R23UYB7moc3MLk915E3L5t9rm7DjEQJzbyQ48wqaVrW/qepUbifPrmn7WhE2rrrttPz7ezT+62uI7S9jWJ1f5He2LMvBKBlHy7qXaHjzIRrfeoTG5U+gF43BsvpHT+1khh2hdeMrae1m1Z4MRCN1xZgCgxF5Bn9d3tStBc1q0VjUETOJvvA/OMmzT4rvJpOQhKu1grp/fpP6p/6Huid/SsOj38WVSCWeaAfJpjWaJGmephckLJJrH0dd9kc0M0JOMDX1Op608HWwVsnrS5+a7Q5mU5STvmtqnl/BjkcQLi
8tiRNDgtXNJpZiEDXycY2Z0644apuTCjUazYeofeyHJKsPYLXU0/j637D2vt1nayMcx8EYPQstp6StTQsV4Bo7r8s9sr5gCx01mH4VrHjldPKB4PyxHg41JFm2u+uFiwG0UbMRvhCxt/7c45U5ZBIa4lQFIhtexImfeHNazbWYBzchBBTletM2bZszqZBABxu8tR0zUkvrptcACNa+w3nTS9A1hTfXH+Ha89uPK8+YUEBZfhDP+HltbVqoEP/wCVw7UcE46dy5QRcTvA048QjWjFt5ct2JYqqXzSjAJSy0/ctIHNmOd/yc9kEpGmp26gNfCEhU7iRtKvm659GdvqsoHDNyCd34VXJv+Sq5N3+F0M3fIObq3nBHb0uik3Xene22StdySlAKRmUwKqmzNFVw5Tk+HlvbQk0Xp21D6v6Sfs6lWNV7SO54s2dj69GjSQOOgk2yNr0StVlXgTZekOs3+MYH5vCX57ZxuCbMwqklXL9oZNvVS2r9j2g/DdSxEboLvXQC2uG1jDXcfOWu6VQ1W+QEdO77wAyONibwew2GFwZQDRVt8Qfxzbwax0ygBgtoXPIHsqv2862r7qIiEUB3exhZ4CLUehDljvvYFQmiKDvwe3SuWTiCKWPzaXz+pyQqdgEQWnQLiuGhdftK9Oxisi68h7inEJzUeiPVlV4hW/GFcPp4Zlpc9UPOuL474VmIh0aSf9d3sOqPIDQXIm8YcS2U6bCkTso/Nm37D0sb+eIVuaecXHQqQjMwpl9D/O2H0EomogTTq5x0h0xCQ5zpKHjPuZDE0X3t2l2jziVhpT6Ny3K9fOnumSQtG6+u4jgOmpNAqd5O6/oXEJ4A/hlXEg8Mx0EQceWza/aXeGVzEwUBlUvdCsXrfsvI0jE0NCg8Wj2a5dsaABheFOCzt59LlsfA9JcD4FFieEZNx1U8mqBSR3DHo5hN1fgX3Iw55Tocx2F4CD5581TW7ahhydrDPPHmXj5zxS0Ma/o1VmsjjUsfxTX8HArf+30Sqje1FfhJyUUrHofqD2GFG1MNQiFr0W3EnJ75k1AU0S+H1c6G4whi3hLwlpz5wVK/NHO4m701Lby6rZVLJ515q5J3U4L5aKPmEHvjD3iu+XKPDF/LJDTEOQ6ow2cQmFdPePWzoOlkLboNO7f9sJkmUjO6jo8Hi6NbqHv6Z23fj+5cTeE99+MoGq/uTPD7Z/YDsA1YuV3l+x/8GIknvsyBGZ9h+UnbShw42sLLqw9y6+LR6MlmRONBrNYGrOY6EnWHie5/h5zz76Bp1TMYZZOJHzt/fTjJd/+0CtM68UH/i5eq+PYF16Cv+hsAiSM7sVBJkr5PSszIJeeWb2BV78GJR9CLxhD3l5x1L0i3o4i6PcQPbEHPK0MtnZS2y6skZYoQgksm+XhodTMzhrnJ9Xc9BWgjZxFf/jeSu5ZjjFt41jHJJCSRUP0o024gd/KFgEJSD552lbUuTMKrnmrX5p98Hq3rnqdFzeLRNe23xY4nLfYdjTDel8We2vSZXxt31XLLecOILPs7wSmLad65itjhHRiFI8m96B4aVz5J3rWfbjcVuLEl3i4BAURiJmER4PhHvn/WlSS04CkTS8zIpT7kZ822SvZuaeT8aR7GlOVgqN27ulMEWDtep+mth9ra9PxhZN34JTCTgCChZ8kFnlJGHS9y+s9VzfznRTldfr5QFPTJF5NY9TD6yFmpLUXOgkxCEpAqPh1Xs058cQqqsNHsOHpeOYmqY0N4ioaWXUDjW48gplyF3tE+07obxZfNuGEhWFPT7lvnjs/HFasDt5f61/5Gsi61c2j88HbMhkr855yPpbnaLYrMDrjQVNEuEXndGtkBN3puKb6pF6GOmkPiNB/4LdEE3/rjKloiqTI5q7ZW88GrxnP+9LLTDqWZtsOh2lZe21hBbtDD6NIgPkNFTzRSt7z9Ro3ukjEk1j9H87oXEYpG1sKbUcacR1Lp+n5OktRTZg1389cVTWyvjDOhuO
tJRM0uxQqVkNjyCq7pV59VLHJ2nNRp7lgViTf/QN2/volQVXIuugemX4+68B7MxmpAoOx6i9vntb+68nt0mqNJ3HNvobx+NedPPvH9USVBLp5ZhpOMogZy2xLQcVZrM4rbj+MOtWsP+TQ+ddv0ttlzHpfGZ28/l5yxUwncfC/2uEtIqKdfHHjoaFNbAjruoSV7iMdPvRZCCHh7ezXf+eMqHnx+O//z0Hr+99GNxEwb4Vg41omZR4rbh+IJ0Lz6WbBMnGSMxtf/jqjZddq4JKm3aapgwRgPj6xt7vaUa23sAhKbnscxE2d+8OmOc1bPloYMl9VCy9J/4hk2GSOnCKHq2Ik49aULSSgeppSUYhSNAqEyL6Qy7EPncLARKutjjBuWjd8l2HAkjO2ayNXnCK4cF8L2F9Jguth9uJF4bjYloVKyz7sN5/jNTsdGMTzouWWYhzbg9gYhq4S4KxfHEcwY7uUn/zGVxpYo2QE3oaBB0hKAQWfGvBw7faqq7YDhRDEaDmA1VaMG81B9WSSrDyA0g+bQeP7+wvZ2z9l5sJGK2lb8xdn4zlnctvmeq2QssQOb084R27MWvWRatxelChxckUqs2oMo3iwUlwez4SjC5UXkDj+xzxCQbKjCOLoLJxFFzS0j4S/BduS1pwTjCw1W74vxzpE4U8u63jNXAnmooWKSO5diTLqo23HIJCR1TmsdnvIJNLz+97YmV/FoChedSyJSRe3jP25bTa14/BRdcA+rjgbJysnhSE2Yx1/bTTia6nW4DZX73jOB/3tuHweqUuuThIAvXldK6cqnyD7vVhpX/Bs7miqTIjSDnAvuouaJnxJadAvusfNJGlkk1z2Ns/oZskjt65acfQ3quTdgoaGqAsfhtMNqwwt8+Dw6rdETvaGPXl4OO16ndsUTbW2+SQuxWpuIHdhM8ryPE++gpE4iaWM6Cp45N6LlFBPZuhS9YCR2pIl4ZfuqAlre6Yf7zsTVuIeaR+5HaAbZ591G7ZIHOX7jS88fRta1/0Vcy8JlNnH08R+TrD1WeUIo5N38JeI547t9bmnwEEIwa4SbFzaHu5WEIDWpKbnlVfSJF3Z7ppy8JJI6RdWNtFI48co9uJ0Inr2vtyvnYUfDmFV7mVpgkzRtqhuibQkIIJaw2FtPWwKCVMflwaUNKFMux2yqaUtAAI6ZIHpoK0bRSJpWP4fTcATDihA7uLVdPC2rn0VEGzlY28rfX9nNU8v3U9McP+UfR1bQx7fuGsc1swuYNiaHz1xVyuxhGs0rn2z3uNaty/AMnwyA+8AyFkxuP/HC49IozvOmfiZaFky6guDN96JMvx7PtMtQ3CemwqrBPIwR53Z7CEQnSdNb/wLHxjdhLs3rX+LkmRfJmoM4dal1X07t/hMJCMCxaXrjbxj0fOkVaWAaV2hwtMmisrF724coecNxElHs2gPdjkH2hKROcTRXx6VwLAu7uTqt2W5twpVtY5o2TeH0D71IIr03UdcSx/blYdVsTfue1dKA6gmS0INsihbywhP7Cbhu4NILVLLX/gm7pRYUlV0NCt/569ttz3tm6T6++9H55AXSb75ajkpeQQF3LXah2Aks1QORBnDSh8kcOxWvdXATd1x7A/k5o3hzQwUjioPcdtFYQl69bQTQth0SpBa92t4icu/4NlbDYYRQETllxE4aLusqYSfbtqpQPUGscEN6rIlUcrfj6bXwrJZ6hJ0E5exmNEmDg6oIJhUbLN8T5eaZ6XuBnYkQArV0EsmdS1HzR3QrBtkTkjol6QrhnTCvfaOi0eJ4cMaen/Z4o3g0BxoVDF1hTFko7fvD8728ewPRS6bmoOxYgj5qRtrjPSOnEq/ez8GRN/CDf25hw65a3tpcy7f+XU3T9LsB0Ceex6Nvtq/+kDBtNu2pO2VvyHYEMS1ExCggrgbAl4cWar8RnuoPYcdPlPPJ8Thcv2gkP/v8BXzypnMoyHKd8haU40DMyCFZOJVEweR292u6I6l68U+/FI
DI3g343v07EUpqDydAzS0H2r9u37RLSGpdX6QoDV7jigzWHoh1+/lqyQTMvatTleC7QSYh6YwURWCj4VtwO/5plyB0N0bBSHJu+SoHIj7qguMInP8eFG8QNZBDaPGdtGSNIa98OGPKsqioDXPnZePJCbrJ8ht84JIyCo8u5au3j6eswI/HpXHtgjIuya/BHLWQh7Z7sRZ8EDWQg+IN4ll0F1YsgmvmdTyxqn0Vbst22FKn4520iMCcG0h0cL8m2YX9g+KKl+zrP49nzEyEZuAeMZXcyz9C67YVaFn55F73WcysYTi2QyjgQvTxmh/HAW3cIoLzb8JqqUMLFRGceSXC8KDnlZJ385dIBFIVDZL+Uopu+wp6TinC5SUw9zpcky/G7n9FuqUMKgioxJMO1c1drykHoPhzwXBjdbOiutxPqIcNpr1RhIDq5jgHD1WTo0bIzc3Cm5OPjwi24iKJgRCCmGmjCPA6YXRhERU+Eo6GrgiEACNShV13gKg7H+HPw+d1oyQjoAgijoeopeJ3a2hmC1srk3zvr+sIeHUunZqNrgre2hHmG3efg9+j852/bWbnocZ2cb73ivFcOqOIpK2y5WAjP/3HurbvKYrgux+ZT1GoazdeNWGhmhFszYOJhmGFcUT73UQz+btWlOM7nGpYihstGcZRdRLvqg6Rnx+gqboWYSdJ6v5+mYAG09/MmY7RFT21n1BnPP9OmFkjPCwa6+3W85M7liJcXtwL3tPh90/32uU9IemUGlqTNBw+wMS9j2FV7EAYblj4Huxx80g6qfFjx3FwHaswkCTA8dub+rFRIHdrBTUPfxsnkeruC83Af9vXifiHpR4gwKOAZdlYwkfcagSgJZLk8ZWpe02KgDguXI7GzReO4ft/XdMWo6EpTB6VR9JOLWQdX5bFF++eyXPL9+P36lw1fwTF2e4uVykwHRVTDbTd848rx4aw+sklm22fFJMNlnrqIbaEcIHqSk0hlKQOFGZp7K1JdDsJqcXjSKx9Emf+XV2eJSeTkHRK4XCE0sMvYVXsAMBJxIi+9gc8hcOgE7tpKoogtmNZWwKC1Ey3yKZXMRZ/uMPdQ0N+F163RiR2Ymhg/pSStjsbo4sD3Puhuby54QhBr8H8KcUUZLnbZptpimBCWRaT7pyOQGBZtiyTI0lnkO9XWbWv+/eFRCAfALvuAGreiC49VyahIcbBYc/hJiprW8gOuMkLuNImCByXYySx9q9La3caj3Y6CcWba9LazZY6/NEqTD0vLUHYtsNdl01gw64aKmtbmTYuH1UIzGPrahQhGJ7v4/1XTMBxHGzb6XC6s2059JtuiyT1c9k+lZpw9+4JwbFZcsXjSO5Z3eUkJCcmDCFCwLrd9Xzuf17nhw+u5cu/WsbyLUdxTvFh7fb50PLL09qTRudmV5mmjXfsnLR2z+gZtKx9Dr2D7bTzs9wsWXuIlkiCkSVB3t5ciWlZhLztp49alt3hgk9FOLjitbia9+OymulupXlFpKoS6BXrMOp3YNjd25FSkgYCnyGIJR0Sp9kx+UzU4vGYe1Z2eQ2c7AkNIY2RJP/373fa9T7+/OxWpo7JBQSqIvC7VOKmTVVjhCCt5C++m/onftxWH0qMnMVr+1QuzmnGpavEFX/6m05AU9TEshxyCyaTfeF7aNmwBBwb/znno/lDNLz+D3znXIDhU0lwYtKAS1P4zK3T2Hqgge3767nnigmMK8uiM7lExULsX0ntK38Cy0TxBsm94QvEjt9/6iQhQK/ZTu0TP25bM+QePQPfBR8iofq6dKyBzrQdwnETr0vDOFWXWRrwhBD4XQotMatb2zsAiGAhCIFdvQe1cEynnyeT0BASjibTpitfe95oHn9jL0s3HMHQVT5+01Sam8Kco+1DW/sw1Qs/TNN5/40ercXWPCT8RaxZXsls/XUCTgvu0TNJ5ozBOvZWipk2z688wPMr9gOCS+cO44JpCyk7Lw+ScRwzTuOyx9AC2US3vEns8HZCl3
+MuL+sLTkG3BrXLBrFwsmFxBIWjpP6IznTFZYWqaL2xd+1fW1Hmml49peEbruXuOj8DVfDitDw8u/bLVqN7VmH79zLYQiVvKluivOrxzdxsKqF4lwvH795GuW5HnmPbZBy64JIwiG3m89PDclNJLlzmUxCUsey/S6CPoPm1lSvpiDbQzxh8ub6VOXqWMKiuiHKWFcdytI/IkbM4O/rbdbuPnTsCFGEqOe/7piGuuIftDTXoPpCaIYPy1+OELBxTx1PLz2+S6vD88v3kx1w4R1Zhnj1fqzWhlQP5bIPUvfKX7DDDdT/+ydk3/Gd1GLRYyKxJOt31/Gvl3di2TY3XziG6aNzO94m4hi7pTatzWyqglgLeDqfhISdwGqpT2t3Yh1UjBikokmbH/19LfXNqZvVlXURfvDX1Xz/4wsIuOTHxmCkq4L4WQzHAahlk4kvexDX/DsRmtGp58h7QkOIz6Xy3++ZSX7IA8CsCYWs39l+4oBl2wTN1AdwvHgqa3c3tvu+40C8NYx1bMJBvGIXIp76cFZUhbc2tN+KAWDHgQYakjqB6RcRWngz/snnYdYfxY6k1l1YrY3Q2n4R6ta9dTzwr/VU1rVS3RDl14+/w47DTad9fYo/fYMuNZCL4+pahQBTD+AZO6t9o1BQQ8VdOs5A1tASa0tAx0ViJrVN3Z9BJfVvqhBpG0V2leLNQgkWYB7Y0PnnnNUZpQHFcaAs18NPPnM+P/rkQq5dNILhRSd6H7lZbjRVwTo28UBrrSE/25N2nIB1IhlooUIcNXVl7NgOozso0VOY48WxHRqXPUbjsseI7FqDnYyBY2MUDEfLKQHXiXstqqrw1sb0ZPbSqoOop9n11PQWErrwHhCpt7UwPORc9QmSXbyPYzoq/kV34BkzMxWPP5u86z9Pwld0hmcOHl63jvaun7UQEPB0vb6YNECInplPqpZOJrn9jU4/XvarhxjHgZygGyueWlZ60wVj2HmwkY9ekE1x63ZcTVvQxlxPvHQy9vZX+Y/Fn+OHT1W0bfc9b0I2BQ2bAFADORiFI3ACqR6CbTucP72Ut9YfofFY0dK8kJvJo3Lxe1SyL7wbIQSKN4uWDa+Sc8FdxI7sRPVno1iJtvs+juOQm5We/PI6aDuZJXTUsReQX3YOdqwF4c8lbuR0q2J1TM/FffHH8Z/Xgq0aJFTfkLoXEvLpvP/qSfz+qS1tbbdfMo4cvyx8Olg5jnPK5RpdoRaPI7nlVezWBhRf9hkfL5PQEFeY5eJ/PjiOlse/gx1pJgEkdiwn77avYwM+xcOPPjaHisYkPrfGsJDA0yBgzATUrAJMTx4JJXW/RQjI82t87s5zqaqPgBAEvTq7DzUyNuSl4bW/AeAZMY3A1MXUPvebtjjCm14j785vEfMUY9sOC6eVsHRjBVec4yfLZbOrDhbNHoZ1huECCxXLUwieY9stnEXmMB0NUz/2RzSEEhAADsybWMCo0ixqG6PkBNwUZrt75ENK6p+SFhja2f+ChaqjFo0luXslrmlXnvHxMgkNcY4DWtNh7EgzkNqozjNmBslomB2xfP748kFGlgS5afEY8oNG6p5Q4XQgNROuoSmOx50k4NKor6tFj9bxg8eqicbbL3ybVj6Z4LF/C5eL5jXPt4/DTJA8sg0xthjHgVGFPn5wlU7zK7/CjrUyrmAE2e6PEyW9BpymKdi2c1YbxUnpFCEoynJTlNW9Dc+kgSWWtPEaPXOHRi0ej7l3lUxCUicd6y14x81B9fhpXPoYODalecN438K7+NHTFWzeU8f9H5uP/9jMqJrmOD/+x1pqG2NoqsLdV05giqeKRGtLWgICaDWVE0lIKB2WfXdOqq6ZqDlI4zP/y/EuSLJ6P40v/x7/Nf9NktSsG92OIqp2ENn6JlpeOe5x84l5hs59G0nqSZGEg9/VM0lIyRuOve4pnFgY4T79xCA5MUFCyR2G4vLiKhlDy8YlbetjrNqDlFe/xfjyIOFoksq6VNUA04ZfP/
EOtY2pmVKmZfPnZ7YSVXz46ncwoTzY7vguXaUoP4ucaz6F4vYT2buBwJQL2gehahilk9pGz5INR3n3GFiiYhdqPDWjTlEE9p4V1D39P0T3rKPl7Sepe/S7uBPtZ9lJknRmsWSqxqLX6JnxVqGoKLnlmEd3nPGxsickkXDnkXv7vSR3r0r/5pF3mDpsPjsONaNrqWuW1rjJ/srmtIfWWz6CB9byobnTeMQTYvWuRoYV+vnwdeeQ5XVhemeRc9dYsBJgeMkNFtK68WUUXwjf1EuI+0va8o7qTd/8TQ3kYOupoSE92UL98kfbfd+OhrHrDkFxd5fbpVNVBSEEpplebFWSBovGiE1eQO1yBezTUQIF2LUHYcTM0z5OJqFBpKOqAqeqNKAoJ4bEHAfinkKMwlHpBy0Yy86jCSaOyKYkz4eigNetMmFYiEjCpKE5TjiaZO74bCZmRQje/EWSjVV88jwfrfMCePOLsXVvWwxxLYhiiNT9m4Jz8Fw5BccRxCy7LQEJIdDyyvBPv5TwhpePBayRfflHU2VzHNqmkwrDjVB17OjxvV4chBC4zQaspmoUt5+kNx+LM08tVhTRdl9JCIErfJjo5iVYLXV4p1yMXTAOU7SfHXb8b3YozZyTBp/qFpOy7J5NB8ITwG5NX/T9bhlJQr/4xS94/vnUjenFixfzxS9+MRNhDBrueC3Jim04kRZc5ZNIBMvRky1YR3diNR7FVToBK3s4cQwO14R5/u195PsVxuarOAhiejZ7K5ooz8qheOJ5RLe9BaTWxzjTrmNWnYeSPA/Jyp34YkcI5Jbz1Ys04kcP4QQL0PNGElvxMJEnNhHz+PEvfi+rqnwI4TBBayAv2ErcU4hmRRF1e0kc3Y2eU4ZSOJY4QY5nH1eyAatyB3bjUSJZeWihfPJv+HyqbE9WEXF3/onhOi1A/jWfJH5kJ04yjpaVT3jbStS8csTR9UQPbEb1h0jGWkHR8Ey+gKjecQ8pbtrsrwqz53Aj5YUBRpcEybGqqX3oW20186J7N5Bz9X8iymZhJBqxju6k7p0KjNxSHARKdilxb0m3poNLUqYdbbKYUtrD0+9VDY79/ZxOnyeh5cuXs3TpUp544gmEEHz4wx/m5Zdf5tJLL+3rUAYFV6KWuke+3Ta7DSD/pi/Qsv4VhGOTqDtC84rHyb7io6yLjeXXj29qe1xJrocvzEuwP15Awgjx19cquHTGZYy6bCGKlaBOy+GXjx+kKRzHtBwmjwjxH+VVGBtfxTNiKpE1z+Mun0hyv5fEgdRx7WiY5hd+Rd7Cz/Odp2tT55gToXiSRmzbUppXPnEi9uHn4L/skySEB5fZROOTP8asr2j7fmjRLdS//EeCC2/BKprW7gPeiFZR++yvsdtK6QgKbvkS8Z1v07TskRPnKBmLnluCuX8D7uHTiRntE5EDPLfywEmlhmDmhAI+Ot9oS0DHNS9/jJybxtLw/M9IHj3x+Kw519K6/Amyr/wEMV9pF357kpR5juNwsD7JLTO7tvPrGSWiCNeZF4r3+cSE/Px8vvzlL2MYBrquM3r0aCoqKs78RKlDVtWedgkIwKyvxFU4HDsZwztmBqHzbqW+oZkHn9/W7nEVdVEqnVxGuRupbYyyaFoJ//fsbtbVB1jVkMP9D++irinWVspjy/5G6oITSNYeQfWmJh+4yycQ3bM+La4su77tHBWiEJqraH77yXaPiR/YjGiqBMCpP9QuAQE0r3kB36SFtKx6Gt2JtrULAckj209KQAAOTSufIFG1r90x4hW70LOLsBMRrOq9aXE2tCZ4dln756zdXk2VmT6jRwiB01jZLgEBNK9/Cd+4WcR2rkCRC2mkAaa6xUJToCirZ/skdksNSk76VjDv1uc9obFjx7b9e//+/Tz//PP885//7NIxurpXe1/ry/ia9rXfg947bg7RveuJHdwKQPzITvS8ctQ57yGWqEx7ftIWqIpFNG6iH1tvI0htqd1RHamEnfqQPT6d2myuR8
8uIvmuBBJXfEBL2zkcK9muKvVxmmIRzA/QUp1+499OxFB0A8Xw4A94Ud0nrqoatkfTHx8NowXSh9wcx0boLoSVSPvdNMUa6Gh5ka15ELobJ3miVlro/NsQooOp5WYSoek44UZyc7tWp64v9Pe/F5AxdsTrc6H1wOLRM1m+t4nzJgTJzu65bUocy6S6dj8F134MPfv0P7eMTUzYtWsXH/3oR/niF7/IiBEjuvTcmpqWMz8oQ/LzA30an7tgVKpW2rEPeKNwOI1vPdLuMcnaQ+ToCS6fN5xnl+1va3cZKsVGmGorm+HFQTbvrWfSyBw27a7l3HH5TB2Tx6bdJypTZ/kNCp1ahO5um0XTun0FORfeTd2rfwU7tT5IlE9n1VEX0IJLVynWW1CCw3GPmEps/4nhQNWXhektoKamBXewGKG1HwLzT15IZPc6gotup77FhpYTP1d3yURAcPI07sCsq4gePFFmBkDxBjFyy2lc+W+yLv5Q2u/GZ6hMHJHDtv0nbqDmZbnJy8ki+/ZvEtu+DCtcj2fS+Zi5o9GSLQiXFyd+YpM779hZRPdvJjDvpn733uzr92N3DJUYu5rEIq1xYrH4mR94FpKWw8aDEb56dR6NjT23caN56B1EqIRG0wc1Lad97RlJQmvXruXTn/40X/3qV7n66qszEcKgkfCXkn/b12he8Th2uAE9f0SHj9PcPkaVhrjjkrG8sb6C0lwXN8wM4dJMNHLZtbGS6voI77t8NKt31tMUjnPD4lGUFfhZu72asWVBrpkgCBxYgv+6TxGrOoAWKkTNH4GZPYzsi+5B0XQcd5BdkSCrXq5k1thsbpyZxfBsh5i7kMCFH8BzeDNWUxWOZeOevJiYlpqKHfcUknf7N2hZ+QRmfQX+iQtQc0pwT1qMmZW+KV0iUEb+rV+lecVj2NEW/LOuhvJz8ZVMQgvmE92xEqNoFP5zzie8822yFt9N3FeS/nNRBB+94RxeXXOIVVurmDwqh6vnj8SjK8T0EpRZt6EpgsSxfZgsVy55t36d8Kp/k6w+gGf0uWiBXJRQMWb2yJ77xUpSH9h8JM7ofIOCQM+lAicZx9yxFM8Vn+nU44XTx9N5KisrufHGG3nggQeYP39+t47Rn6+aMnVVpwsLHBOESvTNPxLdtrzte67yibgv/hh7d+4jqCdxsssxhUFlU5KSXC/FNW9jBQpwkkka/CN54IndjCnLoqo+QlGuh4tnlTOywINqx1EAE5WYbeByooh4C+Gl/0CoOv4ZV2Bmj8IgTtTR0LFBqCRs9diU50O0rHgMs/Yw3nPOw5iwmLjWfj2QptgodpKs/Dzq6sJnLMWTet0WpjDaZs4pCmhWDFt1IewECI2ko572OEIIEpaNoSqdmuGmCZugVyEcNXEcSDr9c7XDUOll9LZM9IRWvrGqV3tCScvhz8ua+M+LshmR17m9f87EcRyS655EyS7Bveh9be39qif0hz/8gXg8zg9+8IO2tjvuuIM777yzr0MZVFIfsio44J1/B+7hU4nv34irbCJa+TlEt71O7kkz0/S5d/HY6iB3z/HgWfm3tvaDc7/MkZowR2pSN/23H2jgSE2ED1wziWjcIj/oJuDRETgkhAfcHjyXfzY11dtywAbzWH23BLSNlrliNdQ+/B2cZOqPqnn54/haGtAX3IPlnJgfY9oKkJoq2placCe/7uNsGxLCDTapY3XiMstxHHTlzLu3tsXpKKjeAInW/v3hKUmnsvZAjNEFeo8mIHP7GzhmAtf8uzr9vD5PQl//+tf5+te/3tenHVISWhAxfB7GyAVYto3aWkH4pAQEoKx9lA9e9FXyWk/MGFPKziEnP5c7LvWzcVct2/bXc87oXIYVBvjab5anynq4Nb783tmU5ZzYVkGYMdTmStR4BBEqJOHKT/swtxqOtCWg41o3v0H+rOuw9DOXe5ckqec0RizWH4zx9WvyeuR4juNg7ngLu/4Q3mu/ilA7v+9U/xxDkM6a44BlHauI8O
5tqYfPpKL0YtbujVPgKWTqoo9htFbzZM1wXnvwHQAWTi3hsrnDKcj28PArO7ni3HzKczRW7I7wq0c3ct+H5mCoCrodIbbs70S2LQNAaAZ5t3yVWHBEu1N2tNWvMNw4yumHySRJ6lmO4/DqtgiXT/aR5z/7FOBYJsl3XsKJteC55stnLFj6bjIJDQEikI8wPDiJKIo3yOb8y/jtMyd2Ln3W7+KTNyxgyasnZq4t3VjBh6+bTH7IxXevz8a78V9YByqZMmo2+yZfSCRuYXgVRMOhtgQEqS0ZGl/9I8Ebv0aCEyuwRXYZev5wkjUH2tpC599JUs+iwznSkiT1ivWH4jjAZZPPfjmBHWkiuf4pRFYR3uu+gtC6XnVBJqFBKGbaNLUm8Ht0Am6NhJFDwY3/Rd1Lv8ccMZ9/La9p9/im1jiKcPjCFXkkHZVn32lld0WY9Ttq+MxluTQ8979Yx6Zf23veZrQZw3vOp3AAJ9IIgNDd+KcsRvX4cSwT3YmTOKnOWlwNkHXN57CrdmM1V6MXj8UKDcdxwGU2QTwM7iAJLdAnddiEELTETFpjSbJ8Bh5dkfXfpEGvusVk1d4oX74qD/UsF1abFdtIblmCMf1qjKlXdLv4qUxCg4SiCDQVGsIJvvfXNRyti+J1a3zi+vFMGZFFvL4Sd/lEEuWTSC6rZN64LK4Yr+CzmsnKy0WpWELruudBUfnUlCv5d85oirMFHN3etv7nOPPARlzJJmJGLpo/G6G7yLngLhpXPonVUodwedHzh6GUnIt90qSDuB6CslkIIYg7DkI4GDVbqX/ul9ixMKovRM41nyYe6qCQag/bdqiRXz66kdaYSU7QzefvPJfSHI9MRNKgFU/aPLspzB1zghQGu//R78QjJLcuwQ7X4r3y86gFZ/f3KvcTGgTciTqMyvVEX/pfjNd/xrcu05k9JkgkZvL/HtlKbVUtdrie8KbXsJc/yPsvH80dpfvJXvYzjJV/IvrMT1CwMIpGgW3Bxme4aqzNucF6SKZPEVU8fhw1dY/HcfvJu/JjNL79FFZLai8fJx6h9plfYkRr054LtE1acMXrqXvqgbbyO1ZrI3VPPYBhpm8T0ZMaIkn+3z/X0RpLJdf65hj/75/riSbTqyFI0mDgOA4vbmllcqmLuaO83T6GeegdYm/+CSWnFN/N3znrBAQyCQ14LqsZ68gWap7+XyK71hA7sJnYi7/gvdMsXIaKbTscrarDXTIGANXlZlwogdj4TLvjtKx/Bd/YWW1f58YO4t7wCMm6I7jLJ7V7bOiiD5DQUrXjLE8udiKK1fyuhOPY2C3th/3ezYpHsKZdhzJ2IRyboGBHW6C1oVs/i86qbYqmlSSqb47R1Hrmir+SNBCt2BvFtOHO2en7dHWG3VxNYuW/sA6/g/eqL+Cef1eHk426Qw7HDXTN1STrjqR6MCdRt77I7LHXs3RLHSHdxIq0knv5f9Cy6TWURBjn3YtnHLvdltua4cKOhQlveYvAjMsJzLoKKxZBCRVhBk5sWZDEhbdoDIrHjx1tPwtPeEOnDLslZvL0qijLNgUpzy/k7sXnkbPq19iJGLh7t0ZXyOdCiPZ7APncGn5356eVStJAseNonO2VCb52dR6a2rX7Nk4iSnLXMqyKHbhm3Yg+8UKE0rN9F9kTGugU0eGbQqgapuVw13mFFKhNWJEwDUsfxSgYTjAvH9UXavd41Z+NHU8VBXWPnIrZUp+qR6douIZPIV4whWT5HOKBYVjvunaJugrJufLjoJxozzr/TkxvQYchO8CDL+7gldWHicZNdh5u4XtPVdE67TayL/8ICVfO2f1MziA34OK9V05s+1pVBJ+4eRoBj7wmkwaXyiaT13ZE+M+Lcgh6Or8cwrEtzL1riL3xBxRPEN/t38eYfHGPJyCQPaGBz1+InlOSVvzTM+s6LrFLyPGAT+jUPHw/en45enYhomIzOVf8B7GDW2lZ/Sx6yViyFt+DlYhTMHJ6av2OmcQ1agYE8k
m48k5bScBxHBJ5k8i/5wfYLTUIb6ow6buT1XHNkSRrtlW1a4snLeoCkykszen1yQGKgPOmFDNpRA6NrXHysjxke7VOVWiQpIGiOWrx9IYw71sQojync718x3Gwj+4kueNNlKxivNd9FTW7d/fIkklogIurfjzlU8i/7tNE923CTsZxRs7hB6+E2X54A6oi+NptYygMFeKftIiG1//R9lzfxAXkXvlRFE+QRNYwbBvM05zrdBwEMVceuM68AlvTFLxujUis/dncbne72XS9SRGQH3SRH+zh3SRP4VTbrEtSb4gnbf69Pszl5/iYXu7u1HOs+sOY298Ex8Z9/gfRys7p5ShT5HDcIBBVQ8Tyz0FfcA9VE+/gP/5SxfbDrQBYtsPvXzmCb/4tNL39VLvntW5bjh1povapn6HHzrwXfE8JuDXed1X7yQ5Tx+RRnNu9WTv9mebEcdXvRGx+Bv3ImtSaKEnqRZbt8MymViaWuLh00pn3CLLD9STWPklyw7PoUy7De/N3+iwBgewJDRqOA8mkRVV9+p4gR+siCFcRdqw1/XmWmRrGS0TAaH8vRgiIWw6m5eA1lE4VAu0M23aYMTaXb//HPI7UtBIKGAwr8OPSBtc1kaKAs2MZta/9ta3NKB5D4OrPklD63+Z30sDnOA5LtkfwuRTunBM87QJSJx4huXs5VsU2jKlX4rny8z02460rZBIaZIrz0q98Fk4twcgrxSgcRaLqpC2uFQ2hqGihQvC2T0C247DzcDN/fGYLjS0JLpszjCvnDcfn6plab6oQlOV6KRuEvZ/j9HgjdUsfateWqNwNDRWQOy5DUUmD2ZoDMerCFl+6MveUFREcy8Tcvw5z7yr00XNx3/YDFE+wjyM9YXBdeg5AqipQOzlt8lSPVRSBqqZ+lQVZLv7rrhlk+VNXNHMmFXLzhWOIKgGyrvgYrvLUrDA1mEfORXcTPbyTnGs/S1xpnwyONsb4+4s70DUV07J5bsV+Xll7CNHFUh+pfXocTNuhm1U9BixhWzjJ9LVHJ08gkaSesqsqwcZDcT59cQ5uPf2j3XEcrModxN/8I05rPb7rv4F70XszmoBA9oQyqrIxyrJNldi2w8KpJZTkeOjoc9oBKuqjLNtUgSIEC6cWU5yd2kqhujnOis2VtESSnDethPI8H+cMz+b+j84naTn43VrblUbMVYDvys+TL6IkLQfbsvCOmk+U9jfnFWwKkkf48rjtOEKhKX86v3yjmVdXH+LyOcNwd3LYLG7arNlRwxNv7MGlq9x1+XgmlGehDpFsZLpDeCfMJ7L9xAaDwvCgZheTzGBc0uBT1WzyyrZWPntJDtm+9NEKu7mG5NZXwUrivuA/0EondXCUzJBJKEMqGqJ883cr26YFv/j2Ae778DzKOxieOlIX4d7fr2ybuvzC2wf49n/MQ9cUvvl/K9q2nl6y5hBffd9sxhQH8Ogqng5mZSbRMfJyaKppOWU/2GjaT9O/v8fxm0BBsYRPLv4cf1qVQFc7l4CEgM376vnD01va2n76j3Xc+8G5DC84883SwcB0VLzzb0MNFRDdugy9YDj+eTcRd+X22P01SWqN2zy1Mczd87LSNqhzkvHUYtMj244tNr0A0c+2T5HDcRmgqgpvbqhoty7FcVKJSHtXL0PTFF54+0C7tTO27bBqaxXv7KlrS0DHPf76bpyz6GioqiC84SXafUo6NqHq9Xz4usl0dsG1jeCFlQfS2tfurEY5y+q9A0lcD6FMv4Gs27+N+5KPE/OWyCKpUo9JzYQLc94YD7NGeNp/r3IH8Tf+iNBcJy027V8JCGRPKGNMK71YpmnaaR9QjpNqfzfLdrA6WFxpWvZZXmULsNIHi9yaQzDH27ZR3pkoAvKyPew50n5Kck7APeQ+hFPbjXuObTcuST3n9R0Rsn0q104/UerKiYVJbnkFu7UR92X/iVbUvyfByJ5QBliWzeJzS9Nu1F8+b3jah7xl2Vwxb3jaMeZMLGTq6PQZMD
csHt3hfaWuxOabfvm7WgW+yed3OgEBOLbDdYtGoZ00fBf0GUwbc/rqC5Ikdc7WijhHGk0+tCiEcuzDxKrcQXzpX1AKxuC75Tv9PgEBCGcAfiLU1LRkOoRTys8PdCo+BzhU28pzK/ZjWQ5XLRjB8Hw/HY1U2Q4cqAnz/Ir9KEJw1YIRlOf5EKTuLb2wcj8tkSRXzhvBqOIA2hmGu84Uo4qJ1rD32P5CGv4ZV5AIjexyNQMhoKY5zt6KZnRNYWRxkJC3c+VDOvtzzCQZY88YKjHm53etMO/KN1YRi6VvpQJQF7Z4eE0z/315LqXZOo6VJLnlVeyGCjwXfRS1cMxZxdrTTvfaZRLqYV19sx6fWt2ZXsapHpuati063VPpbIyaKnAAy+r7t8hQ+WDqbTLGntGfklDScvjnqmaumOzjvHG+VMWDdU+i5o/Cff77EXrnyvT0pdO9dnlPKMO6MsR1qsemkkTPJ4p377kjSVLmLd0VpTxbZ9FYL1b1XhIbn8M1+5bUzLcBuPxBJiFJkqQB4lB9kj01Ce67Lh/r0DuYO5fiuezTaMXjMx1at8kkJEmSNAAkLYeXt7Zy97wsjCNrMQ9uwHvd11BCRZkO7azIJCRJkjQArN4XY0SeweTkZsxDm/Be93UUf+9uANkX5BRtSZKkfq4pYrHxcIxby49i7l+H99qvDIoEBDIJSZIk9XvL9kS5aKSDb++reK76PIo/N9Mh9RiZhCRJkvqx2haTQ/VJFrc8i/v8D6DmlGc6pB4lk5AkSVI/tvpAjAtzjuAdcQ76yFmZDqfHySQkSZLUT7XEbPZVx1kkNuCed0emw+kVMglJkiT1U5uPxDnXc4TQ/JsQhufMTxiAZBKSJEnqhxwcth2JMD9wGG3MvEyH02tkEpIkSeqHasM2ihVn7OyFCDF4P6oH7yuTJEkawA7WRJmiH0AfxL0gkElIkiSpX6qoj3FOmQ+huzIdSq+SSUiSJKkfqo0qjDtn4BYm7ayMJKGnn36aq666issuu4y///3vmQhBkiSpX8tWwviGTc50GL2uzwuYVlVV8cADD/D4449jGAZ33HEHc+fOZcyY/rUToCRJUiYVuaIItz/TYfS6Pu8JLV++nHnz5hEKhfB6vVx++eW88MILfR2GJElSv1YUHBqbHPR5EqquriY/P7/t64KCAqqqqvo6DEmSpH4tNzT4e0GQgeE427bbbUHrOE6Xt6Tt6l7tfa2/xwcyxp4iY+wZMsZ0paV5A+Lncrb6PAkVFRWxZs2atq9ramooKCjo0jFqalp6Oqwek58f6NfxgYyxp8gYe8ZQibGrCUVzefr9z6WzTvfa+3w4bsGCBaxYsYL6+nqi0SgvvfQS559/fl+HIUmS1K/5s0KZDqFP9HlPqLCwkM997nO8973vJZlMcssttzB16tS+DkOSJKlfM/wBnEwH0QcyMv3i2muv5dprr83EqSVJkgYG1WAoZCFZMUGSJKkf6uqErYFKJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY7RMB9AdiiIyHcJp9ff4QMbYU2SMPUPGOHQJx3GcTAchSZIkDU1yOE6SJEnKGJmEJEmSpIz5/+3dfUxT1x/H8XdHwYc4o2YiCxqIj9ElMjIzrZph0VTKpRSND+BSVOYDxojTPxAXo5nBiMYI4rbMROOzMZqABiLowiRTMRAWlcQsmcEBoozh3FZB0pb2/P4w9if57UEz/d1b/L7+u6cHzufeE/j2XMq5UoSEEELoRoqQEEII3UgREkIIoRspQkIIIXQjRUgIIYRupAgJIYTQjRQhIYQQugmpIlRUVMT+/fuDx263m1
WrVmG32/n444/p6OjQMR2UlZWRnJyMzWbj5MmTumZ5XmdnJykpKbS2tgJQU1ODw+HAZrNRWFioczr44osv0DQNTdPYvXs3YLyM+/btIzk5GU3TOHz4MGC8jM/s2rWLvLw8wHgZXS4XmqbhdDpxOp3cunXLcBm//fZb5s+fj91uJz8/HzDedexTVAhwu91q8+bNavLkyaq4uDjY/vnnn6sDBw4opZQqLS1V69ev1ymhUj///LOyWq3qt99+U11dXcrhcKg7d+7olueZmzdvqpSUFPXee++pe/fuqe7ubpWQkKBaWlqUz+dTWVlZqrq6Wrd8165dU4sXL1Yej0d5vV6VmZmpysrKDJWxtrZWpaenK5/Pp7q7u5XValU//PCDoTI+U1NTo6ZOnao2bdpkuLkOBAJq5syZyufzBduMlrGlpUXNnDlTtbW1Ka/XqzIyMlR1dbWhMvY1IbESqqqqIjY2luXLl/dqr66uxuFwAJCSksJ3332Hz+fTIyI1NTVMmzaNIUOGMHDgQObOnUtlZaUuWZ535swZtm3bRmRkJAANDQ3ExMQwatQozGYzDodD15zDhw8nLy+PiIgIwsPDGTNmDE1NTYbK+OGHH3Ls2DHMZjO//vorfr8ft9ttqIwAv//+O4WFhWRnZwPGm+u7d+8CkJWVRWpqKidOnDBcxm+++Ybk5GSioqIIDw+nsLCQAQMGGCpjXxMSRSgtLY1Vq1YRFhbWq/2XX35h+PDhAJjNZgYNGsSjR4/0iNgrC0BkZCTt7e26ZHnejh07mDJlSvDYaDnHjRvH+++/D0BTUxMVFRWYTCZDZQQIDw+nuLgYTdOwWCyGu44AW7duZcOGDQwePBgw3ly73W4sFgtffvklR44c4fTp0zx48MBQGZubm/H7/WRnZ+N0Ojl16pThrmNfY6hHOVRUVLBz585ebaNHj+bIkSMv9PVKKd56S5+6GggEMJn+u9W7UqrXsVEYNeedO3dYvXo1ubm5hIWF0dTUFHzNKBlzcnJYuXIl2dnZNDU1Geo6nj17lnfffReLxUJJSQlgvLmOj48nPj4+eLxgwQKKi4v54IMPgm16Z/T7/dTX13P8+HEGDhzImjVr6N+/v6GuY19jqCJkt9ux2+0v3D8yMpKHDx8SFRVFT08PXV1dDBky5PUF/BtRUVHU19cHjzs6OoK3wIwkKiqq1wc4jJDz+++/Jycnh88++wxN06irqzNUxsbGRrxeLxMnTmTAgAHYbDYqKyt7rcz1znjhwgU6OjpwOp388ccfPHnyhPv37xsqY319PT6fD4vFAjz9ZR4dHW2ouX7nnXewWCwMGzYMgDlz5hhurvuakLgd91cSEhI4d+4c8PSHcMqUKYSHh+uSZfr06Vy/fp1Hjx7R3d3NpUuX+Oijj3TJ8nfi4uL46aefgrcdysvLdc3Z1tbG2rVr2bNnD5qmGTJja2srW7Zswev14vV6qaqqIj093VAZDx8+THl5OefPnycnJ4fExEQOHjxoqIyPHz9m9+7deDweOjs7KS0tZePGjYbKaLVauXr1Km63G7/fz5UrV0hKSjJUxr7GUCuhl7V+/Xry8vLQNI23336bPXv26JZlxIgRbNiwgczMTHw+HwsWLGDy5Mm65fkr/fr1o6CggHXr1uHxeEhISCApKUm3PIcOHcLj8VBQUBBsS09PN1TGhIQEGhoaSEtLIywsDJvNhqZpDBs2zDAZ/4zR5tpqtXLr1i3S0tIIBAIsWbKE+Ph4Q2WMi4tjxYoVLFmyBJ/Px4wZM8jIyGD06NGGydjXyJNVhRBC6Cakb8cJIYQIbVKEhBBC6EaKkBBCCN1IERJCCKEbKUJCCCF0I0VICCGEbqQIiZCUlZX1j/sEvkif2tpaUlJS/nG8CRMm/On3qqqqCm7373K5qKyspLW1tdf2NEKIvxbS/6wq3lzXrl17JX3+rdmzZzN79uzXPo4QfZWshETI2bx5MwBLly6lrq4Ol8uFw+EgNTU1uI3T833a2t
q4fPky6enpzJ8/n1mzZlFUVPTS4xYVFTFv3jycTieXL18GoKSkhNWrV7+S8xLiTSQrIRFydu7cSUlJCUePHmXRokXk5uZis9lob29n4cKFxMTE9OozdOhQcnNzKSgoIDY2lvb2dqxWK5mZmS817siRI9m+fTs//vgjLpeLioqK13SGQrw5pAiJkNXY2IjH48FmswFP9++z2WxcuXKl199kTCYTX3/9NdXV1ZSXl9PY2IhSiu7u7pcaLyMjA4Dx48czZswYbty48epORog3lNyOEyHLZDL9z3NdlFL09PT0anvy5Anz5s3j9u3bTJo0idzcXMxmMy+7beLzz6oKBAKYzfIeToh/S4qQCElhYWFER0djNpu5dOkSAO3t7Vy8eJHp06cH+/T09NDc3ExnZyeffvopiYmJ1NbW4vV6CQQCLzVmaWkpALdv36alpYW4uLhXe1JCvIHkrZwISUlJSSxbtoyvvvqK/Px89u/fj9/vZ+3atUybNi3Yx+VysW/fPmbNmoXdbiciIoLx48czduxYmpubiYiIeOEx7927R1paGiaTib179+r2AEUh+hJ5lIMQQgjdyEpICODgwYOUlZX96WuffPIJqamp/+dEQrwZZCUkhBBCN/LBBCGEELqRIiSEEEI3UoSEEELoRoqQEEII3UgREkIIoZv/AP/kVwligiBHAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with Modin df \n", "sns.jointplot(data=modin_tips, x=\"total_bill\", y=\"tip\", hue=\"sex\", hue_order=[\"Female\", \"Male\"])" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaEAAAGkCAYAAACYZZpxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACHGElEQVR4nOzddXxcx7nw8d8cWl6tmM3sGGKmxGFmTpqUb/EWb2+5TUopv7ntLd5yUwo3zA6ZYrZjZpQsptXigfePtWUrK9uSLGkF8/180lqj3XOelVb7nJkz84xwHMdBkiRJkjJAyXQAkiRJ0tAlk5AkSZKUMTIJSZIkSRkjk5AkSZKUMTIJSZIkSRkjk5AkSZKUMVqmA+iKmpoWsrO9NDREMh3KKfX3+EDG2FNkjD1jqMSYnx/ooWgGlwHXE9I0NdMhnFZ/jw9kjD1FxtgzZIxD24DqCUn9n5OIYh3diVWzH7vpKE60GWwTVB3hyULJLkHNH4laOBqhuTIdriRJGSaTkHTWHNvE3LeW5PY3sap2o2SXoGQVIrzZKNmloKhgmzixMHbtAczdK7Gbq1FLJqCPPx9t+DSEIt+KkjQUyb98qdscK0li6+skNz6D8Gajlp2DPuUyhGac+bmJGFbVThLr/k182YPoU6/EmHRhp54rSdLgIZOQ1C3mgQ3Elv0N4QthzLgBJVTUpecLw41WPhWtfCp2YyXm7pW0bnoe15xb0cYu6KWoJUnqb2QSkrrEiYWJLf0LVvUe9MmXoOaPPOtjKqFijFk3YjUcIbHhWRLbXiNxw6eArLMPWJKkfm3AzY6TMses3EHro9/AAVznvb9HEtDJ1OxSjIXvQc0bScVfvkZ843M4jt2j55AkqX+RPSHpjBzHIbH5JRLrnsaYdgVqweheO5cQCtrIGYTGTKJu6RNYBzfhvvhjKN5Qr51TkqTMkT0h6bQc2yL21p9JbnkV18L39GoCOpkWyMGYezsikEfksW9iVmzrk/NKktS3ZE9IOiXHTBB95Zc4sTCu+Xch9L5d1yMUBX3cQpTsUmKv/Ap92pUYU69ECNGncUiS1HtkT0jqkJOIEnnuJ2BZGLNu7PMEdDI1fwTGwrtJ7niL2JLf4JiJjMUiSVLPkklISuMkokSe/THC5UOffhVCyXzJEsUTxDX/TpxYmMhT92NHGjMdkiRJPUAmIakdJxkn8txPEN4Q+jmX9quhL6Hq6NOvRsktJ/LEt7DqDmY6JEmSzpJMQlIbx0oSffF/Uj2gcy7pVwnoOCEE+tgFaOPPI/LMjzAPbsp0SJIknQWZhCQAHNsm+upvwHHQp17eLxPQybSSiRgzryf2+u9IbF2S6XAkSeommYQkHMchvvzvOOFa9OlXI8TAeFuoOWUY8+8kseFZYisfkgtbJWkAGhifNlKvSmx6AfPwOxgzb0SoA2vWvuLLxrXgPViHNxN75Vdy5pwkDTAyCQ1xyf1rSW56HmP2zRmdhn02hOHBmHsbTiJC5JkfYEebMx2SJEmdNLAue6UeZdUeIPbGH3HNvgXFE+zUc0zbobLR5GiTSXPMJmk5GKog
6FEoDGqUhDRUpe/vJwlVQ59+DeaOt4g88W08V30eNVTS53FIktQ1MgkNUXakieiL/4Mx+ZIzbsOQtBzWHYjx9r4Iu6qSBNwKuX4Vn6GgKqnE1Bp3qG+1aI7ZjC3QmTncw4zhbrxG33W2hRDoE85H+LKJPHk/nos/jlY2uc/OL0lS18kkNAQ5ZpLoSz9DLZ2EWjLhlI+LJmxe2dbKku2tFAQ0JhQZnD/Wi+c0iSWatDlQm2Tl3igPr2lm1gg3V0z2UxDsu7eaVj4F4c0ituTXGDNvxJh8cZ+dW5KkrpFJaAiqfeF3CEVHG7uww+/bjsOyXRGeWB9meK7GbbOC5Pg6VzXBoytMKHYxodhFa9xm4+EY9z9Xy7QyNzecGyC7k8c5W2ruMMT8u0iseQKr7gDuhe8dcJMuJGkokBMThpjE1iVED2xGn95xIdDasMlPXqzj1e0RbjjXzxXn+DudgN7N51JYMNrLBxZm4QD3PV3DMxtbSFrOWb6KzlF82bgWvgen8SiRp76H3drQJ+eVJKnzZBIaQqyq3cRXP0b2+bcjtPSZcGv2R/neM7WUhjRunx2gsIeG0Ny6wsIxHt4zN8jWyjjfeqqG3dV9M5VaaC70mTeg5JQTefxezCNb++S8kiR1jhyfGCLsSCPRl3+BMfUKtGAuNEbavmfZDo+uaWbdwRg3zui55PNuWR6V66cH2FmV4NevNTBnpJsbZwQxtN6dTZcq9TMfJbuY2Ku/RpuwGNesG/tFYVZJGupkT2gIcGyT6Mu/QC2bjFo4pt33ogmbn79az57aJHfNDfZaAjrZuEKDe+YHOdJo8p1najhYl+z1cwKoeSNwLXov1pGtRJ78HnZzdZ+cV5KkU5NJaAiIr/gXQNpEhMaIxQ9fqMOlCW6c7set993bwWMoXDXFx8zhbh54uY4XNoexnd6/VyTcfow5t6Dmj6T18fuIb35ZlvuRpAySw3GDXHL3Csz963AtvKfdRISqZpMHXq5jcomL2SPcGSlYKoRgYrGLkpDGC5tb2VIR50OLQoS8vTtMJoRAGzULpWAkyU0vYu5eifv8D6LmlPbqeSVJSid7QoOYVXeQ2LK/Ycy8HmG429oP1Mb58Qt1zBzuZs5IT8YrZmd5VG6dGSDXp/Ltp2vZcCjWJ+dV/LkY8+9ELRhN5On7iS3/B068tU/OLUlSiuwJDVJOLEz0xZ+jT7oIJVjQ1r6/NsH/Lmng/HEeJhT1n1pxiiKYP9rDsByNf7zdxI5qkxumenH18hChEAJtxLmoxeNI7lhK60NfRj/3WoxJFyJUvVfPLUmS7AkNSo5tEX3ll6gFo9BKJ7W1761J8PNX67lqela/SkAnK83WuXtukLqwybefrmVPX03ldvkwpl6OMecWzL2raP3Xl0hsex3H6ptJE5I0VMkkNAjF334IJxlDm7C4rW1PdYL/XVLPpZN8TCjxZDC6M3PpCtfPzGb+aA+/fK2Bh1Y3ETf7aIFrsADX7JvRp11JcsdbtP7zC8Q3PCeH6SSpl8gkNMgkdizF3Lsa49xrEUrq17u7OsEvXqvn8sk+RuUbGY6w88Yem8pd0Why35M1bDkS77NzqzlluObcgjHzRqyKLYT/+QVib/0Fq/5wn8UgSUOBvCc0iFhVu4mv/CeuebcjjFRvZ1dVgl+9Xs8Vk/2MyBt49zi8hsJVU/zsq0nw1xWNDMvVuXVWkIJA37x1laxCjOnX4MRaMA9uIvrMDxHBfIyJF2JnXdgnMUjSYCaT0CBht9QSfennGFOvRAnkA7DjaJzfvN7AlVP8DM8deAnoZCPzDcpzdNYejHH/s7XMGenh6ql+sjx9U/VAuAPo4xaijZmPXb2H5I63OLDyn6jDpqOPPw+1ePyA2RZdkvoTmYQGASfeSvS5n6CNmoNaOBqArRVxfvdmA1dN9TMsZ2AnoOM0VTB3pIcpJS7e3hfjm/+uYd4oD5dN9pHr75u3slAU1KKxqEVjCbhs6retJfbWn8FMoI1ZgD5u
AWq2XG8kSZ0lk9AA55gJoi/+DJFThjZyJgAbDsb4y/JGrpnmpyx7cCSgk3ldChdO8DJ7hJt1B2N8++laxhUaXDDex8QSA6WP1j2pHj/6qNnoo2ZjN1djHdmaGq7zZKGNW4g+Zj6KN6tPYpGkgUomoQHMsW2iS34Lioo+KXV/YvnuCI+ubeaGcwMUZQ3uX6/frXD+OC/zRnnYVhnnoTVNRBMOc0a4mT3Sw/Bcvc8W4irBApRgAdqE87FrD2Id2UJi7b9RC8agj1+ENuLcDiuXS9JQN7g/pQYxx3GIvflHnNZ6jFk3AYIXNod5ZVsrt8wMkusfOhWiDU0wrdzNtHI3tS0mO6oS/PaNRkzbYUqpiyllbsYXGX2y1bgQCmr+CNT8ETiTE1hHd5HY/DKxt/6CNuJc9HGLUIsntM1clKShTiahAchxHOLLHsSu2Y8x91ZsofLQqma2VMS5fXaAoHvoJKB3ywto5AU0Fo6BulaLfTUJXtwc5g9LTYqDGhOKDcYXGYwpMHq9YKvQDLSyyWhlk3FiYcyKbcSWPQiJCNro+antJXKHZbxskiRlkkxCA4zj2MSXPoh1dCfGnFuJ2hr/91o9kaTDbbMCfVoJu7/L9ank+jzMGgGm5VDRZHKoPsm/17dwtMmkIKgxvtBgfJGLsYUGPlfv/eyE+6T7Ry01WEe2E33xf0DV0UbPQR81ByWnXCYkaciRSWgAcWyT2Bt/xK4/jDHnVo6EFX79Wi3lOTqXT/agKPID7FQ0VTAsR2+bKWjaDkebTA43mDx/rKeU61eZUGQwqcTFuMLe6ykpgXyUCflo4xfhNB3FqtxB9IUHQChow89FG56qZSdr10lDgUxCA4STiBJ95Zc4yRj6rJt4c6/JE+tbWDzOy6QSecO7qzRFUJatt80etGyHqmaTg/Umz2wMU9FkUhrSmFziYnKJixF5OmoPJ3khBCJUjBIqRpuwGKelpm3Bsd1cg1o4BrV0ElrxeJS8EQhV/rlKg498Vw8AdksN0Rf+BxHIJzLxWh58M0xti8lts4bWBITepCqCkpBOSSiVlJKWw5FGk4N1SVbvj9IctRlbaDCl1MXEEhf5PfxzF0Igjs2wY+wCnEQUu+4gVs1ekjvewgnXo+SUohaMQs0fiZI7DCVULHtL0oAnk1A/Zx7cSOz136OMmsvS+HiefqaeqeVuLpkY7PErc+kEXRWMyNUZcazSRGvc5kBdkg2HYjy1IYymwrThPkblKowtcPX4xYAwPKjF41GLxwPgmHHsxqPYTUdJ7l6BvfZJnEgDwp+Lkl2Kkl2Gml2CEipGCRXJ6eDSgCGTUD/lJOPE336Y5N417Cq9jsfecaGpUW6ZFSCvj6oDSCf4XAqTSlxMKnHhOA51rRbVrYJlu6I8tKoZXRWMyjcYXaAzPDc1zNeTU8KF5kLNG46aN7ytzbFMnNZ67JZanHAtyapd2OE6nNZ6hDuIklVEbekIEu4ClOwS1FAJwu3vsZgkqSfIT7N+yDy4kchbf2WbOoGX4jfRsg0WjHYzpqDvFl9KpyaEIM+vMaLIzaRCFcdxaIjYVDaa7K5OsHx3lJoWE59LoSSkUZylUZilke/XyA+oZHtVNPXsf49C1U4M4Z3EcWycSBNOuA7HbMY8sB5nyyvYLTUI1Uj1lLJLUbPLUEJFqWE9X7asfSdlhExC/YhVu5+apU/wdqXBssSVaLrOzBFuxhX2XSkaqeuEEOT4VHJ8KpNJDYPZjkNTxKau1aKu1WLT4TjNkQhNUZuWmI3PpZDtVcn2KW3PTX2d+neWR+n2cKsQCsKXDb5s/CEvZmMESK0vI9aS6i211GEe2YKzc2kqWSWjqaG9QD4ikIfiz0XxhhCeLITbn+pBGZ7UMJ8qL4akniOTUIY5jk319k2sX72BdbUeDltTGFPo5pJxLkpCmvxjH6AUIcj2pZLKmHd9z7YdwnGblngqIYVjNvtqkryTiBOO2TRHbSIJm6AnlaDy/Cr5AZW8QKo3ledX
yfIqXb4wEUKAJ4jqCUL+yHbfc8xEqvcUacSJNmPXH8Gq3IGTiOAkYpCM4SRjYMbBtkFRQdVAKCCU1LEVBRCpNkUBRUMoGmg6QjNAdyMM77GkFkB4gijeLOJmKXbSnWqT7/chRyahPmbbDpV1Yfbs3M/O3YfYUW0RdXRGZBUycUI2V+W70HtgqEbqvxRFEPSoBE+zDYVlO7QcS0hNMZvasMW+2iRNUZvGiE0saZPlSfWacv0qOb5UzyroUQm6FfxuBa8h8Fud25FWaAYimA/B/DM+1nFssK3Uf46dSko44Dip3pbjpNodCywLxzbBSoKZxDFjOIkYTmMlTvUenHgr1asjJMONYCUQvhyUQB5KsBAlq/DYcGMhSjA/lcikQUcmoV5g2w5NrQnqm2PUNsWorm2i4mgdFbURqsIOXmIUGq0UBTWuOCebgvwQiqwlJp1EVQQhr0rI23GiSlrHklTMpjlq0dBqcaTBJJKwiSQdogmbaMIhblajiNRsP00VaErq2KoCiqCt5yEAIU60KYK2x+pq6j9DE7h1gUcXeAwFjyHw6gpeQ8fnEnhdCr5j7V3ppYVCXhobI6neWLQp1SNrbcSs2o2zby1OpAGntRHh8qWGCoMFiGABaiAvNYToy0nd09LljMCBaEAloeMVAbpaGSAcTZJMWhy/JnRIjY87zvF/g2M72I6DZaeu5izbwbJsTBvMSDPxRAIzaZMwLZJJi1jSIhY3U3/scYvWhE0k4dCaFCQsUIVDQE3gI4JfRMlz2YwpVMkZ58UI5CGMYT36s+kK23b6/U1oGePpGRrk+lVyzzDZze3RCUcSWFaqSoR1rKNiO6m/AfvYH4XT9j/HvwcWDo4Nlk3q78BOlT9KWA6tUUi2OCQsk4TpEDchZjrEkw4OqWTm1gQe46TEpQtcxxKZoQlcaqr4bKDWxkya6IpAVUJoWjZKNqg5AlUAAhRAmBGIhxHxME5zK6JuJ068FScRwW82ogsLXH4Utw9cPjB8CMOTGgLU3QjdANVI/b9iINTjQ4oqQlFTQ4xCtA0xpv6d+jrZFINI5FgwSupemRw67BHCcZzO9dclSZIkqYf170tNSZIkaVCTSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJEnKmAFVOw7ANC0aGiKZDuOUsrO9/To+kDH2FBljzxgqMebnB7r0+Lq6MLY9OKqqne61D7iekKaduvx9f9Df4wMZY0+RMfYMGePQ1qtJKBwOc80113D48GEAHnroIa655hquvfZavvKVr5BIJHrz9JIkSVI/12tJaOPGjdx5553s378fgH379vGHP/yBf/3rXzz11FPYts0//vGP3jq9JEmSNAD0WhJ6+OGHuffeeykoKADAMAzuvfde/H4/QgjGjRtHRUVFb51ekiRJGgB6fT+hiy66iL/+9a+UlZW1tdXX13PLLbfw/e9/n7lz557V8W3bpra2lvr6BizLOttwBz2Px015eTm6rmc6FEmSpL6fHVdVVcWHP/xhbr755m4noJqalrZ/19dXI4QgKysfVdUyvtuhpimYpp3RGE7FcRxaW5s5dOgQgUB+psM5rfz8QLvfc38kY+wZQyVGOTuuY306O27Pnj3ccccd3HjjjXzyk5/skWMmEjFCoVw0Tc94AurvhBD4fEGi0VimQ5EkSQL6sCcUDof50Ic+xGc/+1luuOGGHjyygxADbqZ5xshELUlSf9Jnn96PPvootbW1/OlPf+L666/n+uuv52c/+1lfnV6SJEnqh3q9J7RkyRIA3v/+9/P+97+/t08nSZIkDSByHEuSJEnKmAFXO66/iEQi3H//tzh8+BCKIhg/fiL//d9f5a233uKPf/w9ppnE7XbzyU9+lnPOmcr993+LaDTKd77zA/bu3cOnP/0xfvGL/2PEiJGZfimSJEkZI5NQN7355mtEIhH+/Od/YFkWP/nJ9zly
5DC/+c0v+fnPf0NWVoi9e/fwuc99gn/969987nNf5IMffA/PP/8M//jHX/n0pz8vE5AkSUOeTELdNHXqdP7v/37Ff/7nR5g9ey633nonq1e/TW1tLZ/5zCfaHieEwuHDhxg7dhzf+tb9fOQj7+fyy6/issuuzGD0kiRJ/YNMQt1UUlLKv/71BOvXr2Xt2tV87nOf4J57PsCsWbP51re+3/a4qqqj5OWlFoYePHiArKwsdu3aQTKZlFULJEka8uTEhG564olHuf/+bzFnzjw+8YlPM2fOfJqbm1m1aiUHDuwHYMWKpbzvfXcSj8eprKzgZz/7KQ888EuGDRvBr3/988y+AEmSpH5A9oS66Yorrmb9+rXcffetuFxuCguLuOWWOxg9ejT33vtVHMdBVVV++MP/h2EY3Hff17jzznsYNWoMn//8l3jf++5g1qy5LFiwKNMvRZIkKWN6vYBpbzi5htPRowcoKhqewWja68+1446rrj5EQUF5psM4raFST6y3yRh7hqwdd3b6Te04SZIkSTqZTEKSJElSxsgkJEmSJGWMTEKSJElSxsgkJEmSJGWMTEKSJElSxsh1Qr2gsrKCO++8iREjRrVr/+EP/x+FhUU9fq5PfeqjPPro0z16XEmSpL4gk1AvycvL589//kemw5AkSerXhmwSWrHlKI+/sYe65ji5QRc3LR7N/Mk920t5t/r6On784/upqqpCURQ++tFPMnv2XP7wh99SVXWUQ4cO0tjYwHvf+0HWrl3N1q2bGTMmVfjUsix++tPUNhD19fWMGTOG++77XqeOL0mS1F8NySS0YstR/vL8dhLHKhvUNcf5y/PbAXosEdXW1vD+99/V9vVll13Bjh3buPrq61i0aDG1tbV84hMfaust7d27h9/85o+8885GPvOZj/OXv/yL8vJh3H33rezevYvW1jCapvPb3/4J27b59Kc/xooVyxg/fmLbOX72s590eHyv19cjr0mSJKmnDckk9Pgbe9oS0HEJ0+bxN/b0WBLqaDju6qsv5sCBA/z+978FwDRNjhw5DMDs2XPRNI2iomJyc/MYOXJU23FaWpqZMWMWwWAWjz32MAcP7ufw4UNEo9F2x1+zZlWHxx87dnyPvCZJkqSeNiSTUF1zvEvtPcWybH7+818TDGYBUFtbS3Z2Nm+++TqaduJXoapq2nOXLn2D3//+t9x66x1cddV1NDY28u6yf6c6viRJUn81JKdo5wZdXWrvKTNnzuLxxx8BYN++vbz3vbcTj8c69dw1a1Zx0UWXcPXV1+H3+1m/fi22bfXY8SVJkjJhSPaEblo8ut09IQBDU7hp8ehePe/nPvdFfvSj7/G+992B4zh84xvf7vT9mmuvvZFvfetrvPLKi2iazpQpU6moqGDmzJ45viRJUiYM2a0cemt2nNzKoWcMlfL+vU3G2DPkVg5n53SvfUj2hCA1C663p2RLkiR1l2PbgMh0GL1uSN4TkiRJ6vfsZKYj6BMyCUmSJPVDjimTkCRJkpQplkxCkiRJUqaYiUxH0CdkEpIkSeqPLJmEJEmSpAxxkjIJSd1UWVnBokWz+NGP2le53rVrB4sWzeK55069988tt1xLZWVFb4coSVJ/J2fHSWcjKyuLt99egWWdKK3z6qsvEwrJWm6SJJ3ZUOkJDdnFqoldy0msfgwnXIfw52LMvhlj7IIeO77H42Xs2HFs3LieGTNmAbBq1UpmzZoDwGOPPcQLLzxHLBZF13Xuu+97DBs2ou35lmXxq1/9jPXr12JZNldddQ233/6eHotPkqR+Ts6OG7wSu5YTf+vPOOE6AJxwHfG3/kxi1/IePc+FF17Ka6+9CsC2bVsYM2Ysuq7T2trKm2++wS9+8VsefPBhFiw4j8cee7jdc59++gkA/vjHv/O73/2Ft956g40b1/dofJIk9V/OEElCQ7InlFj9WPr0RzNBYvVjPdobWrTofH73u19j2zavvvoyF110Ka+++hI+n4/77vsur7zy
EocOHeTtt5en7fmzZs0qdu3aydq1awCIRiPs2bObadPO7bH4JEnqx4bI7LghmYSO94A6295dXq+XMWPGsmnTBtatW83HPvafvPrqS1RXV/HRj36Am2++jXnzFpCTk8uuXTvaPdeybD7xiU+zePFFADQ2NuLxeHo0PkmS+i/HNDMdQp/o1eG4cDjMNddcw+HDqd1Dly9fzrXXXstll13GAw880JunPi3hz+1S+9m46KJL+M1vfsH48ZPaNq5zu92UlZVz++3vYeLESbz55msd7g301FP/xjRNIpEIn/jEh9iy5Z0ej0+S+gudJIYTR4jBX7SzMxx7aPSEei0Jbdy4kTvvvJP9+/cDEIvF+OpXv8qvfvUrnnvuOTZv3swbb7zRW6c/LWP2zaAZ7Rs1I9XewxYuPJ9du3Zw8cWXtrXpuo5t29x996188IN3M3z4CCoq2k/LvuGGWygvL+cDH7iLD3/4Hq666tq2CQ6SNJg4VhJX/U7CT/+Qpke+ibLnDXSrNdNhZd4Q6Qn12nDcww8/zL333ssXv/hFADZt2sTw4cMpL0/tY3PttdfywgsvsHjx4t4K4ZSO3/fprdlxxcUlPPpoai2Q1+vl1VeXtX3va1+7D4Cbb769w+cefx7AZz/73z0SjyT1Z/GKPdQ++n0gtXdO4yt/JHQpiFGL07awH1JsmYTOyve+136hZnV1Nfn5+W1fFxQUUFVV1VunPyNj7IIenYQgSVLXCQHRQ1s5noCOC695jqyRc0ngykxg/YCcHdfDbNtuN9brOE63x35P3qWvulpB0/rXTPP+Fk9HurrLYybIGHtGf4+x+UD6FvSqN4g/y49quDMQUcf6+ufo0hXy+vnvrif0WRIqKiqipqam7euamhoKCgq6dayTt9m1bbtfbac9ELb3BobEdsq9TcbYM7KGTULxBrEjzcdaBMEFt1LflAT6R28gE9t7x6Pxfv+766x+sb33tGnT2LdvHwcOHKCsrIxnnnmGm2/uiYkAAsexEaL/9z76gyE9xi71S0Z+Obm3fhPz6A7seASjZDyJQNm7R+iGHsc682MGgT5LQi6Xix/84Ad86lOfIh6Ps3jxYq644oqzPq5huGlsrCUQyEZVNTm98zQcx6G1tRmPp/8McUgSQMyVB8PzEAJiDjIBAQyRC8ZeT0JLlixp+/f8+fN56qmnevT42dn5hMNN1NdXpa21yQRFUbDt/jscp2kGY8aMpLExlulQJCnNEPnc7RTH6b+fIz1pwFdMEEIQCIQIBEKZDgUYGGPwuq4DMglJUr/Wjy9me5K8kSJJktQvDY1uoUxCkiRJ/dEQGZuUSUiSJEnKGJmEJEmS+qOh0RGSSUiSJEnKHJmEJEmSpIyRSUiSJEnKGJmEJEmSpIyRSUiSJKk/GiIVyGQSkiRJkjJGJiFJkiQpY2QSkiRJ6o+GyI4AMglJkiRJGSOTkCRJvUZRBC47jMsOoyhD48q+xwyRjToH/FYOkiT1T5odw9n7NvXLHgUcggtuQR85l6TiyXRoA8JQ2S16aLxKSZL6nFK9g8ZX/oQdbcGOhml89c+Iqu2ZDmvgkElIkiSpezRNIbL1rbT2yOY30DT5sdMpQ2T0Ur4bJEnqcbbtoOWWpLVruaXY9hApD322hJrpCPqETEKSJPU423Zwj1uA4va1tSkuL+6Ji2QS6ixlaCQhOTFBkqReEfcWkXP7t7DrDgKg5g4j7s4bMvvknDVlaPQRZBKSJKlXOA7EXXlQkgdAEmQC6gIxRHpCQyPVSpIkDTTynpAkSZKUMapMQpIkSVKmKEPjbolMQpIkSf2QkElIkiRJyhg5MUGSJEnKFDFEpmgPjVcpSZI00MiekCRJkpQxsoCpJEmSlDEyCUmSJEkZI7f3liRJkjJFDJG9HGQSkiRJ6o9kT0iSJEnKGGdoVHuVSUiSJKkfcoZIyXGZhCRJkvoj2850BH0iI0noySef5Oqrr+bqq6/mhz/8YSZCkCRJ6udkEuoV0WiU733v
ezz44IM8+eSTrFmzhuXLl/d1GJIkSf2bZWU6gj7R50nIsixs2yYajWKaJqZp4nK5+joMSZKkfs2xh0YS6vNa4X6/n8985jNceeWVeDweZs+ezYwZM/o6DEmSpP7NTmY6gj4hHKdv5wFu376dL3/5y/zhD38gEAjwhS98galTp/LhD3+4L8OQJEnq1xqWP0b2gpszHUav6/Oe0NKlS5k/fz65ubkA3HTTTfzjH//oUhKqqWnprfDOWn5+oF/HBzLGniJj7BlDJcb8/ECXHh9tacXs5z+Xzjrda+/ze0ITJkxg+fLlRCIRHMdhyZIlTJkypa/DkCRJ6tccM5HpEPpEn/eEFi1axNatW7npppvQdZ0pU6bwkY98pK/DkCRJ6t+S8UxH0Ccyson5Rz7yEZl4JEmSTsMxY5kOoU/IigmSJEn9UUL2hCRJGkBUTLRwJXZTNYovhJVVCnTtZrjUfwyVnpBMQpI0CAjhIPa/Te2Lv2tr8597OdZFd2cwKumsJKOZjqBPyOE4SRoEjEQDja/+pV1beP2LJGsPZSgi6Ww5iaHRE5JJSJIGg0Sswym9ViycgWCkHuHYONbgr5ogk5AkDQKONxs9r7Rdm9Bd6KGiDEUknS2he3CGwEWETEKSNAgkhIfsqz6Nq3wiAHpuCXk3fxnjXYlJGkB015BIQnJigiQNElF3Ib4rP08wGcbW3MSER86NG8BSPaHBUbbndGQSkqRBJIlOUs/OdBhSTzA8ONHmTEfR6+RwnCRJUn9kuGUSkiRJkjJD6B7sSGOmw+h1MglJkiT1Q8Lw4sgkJEmSJGWCcPlwWhsyHUavk0lIkiSpH3IMD3ZrY6bD6HUyCUmSJPVDlu7HiciekCRJkpQBCXSwLZzE4C5kKpOQJElSP5SwQHizsMN1mQ6lV8kkJEmS1A8lLAfhycKRSUiSJEnqa4mkg+IJyp6QJEmS1PfipgOeAHZTdaZD6VUyCUmS1GsUxcFltWAwNDZo60lx004Nx7XUZDqUXiULmEpSBgghMBJ1OC21CHcA01uAhZrpsHqUy2wktv55mt95DTWQS+ii95HMG4ftyGvfzkiYoHizSB7cmOlQepVMQpKUAa6mPdQ+/qNjWzgLshbeijbpEkxhZDq0HqEqDrH1zxNe/yIAZkMltY//iPy7vkPMJ/c46oyE6SC8IeyW2kyH0qvkJYkk9TGXE6Hh+d8cS0AADk3LHkYNV2Y0rp6kJcO0vvNa+0bHxqo/kpmABqCE5YDhBdvEibdmOpxeI5OQJPW1RASzg5vN9iCqE+YoOqo/J61dGJ4MRDMwJUwbIQTCl409iO8LySQkDQqKIlDVnn07q6qCEKJHjwnguPzoHWy7rQTyevxcmZIQbkIXvx848fPTi0YickdkKqQBJ2Gl/l/xhrCbB28SkveEpAFNCHBFKontWoXVUodn/ALs3FFndW8llrTYfrCJNTuqGFceYvrYfILunvtTSeAmdPknqH/qp1gt9QhVJ3Tx+zB9RT12jv4gmTeO/Lu+g9VwBGF4EbnDiWtyw/HOSloOAMKThd1cleFoeo9MQtKA5opWUfvQt9vqa0W2vEnOtZ9BlJyL43TjgAKeXr6f51ccAGD5pkqGFR7my/fMxK31XE8r5isl+/ZvQ2s9wuUj4crF7k68/ZjtKMT8ZeAvy3QoA5J17A0hfCHsxsGbhORwnDSgmVV70go8tix7GN2Jd+t4ja1JXlx5oF3bwaoWKusj3Y7xVOKKn3hgGDFj8CUg6eyZx3tC3mzspqMZjqb3yCQkDWy2ndbk2DbQvU915xTPdGSWkPqYeeytLXzZOM2Dt2qCTELSgKYVjUFo7e//BOfdSFK4u3W8kFdn8bnth48KcjwU53q7HaMkdcfx6x7hCeIkWnGSg7PqhLwnJA1ocW8Rebd/k8jGl7Fa6vBNuwS7cEL37geRmst1ywWjGV2axbJ3Kpg0IodF00rw6IOrmoHU/1nHe0JCIHy52E1VqHnDMxtUL5BJSBrQ
HAdivjL08z6IwbEFfmfJa6gsOqeI86cW4Thgy6E4KQPsk66kFH8OdmOlTEKS1F9ZPZB8TuY4DpbVo4eUpC45+f0nfNlYDRXomQun18h7QpIkSf2QdVJPSPhzsesPZzCa3iOTkCRJUj9kntQTUgJ52I2Ds+6eTEKSJEn9UNI6sfxA+HJwwnU4ZiKDEfWOjCShJUuWcNNNN3HllVfy3e9+NxMhSFKvM+wIRvMBXOHDaN1cPCsNXXHzpOE4VUvNkGuoyGBEvaPPJyYcOnSIe++9l0ceeYTc3Fze97738cYbb7B48eK+DkWSeo07UUvjsz8nWXMQAM/4eXgXvYeEKmunSZ1zchICULIKsGr3o+aPyExAvaTPe0Ivv/wyV111FUVFRei6zgMPPMC0adP6OgxJ6jSXFcao3Y5etQl3oo4zFdZWFIi9s6QtAQFEd6zEqdrZy5FKg0nCdNpK9wAowQKsmn0ZjKh39HlP6MCBA+i6zsc+9jEqKyu54IIL+OxnP9ulY+Tn9++ryf4eHwydGB3HwUnGUYzuVVBINhzl6L9/TLI6VU9OGB6K33Mv7pKxp4zRirVSsT99S2azai/50y7oVhxnY6j8rntbX8foc6tgGIQCqYnZibKRtKx/eUD8rLqiz5OQZVmsWbOGBx98EK/Xy8c//nGeeOIJbrrppk4fo6ampRcjPDv5+YF+HR8MnRjdiTpiW98gvn8j7jGzcI1fSExP32jtdLSDm9sSEICTiNKw9HHcF32M7LysDmNUFHCPOpdkXfvZTFrh6D7/uQ+V33Vv64kYu5o8fIZgb0UYrdAFgKNkkairoLqiFqG7ziqWvna6197nw3F5eXnMnz+fnJwc3G43l1xyCZs2berrMKRBzmVHaHzmZ7SseopE9QGalz9G88v/16Xq2ooiMBvTqxcnaw6gOMlTPs+2wT35QozCkW1t3knnIQrGde1FSENa0KNS1XxinrZQNZSsIqyavRmMquf1eU/owgsv5Etf+hLNzc34fD7eeustLr744r4OQxrknJYqkrUH27XFD28n0FrT6f1tbNvBVTqed1//+iafj6mcfngvZuQSuO6LiHANKBq2L5/EoFzvLvWWkEdwuKH9lGwluwSrYjtaycQMRdXz+rwnNG3aND784Q9z1113cdVVV1FSUsLNN9/c12FIg5xQTlFw9FTtp2DljCR08fsRhhuEgm/KhejjF3WqnlxCeIgHhhH3lZCUCUjqoly/xoHa9j1uJaccq2JrhiLqHRmpHXfLLbdwyy23ZOLU0iDlsiMQrgbNheXNx/YX4B45ndi+DW2P8U5chOXN69JxTeFCjLmA3OHnIhyLpBEi7sg13lLvyw+oHG5IYloOmpqakqnklpFY/xSOGUdoA+u+0KnIAqbSgOeJV1P/9AOY9ZUA+M+9DGPGDfgv/ACe8dtJVOzAVT4JUTS+W0NijgNxLevYFyfarXgEV7wGNBdJPUtW25Z6lEsV5PpV9tUmGVuY2jNLaK7UfaHKnWjlUzIcYc+QSUga0FRhE179VFsCAgivf4nckdOJ502CYXPRR84j2cNVtt2JWqoe/hOxg1tQ3H5CF38AUTodC7nvkNRzynN0Nh+JtSUhACV3OOahTYMmCXV6XKGpqYlwONybsUhSl2lWjNj+9NmVZt2RtkWlPb3NgyZMwkv/SezgFgDsWJj6Z/8XPVx5hmdKUteMzNPZcKj9jE61YBTmwfR1aAPVGZPQ3r17ufnmm5k/fz5z587l7rvvpqJi8NUvkgYmU3XjHp5+RajllHR7d9Uz0ZJhorvXprVbjTIJST2rJKTRErOobjbb2kRWIU4iit2UvnxgIDpjEvrKV77CrbfeysaNG1m/fj2XX345X/va1/oiNkk6I8tR8M25Hi2rsK3NN+1inNyRp3nW2bFVF1p2cVq78Gb12jmloUkRgjEFBqv3R9vahBCohaNJ7l+fwch6zhmTUDQa5Y477kDXdQzD4J577qG2trYvYpOkTom5Csi6
9V5yb7+XvPfcjzH3DpKKt9fOlxAesi/9ECgnbql6x8+DUOfWH0lSV4wvMnh7bxTnpK69WjAGc396b3wgOuPEhFGjRrFu3TpmzJgBwM6dOykrk39sUv+SULyQ1Xu9n3eLZ4+m7EM/IlJ9GMXlww6WkFA8fXZ+aegoDWlEkw6H6k2G5aZmdyp5w7E3PIMdaUIZ4D3wMyahiooK7rnnHsaPH4+maWzdupX8/HyuvfZaAJ5++uleD1KS+hvHERgFw2kSXatFJ0ldJYRgYrHBij0RhuWmEo5QtdQEhQPrMSZekNkAz9IZk9AXvvCFvohDkiRJOoWJxS4eWdPCLbOCqEpq2qdaOBZzz9uDNwnt2bOH0aNH4/P5Ovz+5MmTey0oSRpKFAWiSRtdVTKz1bHU7+X4VIIehe2VCSaXpiolKAWjSLzzIk4sjHD7Mxxh950yCf3oRz/it7/9LbfeeivFxcXtbopFo1FWrFjRJwFKZ6aqAgHtNsCSOiYE6HYUYSWw9ABmhkvwtCYs9lY0U9MYxePSGFUapDDL3a4ygyQBjCs0eHtftC0JCc1AzRuBeWA9+vjzMhxd950yCf3whz+ksbGR0aNH8+CDD+I4DkIIkskkd999d1/GKJ2CwMHVcpDW9S9ix8L4zr0MO388piyW2SEhHIy6nTS+/EfM5hq8E+bjm3dzl/YYips2h2rCHK2LUF4UoTjkxtC6l8iEAodqwvzr5Z1U1rUCsGh6CTecN4ocn3GGZ0tDzbhCg7+tbMK0HbRjQ3JK0ViSu1cOziT0X//1XyxbtgwhBPPnz29rV1WVyy+/vE+Ck07P1XqEmoe+DXZqz5HY/k3kXvc5KJLbpXfE1XqUmsd+CI4NQGTbMhzbxH3BRzCdM5fbsRyHx9/cy8urTmwRcd15o7h+4QjOsON3h6JJm9fXHWlLQABLN1Qwa0KhTEJSmoBbIeRV2Xk0waSSVG9ILRxNcvPLOPFWhKvjWyf93Skv4f7whz+wfft2brjhBrZt29b23+bNm/npT3/alzFKHRBCkDjwTlsCOq5l9VPowjzFs4Y2q7GyLQEdF92xCi3RuR0za5vj7RIQwNNL99IQ7vxGeSezbdh5oCGtvao+0lZySJJONiJX553DsbavheZCzRuOuX9dBqM6O2ccR/j+97/fF3FI3aGk//qEokG3rsv7H8NuxdWwG6N+Jy7rRKLQSOIKH8Ko2YorXoMQp7+BIgS0xEwSHWxEp/pDJIXG/upW9lWFiSbtDo6QkkhaaW2OA4loBM2J42o5gFG7FXeiDnEsi5zqNQD4XAqTR6UPBZYV+DF7uCK3EOAyGzFqt+Nq2ofuRNO+3xhJsrOimcqGWI+cX3MSuMKHO/17ks5sRK7O1sp3bXRXNI7knrczFNHZk1W0ByjHcXANm4LQHsMxT7wpA3NvIN6JoaX+LlFfSfNTPyFZvR8ALVRA9vVfxDQCmBuepmH1MwAIVSf3pv8mkTPulLXiqprifOePb3Pp1BAXlU7GObLl2HcEWRd/gO89sovtx3okRblevnj3TEKe9PtqhUGNvJCb2sYTV6IleR7yRCOJ1c/RuuHl1FF1F3k3fwXHk0XTsz9Lew0xV2pPI8eGmy4Yw54jTRytiwBwwYwy3t58lIraVi6cVozSQ10iV2sldY99DzuaKkLsHj0D3wUfJKH6EQL2VbXy/b+uJmmmkvC1i0ZyzYIR6Er3zq85CcxNz9Lw9pOpBlUj74YvkMib0Gs1/YaCgqBKXatFa9zG50pdhA70ITk5I3QAi3lLyLv9XgKzr8E35QLybvs6Zt7YTId11oSAyK41bR/eAGZjNbHtb6FFamk5loAAHCtJwwu/QbdaOzgS2MDfX9xOa8zk36tqec1zOeGFn8R/2cfJf893Wd1S2JaAAI7WRVi2sRKlgw9fnwFfvCKXOeNC+Dw68ydm85nzXHjjNbRueOVETMk4zW/+Hevghg5fw8nHzvbqfOW9s7j7igncdfl46ppivLnhCH9/
YTu1Le2veLtLExYtKx5pS0AAsT3roHZf6t+mza8e39iWgACeXrqPqoZo2rE6S22ppOV4AgKwTBpe/A2GKSvxnw1VERQFVfbXndhxdaAPyckkNMDFfKUoM29FX/RB4qExWIOgc6soCrEju9LaE4e3gZV+/8VqqUckO/7ATJg2e440tX39xKpavvZ0CyuiIzGzylmzoz7tOVv213V4TyaJi9IgvNf7Bt9bUM979Jfxb3oY1Rfi3XOqHTNBoqKD13BoK8q7hqWicZO/vbCdf7y4g3f2nKjL2NTNe03vplgxEpW709rNpiqEgGjcate7O67xLM5vR5vS2qxwIyQj3T6mlJIf0DhU/65tv4vGkdy9MkMRnR2ZhAYB23awrFPfyxhoLMvGN2ZGWrtn/Hww0guTGgUjsV2BDo/l1lVmTyxMay/N82GaNjMmFKR9b8GUEuxT/DiTuWPJOe9WcvLzyJ11BaEbvoStpd9rUvy5uEed2+FrsOz2GS7oNcgPta87p6mCvFDP1KKzNC/esXPS2rW8chwH/B6N0aXp9ccKsrtfBFYJ5PPue5N6XjmOO9jtY0opOT6Fisb2k4/UwjFY1Xuwo80Ziqr7ZBKS+iXPyKn4z70stZgG8E5ciDZiJgl3PjlX/yfCSH1A67klhC7/CElcHR/Icbjx/NFMGJ6derym8N4rJ1Cam/qAnTgsm8vnDuf4CNn500uYMTav3eLsk9mOQsxXhjliIf7JC4nr2Zj+ErIv/w+EnopBzx9G8Lw7oGhi+msYOTPt2G5N4XN3ntuWiPwenf+6aybZ3p5Z72XZAs+5V+Eqn5RqUDWyzrsDOzQCAE0IPnbjFErzU/cTPC6NT982jfxg96eJJ70F5F77KYQr9XPWsosJXflxEqQnbKlrsr0qR5vaJyGhGagFo0nuWZWhqLpPOKf6a+vHamo6N6U2E/LzA/06Phg4MdbXNqHF6sBxsNw5mMeGGhVFoMcbIBnB8WSTEGfuMZgONLUmMDSFoEdPSwSNkSS245DtMzo9t/Dkn6MQYCQaIBk7FlPqw1YVNlq0Dmj/GjoSt2yaWhP43Dp+l3bKRNhZLieCYbaQwCBhZKM5CZRoPULVSbhysJ32rzRpOTS2JvC4VIIeHfssZ8gpikBPNEDi9L+ngfJ+PNsY8/M77q2fyso3VhGLpQ+JNkctHlrdwk9ua9/Dt6r3YO5dg++m+84iyt5xutc+8G8gSIOW5ShYrvy0dtt2iOsh0EOdOo5GElfDPpQdy1F82bjGziHuLW43Syt0lr0Ox4G4ns27i1VYjoLlTn8NHXGpCgXBVPJSFIED2N0sxeSOVNDwzM8wG6sQupvsSz+EVTaDpKfoWMDpz9FVQX4w1Zs72wR0/BhxLQRa6KyPJZ3gdymE4zaW7bQVMwVQ8kbibHoRq+EIanZpBiPsGjkcJw16ytGt1D56P63vvE7LyieofehbuKJVmQ6rQ5btsKuymQce3shvn9rK4fpIl5d9GU6Mxhd/g9mYeo1OMkb9c7/CiFT3QsRSX1MUgddQaI62v3EpFAW17ByS217PTGDdJJOQNKjpxGle/mi7NicRw6xMn7nWH+yubOH7f1nDhl01rHinkvt+t5LKhvSZa6cj4i0kaw6+q9XBapZJaLAIuAWN0fTF0+qwaSR3LcdJ9szMyr4gk5A0qAnAsdLLGDl2+h9wximCp97a267JdmDt9uoO1y2dimN4UQPplRgUX+hsI5T6Cb9LoTGSPoVT8Wah5pSR3Lk0A1F1j0xC0qCWFC6C825s36hq6MW9t6hXF0l04l2u/yYc0LX0J2lq1w6UVH1kX/FxhHZidltg3o1Y/uKuBST1Wz6XQkOk4wspdeQsEhuf658XWh2QExOkQc1xwCmdRu51n6V1w0so/hx80y8n7ivp8T17VEy02p00L3sEJxknMOd6nLJpmB3UrOs4VofrzxvNpt11bW2aqjBjfEGXJgo4DiRyxpJ79/dRI3VYug/LVyC3+BhEAm6F
upaOCxWrOWWYbj/m7hXo4xb1cWRdJ5OQNOiZiguKpuO5ejoOgpjl9MqmcVrjAWof/1Hb1/Uv/Jqcq/8TSmd1+hjDC3zc9+G5LNtUgcelMf+cYgqzXF2ut+Y4EDdyyS8d0e+nP0tdF/K0L93zbtrYhcRXP442ei5C7d8XHzIJSUOGaUFvbVmqKIL4nrVp7eF1L+Avn0HS7tzItyIEw/J8jLx0PI7jYNuOLPgppcnxK6zce+otW9TccqxALol3XsI1/eo+jKzr5D0hacjQieMKH8LVegSdU19FdofjgOJNL0mj+rJwurG1hmXZPbJWRxqccrwqjVGL2Gm2HtEmXEBi47PYrel7VvUnMglJPS5pO1Q0RNlXFSbSwR483aU7MVzNBzDqd6XtzXMmXrMe9q0kvuU16h76FtE3/oBh9lydLcdxMEZMbytTA4Ci4p91LabdM9sxSNJxiiIoDGrsrz31xZTiz0EbNp3Y0r/2YWRdJ4fjpB4VTVr84+VdLNtUAUAo4OJr75tNrv/stqs2rDCRpX8juiNVKVj1h8i96ctE3UWnfZ4Q4G45RN3zv8JsqEQLFZJz4d00vPUw7pHbYNjcs4rrZHFvEXm330eycgckE+il44n7Bs7KdWlgKQlpbD8aZ0LxKeomAtqY+cSX/pXE7pUYY+b1YXSdJ3tCUo86WN3aloAAGlviPLJkF51d9q8oAkXt4G1Zt68tAUFqW4CWlU+gidP3tAyzmdp//xizoRIAs7GKhrceJjD9YuKHtqB2dK7TEAISlkNrwiJhtj+340DMXYA18jzs8RcT85V1ayhOkjpjeI7O5iOnX5QqVA1j2pXEl/8NO1x32sdmiuwJST3qaF365nLbDzQQMy3c2uk/8JujJsveqWDT7jrmTylmxrh8/C4VIcBsOJr2+MSRHfisOKZy6i0HnHAddqT9sJsdCyNUDVfZRKxT7dnQ0bGAXRUt/P7JzdQ2RVk0rZRbLhxNwJX+ZyQnE0i9rTRbo2aTRX2rRY7v1LspK6FitJGziL7yK7zXfQWh9K+PfdkTknpUeUF6tdyZEwpw66ffcjxu2vz0X+t4ZMludhxs4M/PbuXPz2/DIvWBruWWpz3HPXoGpnr6CtrC7QflXecWCmogB7V0UpeSRW1znB/+dTU1jVEcB97acIRHX9vd5dpuktQTVEUwpsBg1b4z74CrjZoDQhBf+VAfRNY1MglJPaos38ctF45pKzMzpiyL6xaNBMdBCHCZzRgNu3FFK1E5McW0qiHKoar2Wz+v215NfXOMysYoB5Ry3Fd+rm1vHqN4NN4ZV2M5p88ASXcuoQvubtcWWnQbYthM4lr6Rm6nU1HbyrsnrC3bWEFrfGCsTJcGn0klBkt3Rc647YcQAmPaVZj71pLYuayPouuc/tUvkwY8QxVcOXcYC6YUE09a5AbdHK9E4wofpu7xH2JHWwBBYN4NaFOuwBSuU9dGq9qJO1rHy5XZHGqCT7zn54ScJhxfHrFTbWR3EttRUEcvIq9oDHZLHUogB9NfTLwb1QN8nvQ/l9wsD/oZhhklqbeUhjQcB7YfTTDxNBMUAIThwZh1A/EV/0ANFqAW9V7pqq7IaBL64Q9/SENDAz/4wQ8yGcbgJ1JDSUfrIvg8Orr77FZQt8RMDteEURRBWZ4fn6v9cJcAClwJlOgB7MMtqIFs8GbRvOwR7GgLgXMvRcsqQM8uwj66GUVzMzJUxuSROWzZVw/A4skhrp+skWtWIdwO141o5puvaazb18qic0rbhtGEACNeh11/GKHqiJwy4mr79TqW0LH85di+MqqaYtTuD5MdcFGU7UFTBIqw8UQqSdYdQagqSt4Ionpu2usuy/MzZUwu7xwrq6MI+PD1k3Gpos/vAbnMJpyGwzi2hZpdRtyVI+9DDUFCCKYPc/HC5vAZkxCktl03pl1J9KX/xXv911CyCs/4nN6WsSS0YsUKnnjiCS644IJMhTAkCHFse4C/rmlb/Dh3chHvv3ICrm5c
wde3Jvj2H1fR3JoAIC/LzdfeP5ssz4nEZlhhwq/8hviBzakGRSP34nvwjZuFkV9G7NBW9Oxial/4LXY0NQSnZuXzxeu+yNL9JVjRMHNjS4m9soTGY8fMXnwH188ezTt7alk8rRjTTE0ocLVWUPvwd3ASqXFxPbeMrOv+K7XBXLufg+DtbdX87snNbW03XTCGq+YNw9u4h+qn/7dtAoMayCH3pi8RdbX/A3XrCh+/fgqHa8O0Rk2GFQcJebQ+//B3J+po+PePMRtTkzUUt4/cW79BzHP66erS4DSpxMXbe2Psr00wIu/MSyHUgtE4Y+cTee7HeG/4JoonfZF1X8rIOEJjYyMPPPAAH/vYxzJx+iElbjr83783t1t9//aWoxypi3T5WKoqeG3dkbYEBFDbFGPN9hrEySWjGw6eSEAAtknzupcwm2txFY/CMU3iR/e2JSAAq6kG59BGFk4u5NIRcWKbl7Q7d+PKpxiTrzF9hA/l0FpcZiOq4hBe83RbAgJI1h3GqtieFntjJMmfn93aru3x13fT0NxK67Zl7WbQWS31xHev7nCI0K0rjCkOMm1UDuOGZff5nAQhIHnonbYEBGDHWolseJEuzjaXBglNEcwa4eaJ9Z1fwK0Nn45aPJ7ocz9u9/eTCRnpCX3zm9/kc5/7HJWVld16flf3au9r/Sm+ytpWahrT32TRuNnlOC3LZu+RprT2g1XN5OWNb/u6pSJ9mnaysQoUDYSKFsxt9yHadvy6QxTk+glXpa8Cd+IRHDPB+MRu6p5+BKNwJAU3/zetaZu3gd10NO211R+oJ2mmT8dujZkEG9J3WU3WHKQo15/W/m6Z+F3XrDqS1pas2U9+0EDR04dk+tP78VRkjOm8PhdaB1t7dGTBBBe/fqWaQy0wpfzUSxZO5sy+lObVz2G++nOK7vxGh++dvtDnSeiRRx6huLiY+fPn8/jjj3frGP25KnB+fqBfxacAsycWsnrbiQ9aISA/y93lOIWAC2aUsm1/fbv2OROL2h3LFUwfFvKOngGqhuLLJV6xi8D0S4hX7G73GNfImdTUtOAOliBUHcc6kYz0gmEUmocJr3sEgETVPmK1lXgnn0/Tm/9sdxy9dGLaa/O5VPKzPdQ0nEjIHpdGTpYH94ipxA5ta/d4z9jZZ/z5ZOp3bQyfButfbtfmnbSY+qYkjpNo197f3o8dGSoxdjWJRVrjxGKd3yF1wWgPv19SwzevzUPt5CaIztjFJDc8y6F//gDP5Z/utTVEp3vtfd6Bf+6551i2bBnXX389P//5z1myZAn3339/X4cxZCjAey4bz4zxBUCqjM6X3zubgqzO7XFzMseBKaNyuXHxaDRVwaWr3HXZeMaWth9TNv0l5F776WMFPQWeUdPxjp2FMWYecX8R2Zd8EC1URODcy0DREJpB1vl34hSkZuvE3XkU3vYVtGA+AK7S8QSmXkT4XcnGjrWij5mPf/plIBSE4SZ08fuxckamxe41NL58z0xGl6amZZfk+/jq+2YTcLtwjZqRikU9FsuCm6B4cpd/Pn3FzhtD6IL3IHQXKBqBWVejjph5xmm60uA2tkDHpQmWbE8fiTgVIRT0aVdBMkrs1d/idGHxdk8RTgbfuY8//jirVq3q8uy4/nzV1F+v6hxSs9oMTaG8OOusYhQCWmIWQoDfreF0UO1Z0wR6sgWRjIDqIqH5MR0VjTjKoXU0LX8cNbuYrFlXoPjziLoLOPn9n58foLm6CpGMgctHZNk/ad38+okYDDe5d32XuJGHKhy0RCMIlaSR1e7+V8y02XmoiTXbjzKyJItzx+WjAB5DQz9px1JDmGixOlA04u5crE4s/cnk71oIMJJN4DiYriysUxRJ7a/vx5MNlRi72hNa+caqLvWEAOpbLR5a3cw3rskn13/6BeIncyyTxOrHULJLcV/wIYTo2f7J6V67XCc0RAgg6O6ZX7fjgP/YtOx3JyAFG6NpP+F1zxNNxvHPuBIzbwyWk3q8Vr2D
mud/C4DZVEP1/k3kXvYhGFmQdp648ILhBQc8c25ADeQQ2fIWWv4wAvNvJO7KAwcsR2Adnw13UjxCgVfWHOLx1/cAsHRjJS+vOsTX3z+rXQICSDgaieOz4fpw7akiQE80gG1hubMxnc59cDgOJxbb9v3Fq9RP5fhUzh3m5q8rGvnsJTntJwydhlA1jFk3klj1KPFlf8O18J5OP/dsZTQJ3XTTTdx0002ZDEHqYUbzQWoe+g7HN4+L7d9E3k1fwsqbiKoqRLavSHtO6463CY6aTdQ59RBhXAshpl5H1jmXYasGMVs54/50zRGTp97a267taF0rFbURxhRn/ka4bkcxty2hduUTYFl4Jy3CO/cW4nrXKjlI0slmj3Dzz1XNLN0V4bxxvk4/T2gGxuybib/9EKx6FPfcW3sxyhPkpE6pxyiKILrrbd6dHcJrn0VTU3vuqIH0RaCpjd/O/FZ0HEgIN+YZdil1WS24mvZCMr3MDoDdT+6diNo9NC97BCwTcIhsfYvk7hWnrh4hSZ2gKoIrJvt4fF0LtS2n3n21I0J34ZpzC+beVcQ3PNtLEbYnk5DUozraz16oqQV0tu3gGT8XYZwoOip0F/5zLiDunN1+Q8d5YlU0PHwftQ99G/W1/+WyGe1n6uUEXJTmdf7qsLcoiiB+aEtae2TbMjSnZ3d9lYaevIDGrBFufv9WI1YXd+gVhhfXnFtJbn6ZxLbXeyfAk8h7QlKPsW0H99jZtKx5DuzUFZiaXQoL7iFqQjxhkXSXkXv3d2luimJYYXxeD/FAOSoWWrQGx0ri+POB1HCZEBBPJNGw8LoUSMZJaj4sR0EngWrFMTVvatKDsGhZ8ShWS6qsjnV0N5dN3UbJVbNZvrWWEUVBLp0zDJ/R+Ru2vcW2HfTc9A3vjKJRWEI741BjpigCdDOMo6gkFY8sFdSPzRzu5kBdmOc2hbl2eteGn4UngDHnVuIr/4XiyUIbcW4vRSmTkNQJ0aTFoepWmlrjlOT5Kc52o5zipmXcX0b+nfcR27mS1pxxrG7MpbBBYfOavby+7jB3XDqeHQfqeXtLFeUFfj5w7TBG2nESa56gYf3LgINRNArP9Z8hZvp5c/1hnlx6ALehcvf5RYw9/DQevw/3rOtYt7eFRNJkWLZKeUkutuoifmy9j1E0Ct+E+TjJGBcEjhCcUsDOWsgPutpK/mSanpWPnl9OsuYQAIo3iG/iAmL99IPdsJpJbH6FunUvongChC58L1bhZCwyn9SldEIILp/s4+9vNzGpxMXogq6NNij+HFyzbiT6xu/xej+PWjC6d+LM5BTt7urP0zkH23TTuGnz639vZtPu2ra2T906jZlj89pNhX43TVN4cfUhsoNuDlWFefLNPcycUEA8YbF574kdHnVN4UcfPAfl0S+0e35w7vUs1+fxu6faLyL92nWFFCz9MVr5Ofw9eQnLt9ajqQr33jqMUSNLaX3zL8QPbCZr9lU0vPGvtueJ4glYi/6DrOz296R0EgjHwlS9p309HTmb37WmKcRe/BlaVi6aPxvHscEyiR/dh+fK/yKZ7Jkpej31fhQC2Pxs6h7WSfLu+Bbx4PCzOvZg+5s53TG6ojtTtDuyuzrBmzsj3HtdPl6j63dgrKO7SG55JVVnLpDXrRj61WJVqf8RAlyJWvSjGzCq3sFlNrZ9r6Iu0i4BAfzxma2E46e/4elEm5hp7CHL5bDmWLWGMWWhdgkIIGnaVNSm17GLxhK8uOpwWvs7R23UYB7moc3MLk915E3L5t9rm7DjEQJzbyQ48wqaVrW/qepUbifPrmn7WhE2rrrttPz7ezT+62uI7S9jWJ1f5He2LMvBKBlHy7qXaHjzIRrfeoTG5U+gF43BsvpHT+1khh2hdeMrae1m1Z4MRCN1xZgCgxF5Bn9d3tStBc1q0VjUETOJvvA/OMmzT4rvJpOQhKu1grp/fpP6p/6Huid/SsOj38WVSCWeaAfJpjWaJGmephckLJJrH0dd9kc0
M0JOMDX1Op608HWwVsnrS5+a7Q5mU5STvmtqnl/BjkcQLi8tiRNDgtXNJpZiEDXycY2Z0644apuTCjUazYeofeyHJKsPYLXU0/j637D2vt1nayMcx8EYPQstp6StTQsV4Bo7r8s9sr5gCx01mH4VrHjldPKB4PyxHg41JFm2u+uFiwG0UbMRvhCxt/7c45U5ZBIa4lQFIhtexImfeHNazbWYBzchBBTletM2bZszqZBABxu8tR0zUkvrptcACNa+w3nTS9A1hTfXH+Ha89uPK8+YUEBZfhDP+HltbVqoEP/wCVw7UcE46dy5QRcTvA048QjWjFt5ct2JYqqXzSjAJSy0/ctIHNmOd/yc9kEpGmp26gNfCEhU7iRtKvm659GdvqsoHDNyCd34VXJv+Sq5N3+F0M3fIObq3nBHb0uik3Xene22StdySlAKRmUwKqmzNFVw5Tk+HlvbQk0Xp21D6v6Sfs6lWNV7SO54s2dj69GjSQOOgk2yNr0StVlXgTZekOs3+MYH5vCX57ZxuCbMwqklXL9oZNvVS2r9j2g/DdSxEboLvXQC2uG1jDXcfOWu6VQ1W+QEdO77wAyONibwew2GFwZQDRVt8Qfxzbwax0ygBgtoXPIHsqv2862r7qIiEUB3exhZ4CLUehDljvvYFQmiKDvwe3SuWTiCKWPzaXz+pyQqdgEQWnQLiuGhdftK9Oxisi68h7inEJzUeiPVlV4hW/GFcPp4Zlpc9UPOuL474VmIh0aSf9d3sOqPIDQXIm8YcS2U6bCkTso/Nm37D0sb+eIVuaecXHQqQjMwpl9D/O2H0EomogTTq5x0h0xCQ5zpKHjPuZDE0X3t2l2jziVhpT6Ny3K9fOnumSQtG6+u4jgOmpNAqd5O6/oXEJ4A/hlXEg8Mx0EQceWza/aXeGVzEwUBlUvdCsXrfsvI0jE0NCg8Wj2a5dsaABheFOCzt59LlsfA9JcD4FFieEZNx1U8mqBSR3DHo5hN1fgX3Iw55Tocx2F4CD5581TW7ahhydrDPPHmXj5zxS0Ma/o1VmsjjUsfxTX8HArf+30Sqje1FfhJyUUrHofqD2GFG1MNQiFr0W3EnJ75k1AU0S+H1c6G4whi3hLwlpz5wVK/NHO4m701Lby6rZVLJ515q5J3U4L5aKPmEHvjD3iu+XKPDF/LJDTEOQ6ow2cQmFdPePWzoOlkLboNO7f9sJkmUjO6jo8Hi6NbqHv6Z23fj+5cTeE99+MoGq/uTPD7Z/YDsA1YuV3l+x/8GIknvsyBGZ9h+UnbShw42sLLqw9y6+LR6MlmRONBrNYGrOY6EnWHie5/h5zz76Bp1TMYZZOJHzt/fTjJd/+0CtM68UH/i5eq+PYF16Cv+hsAiSM7sVBJkr5PSszIJeeWb2BV78GJR9CLxhD3l5x1L0i3o4i6PcQPbEHPK0MtnZS2y6skZYoQgksm+XhodTMzhrnJ9Xc9BWgjZxFf/jeSu5ZjjFt41jHJJCSRUP0o024gd/KFgEJSD552lbUuTMKrnmrX5p98Hq3rnqdFzeLRNe23xY4nLfYdjTDel8We2vSZXxt31XLLecOILPs7wSmLad65itjhHRiFI8m96B4aVz5J3rWfbjcVuLEl3i4BAURiJmER4PhHvn/WlSS04CkTS8zIpT7kZ822SvZuaeT8aR7GlOVgqN27ulMEWDtep+mth9ra9PxhZN34JTCTgCChZ8kFnlJGHS9y+s9VzfznRTldfr5QFPTJF5NY9TD6yFmpLUXOgkxCEpAqPh1Xs058cQqqsNHsOHpeOYmqY0N4ioaWXUDjW48gplyF3tE+07obxZfNuGEhWFPT7lvnjs/HFasDt5f61/5Gsi61c2j88HbMhkr855yPpbnaLYrMDrjQVNEuEXndGtkBN3puKb6pF6GOmkPiNB/4LdEE3/rjKloiqTI5q7ZW88GrxnP+9LLTDqWZtsOh2lZe21hBbtDD6NIgPkNFTzRSt7z9Ro3ukjEk1j9H
87oXEYpG1sKbUcacR1Lp+n5OktRTZg1389cVTWyvjDOhuOtJRM0uxQqVkNjyCq7pV59VLHJ2nNRp7lgViTf/QN2/volQVXIuugemX4+68B7MxmpAoOx6i9vntb+68nt0mqNJ3HNvobx+NedPPvH9USVBLp5ZhpOMogZy2xLQcVZrM4rbj+MOtWsP+TQ+ddv0ttlzHpfGZ28/l5yxUwncfC/2uEtIqKdfHHjoaFNbAjruoSV7iMdPvRZCCHh7ezXf+eMqHnx+O//z0Hr+99GNxEwb4Vg41omZR4rbh+IJ0Lz6WbBMnGSMxtf/jqjZddq4JKm3aapgwRgPj6xt7vaUa23sAhKbnscxE2d+8OmOc1bPloYMl9VCy9J/4hk2GSOnCKHq2Ik49aULSSgeppSUYhSNAqEyL6Qy7EPncLARKutjjBuWjd8l2HAkjO2ayNXnCK4cF8L2F9Jguth9uJF4bjYloVKyz7sN5/jNTsdGMTzouWWYhzbg9gYhq4S4KxfHEcwY7uUn/zGVxpYo2QE3oaBB0hKAQWfGvBw7faqq7YDhRDEaDmA1VaMG81B9WSSrDyA0g+bQeP7+wvZ2z9l5sJGK2lb8xdn4zlnctvmeq2QssQOb084R27MWvWRatxelChxckUqs2oMo3iwUlwez4SjC5UXkDj+xzxCQbKjCOLoLJxFFzS0j4S/BduS1pwTjCw1W74vxzpE4U8u63jNXAnmooWKSO5diTLqo23HIJCR1TmsdnvIJNLz+97YmV/FoChedSyJSRe3jP25bTa14/BRdcA+rjgbJysnhSE2Yx1/bTTia6nW4DZX73jOB/3tuHweqUuuThIAvXldK6cqnyD7vVhpX/Bs7miqTIjSDnAvuouaJnxJadAvusfNJGlkk1z2Ns/oZskjt65acfQ3quTdgoaGqAsfhtMNqwwt8+Dw6rdETvaGPXl4OO16ndsUTbW2+SQuxWpuIHdhM8ryPE++gpE4iaWM6Cp45N6LlFBPZuhS9YCR2pIl4ZfuqAlre6Yf7zsTVuIeaR+5HaAbZ591G7ZIHOX7jS88fRta1/0Vcy8JlNnH08R+TrD1WeUIo5N38JeI547t9bmnwEEIwa4SbFzaHu5WEIDWpKbnlVfSJF3Z7ppy8JJI6RdWNtFI48co9uJ0Inr2vtyvnYUfDmFV7mVpgkzRtqhuibQkIIJaw2FtPWwKCVMflwaUNKFMux2yqaUtAAI6ZIHpoK0bRSJpWP4fTcATDihA7uLVdPC2rn0VEGzlY28rfX9nNU8v3U9McP+UfR1bQx7fuGsc1swuYNiaHz1xVyuxhGs0rn2z3uNaty/AMnwyA+8AyFkxuP/HC49IozvOmfiZaFky6guDN96JMvx7PtMtQ3CemwqrBPIwR53Z7CEQnSdNb/wLHxjdhLs3rX+LkmRfJmoM4dal1X07t/hMJCMCxaXrjbxj0fOkVaWAaV2hwtMmisrF724coecNxElHs2gPdjkH2hKROcTRXx6VwLAu7uTqt2W5twpVtY5o2TeH0D71IIr03UdcSx/blYdVsTfue1dKA6gmS0INsihbywhP7Cbhu4NILVLLX/gm7pRYUlV0NCt/569ttz3tm6T6++9H55AXSb75ajkpeQQF3LXah2Aks1QORBnDSh8kcOxWvdXATd1x7A/k5o3hzQwUjioPcdtFYQl69bQTQth0SpBa92t4icu/4NlbDYYRQETllxE4aLusqYSfbtqpQPUGscEN6rIlUcrfj6bXwrJZ6hJ0E5exmNEmDg6oIJhUbLN8T5eaZ6XuBnYkQArV0EsmdS1HzR3QrBtkTkjol6QrhnTCvfaOi0eJ4cMaen/Z4o3g0BxoVDF1hTFko7fvD8728ewPRS6bmoOxYgj5qRtrjPSOnEq/ez8GRN/CDf25hw65a3tpcy7f+XU3T9LsB0Ceex6Nvtq/+kDBtNu2pO2VvyHYEMS1ExCggrgbAl4cWar8RnuoPYcdPlPPJ8Thcv2gkP/v8
BXzypnMoyHKd8haU40DMyCFZOJVEweR292u6I6l68U+/FIDI3g343v07EUpqDydAzS0H2r9u37RLSGpdX6QoDV7jigzWHoh1+/lqyQTMvatTleC7QSYh6YwURWCj4VtwO/5plyB0N0bBSHJu+SoHIj7qguMInP8eFG8QNZBDaPGdtGSNIa98OGPKsqioDXPnZePJCbrJ8ht84JIyCo8u5au3j6eswI/HpXHtgjIuya/BHLWQh7Z7sRZ8EDWQg+IN4ll0F1YsgmvmdTyxqn0Vbst22FKn4520iMCcG0h0cL8m2YX9g+KKl+zrP49nzEyEZuAeMZXcyz9C67YVaFn55F73WcysYTi2QyjgQvTxmh/HAW3cIoLzb8JqqUMLFRGceSXC8KDnlZJ385dIBFIVDZL+Uopu+wp6TinC5SUw9zpcky/G7n9FuqUMKgioxJMO1c1drykHoPhzwXBjdbOiutxPqIcNpr1RhIDq5jgHD1WTo0bIzc3Cm5OPjwi24iKJgRCCmGmjCPA6YXRhERU+Eo6GrgiEACNShV13gKg7H+HPw+d1oyQjoAgijoeopeJ3a2hmC1srk3zvr+sIeHUunZqNrgre2hHmG3efg9+j852/bWbnocZ2cb73ivFcOqOIpK2y5WAjP/3HurbvKYrgux+ZT1GoazdeNWGhmhFszYOJhmGFcUT73UQz+btWlOM7nGpYihstGcZRdRLvqg6Rnx+gqboWYSdJ6v5+mYAG09/MmY7RFT21n1BnPP9OmFkjPCwa6+3W85M7liJcXtwL3tPh90/32uU9IemUGlqTNBw+wMS9j2FV7EAYblj4Huxx80g6qfFjx3FwHaswkCTA8dub+rFRIHdrBTUPfxsnkeruC83Af9vXifiHpR4gwKOAZdlYwkfcagSgJZLk8ZWpe02KgDguXI7GzReO4ft/XdMWo6EpTB6VR9JOLWQdX5bFF++eyXPL9+P36lw1fwTF2e4uVykwHRVTDbTd848rx4aw+sklm22fFJMNlnrqIbaEcIHqSk0hlKQOFGZp7K1JdDsJqcXjSKx9Emf+XV2eJSeTkHRK4XCE0sMvYVXsAMBJxIi+9gc8hcOgE7tpKoogtmNZWwKC1Ey3yKZXMRZ/uMPdQ0N+F163RiR2Ymhg/pSStjsbo4sD3Puhuby54QhBr8H8KcUUZLnbZptpimBCWRaT7pyOQGBZtiyTI0lnkO9XWbWv+/eFRCAfALvuAGreiC49VyahIcbBYc/hJiprW8gOuMkLuNImCByXYySx9q9La3caj3Y6CcWba9LazZY6/NEqTD0vLUHYtsNdl01gw64aKmtbmTYuH1UIzGPrahQhGJ7v4/1XTMBxHGzb6XC6s2059JtuiyT1c9k+lZpw9+4JwbFZcsXjSO5Z3eUkJCcmDCFCwLrd9Xzuf17nhw+u5cu/WsbyLUdxTvFh7fb50PLL09qTRudmV5mmjXfsnLR2z+gZtKx9Dr2D7bTzs9wsWXuIlkiCkSVB3t5ciWlZhLztp49alt3hgk9FOLjitbia9+OymulupXlFpKoS6BXrMOp3YNjd25FSkgYCnyGIJR0Sp9kx+UzU4vGYe1Z2eQ2c7AkNIY2RJP/373fa9T7+/OxWpo7JBQSqIvC7VOKmTVVjhCCt5C++m/onftxWH0qMnMVr+1QuzmnGpavEFX/6m05AU9TEshxyCyaTfeF7aNmwBBwb/znno/lDNLz+D3znXIDhU0lwYtKAS1P4zK3T2Hqgge3767nnigmMK8uiM7lExULsX0ntK38Cy0TxBsm94QvEjt9/6iQhQK/ZTu0TP25bM+QePQPfBR8iofq6dKyBzrQdwnETr0vDOFWXWRrwhBD4XQotMatb2zsAiGAhCIFdvQe1cEynnyeT0BASjibTpitfe95oHn9jL0s3HMHQVT5+01Sam8Kco+1DW/sw1Qs/TNN5/40ercXWPCT8RaxZXsls/XUCTgvu0TNJ5ozBOvZWipk2z688wPMr
9gOCS+cO44JpCyk7Lw+ScRwzTuOyx9AC2US3vEns8HZCl3+MuL+sLTkG3BrXLBrFwsmFxBIWjpP6IznTFZYWqaL2xd+1fW1Hmml49peEbruXuOj8DVfDitDw8u/bLVqN7VmH79zLYQiVvKluivOrxzdxsKqF4lwvH795GuW5HnmPbZBy64JIwiG3m89PDclNJLlzmUxCUsey/S6CPoPm1lSvpiDbQzxh8ub6VOXqWMKiuiHKWFcdytI/IkbM4O/rbdbuPnTsCFGEqOe/7piGuuIftDTXoPpCaIYPy1+OELBxTx1PLz2+S6vD88v3kx1w4R1Zhnj1fqzWhlQP5bIPUvfKX7DDDdT/+ydk3/Gd1GLRYyKxJOt31/Gvl3di2TY3XziG6aNzO94m4hi7pTatzWyqglgLeDqfhISdwGqpT2t3Yh1UjBikokmbH/19LfXNqZvVlXURfvDX1Xz/4wsIuOTHxmCkq4L4WQzHAahlk4kvexDX/DsRmtGp58h7QkOIz6Xy3++ZSX7IA8CsCYWs39l+4oBl2wTN1AdwvHgqa3c3tvu+40C8NYx1bMJBvGIXIp76cFZUhbc2tN+KAWDHgQYakjqB6RcRWngz/snnYdYfxY6k1l1YrY3Q2n4R6ta9dTzwr/VU1rVS3RDl14+/w47DTad9fYo/fYMuNZCL4+pahQBTD+AZO6t9o1BQQ8VdOs5A1tASa0tAx0ViJrVN3Z9BJfVvqhBpG0V2leLNQgkWYB7Y0PnnnNUZpQHFcaAs18NPPnM+P/rkQq5dNILhRSd6H7lZbjRVwTo28UBrrSE/25N2nIB1IhlooUIcNXVl7NgOozso0VOY48WxHRqXPUbjsseI7FqDnYyBY2MUDEfLKQHXiXstqqrw1sb0ZPbSqoOop9n11PQWErrwHhCpt7UwPORc9QmSXbyPYzoq/kV34BkzMxWPP5u86z9Pwld0hmcOHl63jvaun7UQEPB0vb6YNECInplPqpZOJrn9jU4/XvarhxjHgZygGyueWlZ60wVj2HmwkY9ekE1x63ZcTVvQxlxPvHQy9vZX+Y/Fn+OHT1W0bfc9b0I2BQ2bAFADORiFI3ACqR6CbTucP72Ut9YfofFY0dK8kJvJo3Lxe1SyL7wbIQSKN4uWDa+Sc8FdxI7sRPVno1iJtvs+juOQm5We/PI6aDuZJXTUsReQX3YOdqwF4c8lbuR0q2J1TM/FffHH8Z/Xgq0aJFTfkLoXEvLpvP/qSfz+qS1tbbdfMo4cvyx8Olg5jnPK5RpdoRaPI7nlVezWBhRf9hkfL5PQEFeY5eJ/PjiOlse/gx1pJgEkdiwn77avYwM+xcOPPjaHisYkPrfGsJDA0yBgzATUrAJMTx4JJXW/RQjI82t87s5zqaqPgBAEvTq7DzUyNuSl4bW/AeAZMY3A1MXUPvebtjjCm14j785vEfMUY9sOC6eVsHRjBVec4yfLZbOrDhbNHoZ1huECCxXLUwieY9stnEXmMB0NUz/2RzSEEhAADsybWMCo0ixqG6PkBNwUZrt75ENK6p+SFhja2f+ChaqjFo0luXslrmlXnvHxMgkNcY4DWtNh7EgzkNqozjNmBslomB2xfP748kFGlgS5afEY8oNG6p5Q4XQgNROuoSmOx50k4NKor6tFj9bxg8eqicbbL3ybVj6Z4LF/C5eL5jXPt4/DTJA8sg0xthjHgVGFPn5wlU7zK7/CjrUyrmAE2e6PEyW9BpymKdi2c1YbxUnpFCEoynJTlNW9Dc+kgSWWtPEaPXOHRi0ej7l3lUxCUicd6y14x81B9fhpXPoYODalecN438K7+NHTFWzeU8f9H5uP/9jMqJrmOD/+x1pqG2NoqsLdV05giqeKRGtLWgICaDWVE0lIKB2WfXdOqq6ZqDlI4zP/y/EuSLJ6P40v/x7/Nf9NktSsG92OIqp2ENn6JlpeOe5x84l5hs59G0nqSZGEg9/VM0lIyRuOve4pnFgY4T79xCA5MUFCyR2G
4vLiKhlDy8YlbetjrNqDlFe/xfjyIOFoksq6VNUA04ZfP/EOtY2pmVKmZfPnZ7YSVXz46ncwoTzY7vguXaUoP4ucaz6F4vYT2buBwJQL2gehahilk9pGz5INR3n3GFiiYhdqPDWjTlEE9p4V1D39P0T3rKPl7Sepe/S7uBPtZ9lJknRmsWSqxqLX6JnxVqGoKLnlmEd3nPGxsickkXDnkXv7vSR3r0r/5pF3mDpsPjsONaNrqWuW1rjJ/srmtIfWWz6CB9byobnTeMQTYvWuRoYV+vnwdeeQ5XVhemeRc9dYsBJgeMkNFtK68WUUXwjf1EuI+0va8o7qTd/8TQ3kYOupoSE92UL98kfbfd+OhrHrDkFxd5fbpVNVBSEEpplebFWSBovGiE1eQO1yBezTUQIF2LUHYcTM0z5OJqFBpKOqAqeqNKAoJ4bEHAfinkKMwlHpBy0Yy86jCSaOyKYkz4eigNetMmFYiEjCpKE5TjiaZO74bCZmRQje/EWSjVV88jwfrfMCePOLsXVvWwxxLYhiiNT9m4Jz8Fw5BccRxCy7LQEJIdDyyvBPv5TwhpePBayRfflHU2VzHNqmkwrDjVB17OjxvV4chBC4zQaspmoUt5+kNx+LM08tVhTRdl9JCIErfJjo5iVYLXV4p1yMXTAOU7SfHXb8b3YozZyTBp/qFpOy7J5NB8ITwG5NX/T9bhlJQr/4xS94/vnUjenFixfzxS9+MRNhDBrueC3Jim04kRZc5ZNIBMvRky1YR3diNR7FVToBK3s4cQwO14R5/u195PsVxuarOAhiejZ7K5ooz8qheOJ5RLe9BaTWxzjTrmNWnYeSPA/Jyp34YkcI5Jbz1Ys04kcP4QQL0PNGElvxMJEnNhHz+PEvfi+rqnwI4TBBayAv2ErcU4hmRRF1e0kc3Y2eU4ZSOJY4QY5nH1eyAatyB3bjUSJZeWihfPJv+HyqbE9WEXF3/onhOi1A/jWfJH5kJ04yjpaVT3jbStS8csTR9UQPbEb1h0jGWkHR8Ey+gKjecQ8pbtrsrwqz53Aj5YUBRpcEybGqqX3oW20186J7N5Bz9X8iymZhJBqxju6k7p0KjNxSHARKdilxb0m3poNLUqYdbbKYUtrD0+9VDY79/ZxOnyeh5cuXs3TpUp544gmEEHz4wx/m5Zdf5tJLL+3rUAYFV6KWuke+3Ta7DSD/pi/Qsv4VhGOTqDtC84rHyb7io6yLjeXXj29qe1xJrocvzEuwP15Awgjx19cquHTGZYy6bCGKlaBOy+GXjx+kKRzHtBwmjwjxH+VVGBtfxTNiKpE1z+Mun0hyv5fEgdRx7WiY5hd+Rd7Cz/Odp2tT55gToXiSRmzbUppXPnEi9uHn4L/skySEB5fZROOTP8asr2j7fmjRLdS//EeCC2/BKprW7gPeiFZR++yvsdtK6QgKbvkS8Z1v07TskRPnKBmLnluCuX8D7uHTiRntE5EDPLfywEmlhmDmhAI+Ot9oS0DHNS9/jJybxtLw/M9IHj3x+Kw519K6/Amyr/wEMV9pF357kpR5juNwsD7JLTO7tvPrGSWiCNeZF4r3+cSE/Px8vvzlL2MYBrquM3r0aCoqKs78RKlDVtWedgkIwKyvxFU4HDsZwztmBqHzbqW+oZkHn9/W7nEVdVEqnVxGuRupbYyyaFoJ//fsbtbVB1jVkMP9D++irinWVspjy/5G6oITSNYeQfWmJh+4yycQ3bM+La4su77tHBWiEJqraH77yXaPiR/YjGiqBMCpP9QuAQE0r3kB36SFtKx6Gt2JtrULAckj209KQAAOTSufIFG1r90x4hW70LOLsBMRrOq9aXE2tCZ4dln756zdXk2VmT6jRwiB01jZLgEBNK9/Cd+4WcR2rkCRC2mkAaa6xUJToCirZ/skdksNSk76VjDv1uc9obFjx7b9e//+/Tz//PP885//7NIxurpXe1/ry/ia9rXfg947bg7RveuJHdwKQPzITvS8ctQ57yGW
qEx7ftIWqIpFNG6iH1tvI0htqd1RHamEnfqQPT6d2myuR88uIvmuBBJXfEBL2zkcK9muKvVxmmIRzA/QUp1+499OxFB0A8Xw4A94Ud0nrqoatkfTHx8NowXSh9wcx0boLoSVSPvdNMUa6Gh5ka15ELobJ3miVlro/NsQooOp5WYSoek44UZyc7tWp64v9Pe/F5AxdsTrc6H1wOLRM1m+t4nzJgTJzu65bUocy6S6dj8F134MPfv0P7eMTUzYtWsXH/3oR/niF7/IiBEjuvTcmpqWMz8oQ/LzA30an7tgVKpW2rEPeKNwOI1vPdLuMcnaQ+ToCS6fN5xnl+1va3cZKsVGmGorm+HFQTbvrWfSyBw27a7l3HH5TB2Tx6bdJypTZ/kNCp1ahO5um0XTun0FORfeTd2rfwU7tT5IlE9n1VEX0IJLVynWW1CCw3GPmEps/4nhQNWXhektoKamBXewGKG1HwLzT15IZPc6gotup77FhpYTP1d3yURAcPI07sCsq4gePFFmBkDxBjFyy2lc+W+yLv5Q2u/GZ6hMHJHDtv0nbqDmZbnJy8ki+/ZvEtu+DCtcj2fS+Zi5o9GSLQiXFyd+YpM779hZRPdvJjDvpn733uzr92N3DJUYu5rEIq1xYrH4mR94FpKWw8aDEb56dR6NjT23caN56B1EqIRG0wc1Lad97RlJQmvXruXTn/40X/3qV7n66qszEcKgkfCXkn/b12he8Th2uAE9f0SHj9PcPkaVhrjjkrG8sb6C0lwXN8wM4dJMNHLZtbGS6voI77t8NKt31tMUjnPD4lGUFfhZu72asWVBrpkgCBxYgv+6TxGrOoAWKkTNH4GZPYzsi+5B0XQcd5BdkSCrXq5k1thsbpyZxfBsh5i7kMCFH8BzeDNWUxWOZeOevJiYlpqKHfcUknf7N2hZ+QRmfQX+iQtQc0pwT1qMmZW+KV0iUEb+rV+lecVj2NEW/LOuhvJz8ZVMQgvmE92xEqNoFP5zzie8822yFt9N3FeS/nNRBB+94RxeXXOIVVurmDwqh6vnj8SjK8T0EpRZt6EpgsSxfZgsVy55t36d8Kp/k6w+gGf0uWiBXJRQMWb2yJ77xUpSH9h8JM7ofIOCQM+lAicZx9yxFM8Vn+nU44XTx9N5KisrufHGG3nggQeYP39+t47Rn6+aMnVVpwsLHBOESvTNPxLdtrzte67yibgv/hh7d+4jqCdxsssxhUFlU5KSXC/FNW9jBQpwkkka/CN54IndjCnLoqo+QlGuh4tnlTOywINqx1EAE5WYbeByooh4C+Gl/0CoOv4ZV2Bmj8IgTtTR0LFBqCRs9diU50O0rHgMs/Yw3nPOw5iwmLjWfj2QptgodpKs/Dzq6sJnLMWTet0WpjDaZs4pCmhWDFt1IewECI2ko572OEIIEpaNoSqdmuGmCZugVyEcNXEcSDr9c7XDUOll9LZM9IRWvrGqV3tCScvhz8ua+M+LshmR17m9f87EcRyS655EyS7Bveh9be39qif0hz/8gXg8zg9+8IO2tjvuuIM777yzr0MZVFIfsio44J1/B+7hU4nv34irbCJa+TlEt71O7kkz0/S5d/HY6iB3z/HgWfm3tvaDc7/MkZowR2pSN/23H2jgSE2ED1wziWjcIj/oJuDRETgkhAfcHjyXfzY11dtywAbzWH23BLSNlrliNdQ+/B2cZOqPqnn54/haGtAX3IPlnJgfY9oKkJoq2placCe/7uNsGxLCDTapY3XiMstxHHTlzLu3tsXpKKjeAInW/v3hKUmnsvZAjNEFeo8mIHP7GzhmAtf8uzr9vD5PQl//+tf5+te/3tenHVISWhAxfB7GyAVYto3aWkH4pAQEoKx9lA9e9FXyWk/MGFPKziEnP5c7LvWzcVct2/bXc87oXIYVBvjab5anynq4Nb783tmU5ZzYVkGYMdTmStR4BBEqJOHKT/swtxqOtCWg41o3v0H+rOuw9DOXe5ckqec0RizWH4zx9Wvy
euR4juNg7ngLu/4Q3mu/ilA7v+9U/xxDkM6a44BlHauI8O5tqYfPpKL0YtbujVPgKWTqoo9htFbzZM1wXnvwHQAWTi3hsrnDKcj28PArO7ni3HzKczRW7I7wq0c3ct+H5mCoCrodIbbs70S2LQNAaAZ5t3yVWHBEu1N2tNWvMNw4yumHySRJ6lmO4/DqtgiXT/aR5z/7FOBYJsl3XsKJteC55stnLFj6bjIJDQEikI8wPDiJKIo3yOb8y/jtMyd2Ln3W7+KTNyxgyasnZq4t3VjBh6+bTH7IxXevz8a78V9YByqZMmo2+yZfSCRuYXgVRMOhtgQEqS0ZGl/9I8Ebv0aCEyuwRXYZev5wkjUH2tpC599JUs+iwznSkiT1ivWH4jjAZZPPfjmBHWkiuf4pRFYR3uu+gtC6XnVBJqFBKGbaNLUm8Ht0Am6NhJFDwY3/Rd1Lv8ccMZ9/La9p9/im1jiKcPjCFXkkHZVn32lld0WY9Ttq+MxluTQ8979Yx6Zf23veZrQZw3vOp3AAJ9IIgNDd+KcsRvX4cSwT3YmTOKnOWlwNkHXN57CrdmM1V6MXj8UKDcdxwGU2QTwM7iAJLdAnddiEELTETFpjSbJ8Bh5dkfXfpEGvusVk1d4oX74qD/UsF1abFdtIblmCMf1qjKlXdLv4qUxCg4SiCDQVGsIJvvfXNRyti+J1a3zi+vFMGZFFvL4Sd/lEEuWTSC6rZN64LK4Yr+CzmsnKy0WpWELruudBUfnUlCv5d85oirMFHN3etv7nOPPARlzJJmJGLpo/G6G7yLngLhpXPonVUodwedHzh6GUnIt90qSDuB6CslkIIYg7DkI4GDVbqX/ul9ixMKovRM41nyYe6qCQag/bdqiRXz66kdaYSU7QzefvPJfSHI9MRNKgFU/aPLspzB1zghQGu//R78QjJLcuwQ7X4r3y86gFZ/f3KvcTGgTciTqMyvVEX/pfjNd/xrcu05k9JkgkZvL/HtlKbVUtdrie8KbXsJc/yPsvH80dpfvJXvYzjJV/IvrMT1CwMIpGgW3Bxme4aqzNucF6SKZPEVU8fhw1dY/HcfvJu/JjNL79FFZLai8fJx6h9plfYkRr054LtE1acMXrqXvqgbbyO1ZrI3VPPYBhpm8T0ZMaIkn+3z/X0RpLJdf65hj/75/riSbTqyFI0mDgOA4vbmllcqmLuaO83T6GeegdYm/+CSWnFN/N3znrBAQyCQ14LqsZ68gWap7+XyK71hA7sJnYi7/gvdMsXIaKbTscrarDXTIGANXlZlwogdj4TLvjtKx/Bd/YWW1f58YO4t7wCMm6I7jLJ7V7bOiiD5DQUrXjLE8udiKK1fyuhOPY2C3th/3ezYpHsKZdhzJ2IRyboGBHW6C1oVs/i86qbYqmlSSqb47R1Hrmir+SNBCt2BvFtOHO2en7dHWG3VxNYuW/sA6/g/eqL+Cef1eHk426Qw7HDXTN1STrjqR6MCdRt77I7LHXs3RLHSHdxIq0knv5f9Cy6TWURBjn3YtnHLvdltua4cKOhQlveYvAjMsJzLoKKxZBCRVhBk5sWZDEhbdoDIrHjx1tPwtPeEOnDLslZvL0qijLNgUpzy/k7sXnkbPq19iJGLh7t0ZXyOdCiPZ7APncGn5356eVStJAseNonO2VCb52dR6a2rX7Nk4iSnLXMqyKHbhm3Yg+8UKE0rN9F9kTGugU0eGbQqgapuVw13mFFKhNWJEwDUsfxSgYTjAvH9UXavd41Z+NHU8VBXWPnIrZUp+qR6douIZPIV4whWT5HOKBYVjvunaJugrJufLjoJxozzr/TkxvQYchO8CDL+7gldWHicZNdh5u4XtPVdE67TayL/8ICVfO2f1MziA34OK9V05s+1pVBJ+4eRoBj7wmkwaXyiaT13ZE+M+Lcgh6Or8cwrEtzL1riL3xBxRPEN/t38eYfHGPJyCQPaGBz1+InlOSVvzTM+s6LrFLyPGAT+jUPHw/en45enYhomIz
OVf8B7GDW2lZ/Sx6yViyFt+DlYhTMHJ6av2OmcQ1agYE8km48k5bScBxHBJ5k8i/5wfYLTUIb6ow6buT1XHNkSRrtlW1a4snLeoCkykszen1yQGKgPOmFDNpRA6NrXHysjxke7VOVWiQpIGiOWrx9IYw71sQojync718x3Gwj+4kueNNlKxivNd9FTW7d/fIkklogIurfjzlU8i/7tNE923CTsZxRs7hB6+E2X54A6oi+NptYygMFeKftIiG1//R9lzfxAXkXvlRFE+QRNYwbBvM05zrdBwEMVceuM68AlvTFLxujUis/dncbne72XS9SRGQH3SRH+zh3SRP4VTbrEtSb4gnbf69Pszl5/iYXu7u1HOs+sOY298Ex8Z9/gfRys7p5ShT5HDcIBBVQ8Tyz0FfcA9VE+/gP/5SxfbDrQBYtsPvXzmCb/4tNL39VLvntW5bjh1povapn6HHzrwXfE8JuDXed1X7yQ5Tx+RRnNu9WTv9mebEcdXvRGx+Bv3ImtSaKEnqRZbt8MymViaWuLh00pn3CLLD9STWPklyw7PoUy7De/N3+iwBgewJDRqOA8mkRVV9+p4gR+siCFcRdqw1/XmWmRrGS0TAaH8vRgiIWw6m5eA1lE4VAu0M23aYMTaXb//HPI7UtBIKGAwr8OPSBtc1kaKAs2MZta/9ta3NKB5D4OrPklD63+Z30sDnOA5LtkfwuRTunBM87QJSJx4huXs5VsU2jKlX4rny8z02460rZBIaZIrz0q98Fk4twcgrxSgcRaLqpC2uFQ2hqGihQvC2T0C247DzcDN/fGYLjS0JLpszjCvnDcfn6plab6oQlOV6KRuEvZ/j9HgjdUsfateWqNwNDRWQOy5DUUmD2ZoDMerCFl+6MveUFREcy8Tcvw5z7yr00XNx3/YDFE+wjyM9YXBdeg5AqipQOzlt8lSPVRSBqqZ+lQVZLv7rrhlk+VNXNHMmFXLzhWOIKgGyrvgYrvLUrDA1mEfORXcTPbyTnGs/S1xpnwyONsb4+4s70DUV07J5bsV+Xll7CNHFUh+pfXocTNuhm1U9BixhWzjJ9LVHJ08gkaSesqsqwcZDcT59cQ5uPf2j3XEcrModxN/8I05rPb7rv4F70XszmoBA9oQyqrIxyrJNldi2w8KpJZTkeOjoc9oBKuqjLNtUgSIEC6cWU5yd2kqhujnOis2VtESSnDethPI8H+cMz+b+j84naTn43VrblUbMVYDvys+TL6IkLQfbsvCOmk+U9jfnFWwKkkf48rjtOEKhKX86v3yjmVdXH+LyOcNwd3LYLG7arNlRwxNv7MGlq9x1+XgmlGehDpFsZLpDeCfMJ7L9xAaDwvCgZheTzGBc0uBT1WzyyrZWPntJDtm+9NEKu7mG5NZXwUrivuA/0EondXCUzJBJKEMqGqJ883cr26YFv/j2Ae778DzKOxieOlIX4d7fr2ybuvzC2wf49n/MQ9cUvvl/K9q2nl6y5hBffd9sxhQH8Ogqng5mZSbRMfJyaKppOWU/2GjaT9O/v8fxm0BBsYRPLv4cf1qVQFc7l4CEgM376vnD01va2n76j3Xc+8G5DC84883SwcB0VLzzb0MNFRDdugy9YDj+eTcRd+X22P01SWqN2zy1Mczd87LSNqhzkvHUYtMj244tNr0A0c+2T5HDcRmgqgpvbqhoty7FcVKJSHtXL0PTFF54+0C7tTO27bBqaxXv7KlrS0DHPf76bpyz6GioqiC84SXafUo6NqHq9Xz4usl0dsG1jeCFlQfS2tfurEY5y+q9A0lcD6FMv4Gs27+N+5KPE/OWyCKpUo9JzYQLc94YD7NGeNp/r3IH8Tf+iNBcJy027V8JCGRPKGNMK71YpmnaaR9QjpNqfzfLdrA6WFxpWvZZXmULsNIHi9yaQzDH27ZR3pkoAvKyPew50n5Kck7APeQ+hFPbjXuObTcuST3n9R0Rsn0q104/UerKiYVJbnkFu7UR92X/iVbUvyfByJ5Q
BliWzeJzS9Nu1F8+b3jah7xl2Vwxb3jaMeZMLGTq6PQZMDcsHt3hfaWuxOabfvm7WgW+yed3OgEBOLbDdYtGoZ00fBf0GUwbc/rqC5Ikdc7WijhHGk0+tCiEcuzDxKrcQXzpX1AKxuC75Tv9PgEBCGcAfiLU1LRkOoRTys8PdCo+BzhU28pzK/ZjWQ5XLRjB8Hw/HY1U2Q4cqAnz/Ir9KEJw1YIRlOf5EKTuLb2wcj8tkSRXzhvBqOIA2hmGu84Uo4qJ1rD32P5CGv4ZV5AIjexyNQMhoKY5zt6KZnRNYWRxkJC3c+VDOvtzzCQZY88YKjHm53etMO/KN1YRi6VvpQJQF7Z4eE0z/315LqXZOo6VJLnlVeyGCjwXfRS1cMxZxdrTTvfaZRLqYV19sx6fWt2ZXsapHpuati063VPpbIyaKnAAy+r7t8hQ+WDqbTLGntGfklDScvjnqmaumOzjvHG+VMWDdU+i5o/Cff77EXrnyvT0pdO9dnlPKMO6MsR1qsemkkTPJ4p377kjSVLmLd0VpTxbZ9FYL1b1XhIbn8M1+5bUzLcBuPxBJiFJkqQB4lB9kj01Ce67Lh/r0DuYO5fiuezTaMXjMx1at8kkJEmSNAAkLYeXt7Zy97wsjCNrMQ9uwHvd11BCRZkO7azIJCRJkjQArN4XY0SeweTkZsxDm/Be93UUf+9uANkX5BRtSZKkfq4pYrHxcIxby49i7l+H99qvDIoEBDIJSZIk9XvL9kS5aKSDb++reK76PIo/N9Mh9RiZhCRJkvqx2haTQ/VJFrc8i/v8D6DmlGc6pB4lk5AkSVI/tvpAjAtzjuAdcQ76yFmZDqfHySQkSZLUT7XEbPZVx1kkNuCed0emw+kVMglJkiT1U5uPxDnXc4TQ/JsQhufMTxiAZBKSJEnqhxwcth2JMD9wGG3MvEyH02tkEpIkSeqHasM2ihVn7OyFCDF4P6oH7yuTJEkawA7WRJmiH0AfxL0gkElIkiSpX6qoj3FOmQ+huzIdSq+SSUiSJKkfqo0qjDtn4BYm7ayMJKGnn36aq666issuu4y///3vmQhBkiSpX8tWwviGTc50GL2uzwuYVlVV8cADD/D4449jGAZ33HEHc+fOZcyY/rUToCRJUiYVuaIItz/TYfS6Pu8JLV++nHnz5hEKhfB6vVx++eW88MILfR2GJElSv1YUHBqbHPR5EqquriY/P7/t64KCAqqqqvo6DEmSpH4tNzT4e0GQgeE427bbbUHrOE6Xt6Tt6l7tfa2/xwcyxp4iY+wZMsZ0paV5A+Lncrb6PAkVFRWxZs2atq9ramooKCjo0jFqalp6Oqwek58f6NfxgYyxp8gYe8ZQibGrCUVzefr9z6WzTvfa+3w4bsGCBaxYsYL6+nqi0SgvvfQS559/fl+HIUmS1K/5s0KZDqFP9HlPqLCwkM997nO8973vJZlMcssttzB16tS+DkOSJKlfM/wBnEwH0QcyMv3i2muv5dprr83EqSVJkgYG1WAoZCFZMUGSJKkf6uqErYFKJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY2QSkiRJkjJGJiFJkiQpY7RMB9AdiiIyHcJp9ff4QMbYU2SMPUPGOHQJx3GcTAchSZIkDU1yOE6SJEnKGJmEJEmSpIz5/+3dfUxT1x/H8XdHwYc4o2YiCxqIj9ElMjIzrZph0VTKpRSND+BSVOYDxojTPxAXo5nBiMYI4rbMROOzMZqABiLowiRTMRAWlcQsmcEBoozh3FZB0pb2/P4w9if57UEz/d1b/L7+u6cHzufeE/j2XMq5UoSEEELoRoqQEEII3UgREkIIoRspQkIIIXQj
RUgIIYRupAgJIYTQjRQhIYQQugmpIlRUVMT+/fuDx263m1WrVmG32/n444/p6OjQMR2UlZWRnJyMzWbj5MmTumZ5XmdnJykpKbS2tgJQU1ODw+HAZrNRWFioczr44osv0DQNTdPYvXs3YLyM+/btIzk5GU3TOHz4MGC8jM/s2rWLvLw8wHgZXS4XmqbhdDpxOp3cunXLcBm//fZb5s+fj91uJz8/HzDedexTVAhwu91q8+bNavLkyaq4uDjY/vnnn6sDBw4opZQqLS1V69ev1ymhUj///LOyWq3qt99+U11dXcrhcKg7d+7olueZmzdvqpSUFPXee++pe/fuqe7ubpWQkKBaWlqUz+dTWVlZqrq6Wrd8165dU4sXL1Yej0d5vV6VmZmpysrKDJWxtrZWpaenK5/Pp7q7u5XValU//PCDoTI+U1NTo6ZOnao2bdpkuLkOBAJq5syZyufzBduMlrGlpUXNnDlTtbW1Ka/XqzIyMlR1dbWhMvY1IbESqqqqIjY2luXLl/dqr66uxuFwAJCSksJ3332Hz+fTIyI1NTVMmzaNIUOGMHDgQObOnUtlZaUuWZ535swZtm3bRmRkJAANDQ3ExMQwatQozGYzDodD15zDhw8nLy+PiIgIwsPDGTNmDE1NTYbK+OGHH3Ls2DHMZjO//vorfr8ft9ttqIwAv//+O4WFhWRnZwPGm+u7d+8CkJWVRWpqKidOnDBcxm+++Ybk5GSioqIIDw+nsLCQAQMGGCpjXxMSRSgtLY1Vq1YRFhbWq/2XX35h+PDhAJjNZgYNGsSjR4/0iNgrC0BkZCTt7e26ZHnejh07mDJlSvDYaDnHjRvH+++/D0BTUxMVFRWYTCZDZQQIDw+nuLgYTdOwWCyGu44AW7duZcOGDQwePBgw3ly73W4sFgtffvklR44c4fTp0zx48MBQGZubm/H7/WRnZ+N0Ojl16pThrmNfY6hHOVRUVLBz585ebaNHj+bIkSMv9PVKKd56S5+6GggEMJn+u9W7UqrXsVEYNeedO3dYvXo1ubm5hIWF0dTUFHzNKBlzcnJYuXIl2dnZNDU1Geo6nj17lnfffReLxUJJSQlgvLmOj48nPj4+eLxgwQKKi4v54IMPgm16Z/T7/dTX13P8+HEGDhzImjVr6N+/v6GuY19jqCJkt9ux2+0v3D8yMpKHDx8SFRVFT08PXV1dDBky5PUF/BtRUVHU19cHjzs6OoK3wIwkKiqq1wc4jJDz+++/Jycnh88++wxN06irqzNUxsbGRrxeLxMnTmTAgAHYbDYqKyt7rcz1znjhwgU6OjpwOp388ccfPHnyhPv37xsqY319PT6fD4vFAjz9ZR4dHW2ouX7nnXewWCwMGzYMgDlz5hhurvuakLgd91cSEhI4d+4c8PSHcMqUKYSHh+uSZfr06Vy/fp1Hjx7R3d3NpUuX+Oijj3TJ8nfi4uL46aefgrcdysvLdc3Z1tbG2rVr2bNnD5qmGTJja2srW7Zswev14vV6qaqqIj093VAZDx8+THl5OefPnycnJ4fExEQOHjxoqIyPHz9m9+7deDweOjs7KS0tZePGjYbKaLVauXr1Km63G7/fz5UrV0hKSjJUxr7GUCuhl7V+/Xry8vLQNI23336bPXv26JZlxIgRbNiwgczMTHw+HwsWLGDy5Mm65fkr/fr1o6CggHXr1uHxeEhISCApKUm3PIcOHcLj8VBQUBBsS09PN1TGhIQEGhoaSEtLIywsDJvNhqZpDBs2zDAZ/4zR5tpqtXLr1i3S0tIIBAIsWbKE+Ph4Q2WMi4tjxYoVLFmyBJ/Px4wZM8jIyGD06NGGydjXyJNVhRBC6Cakb8cJIYQIbVKEhBBC6EaKkBBCCN1IERJCCKEbKUJCCCF0I0VICCGEbqQIiZCUlZX1j/sEvkif2tpaUlJS/nG8CRMm/On3qqqqCm7373K5qKyspLW1tdf2NEKIvxbS/6wq3lzXrl17JX3+rdmzZzN79uzXPo4Q
fZWshETI2bx5MwBLly6lrq4Ol8uFw+EgNTU1uI3T833a2tq4fPky6enpzJ8/n1mzZlFUVPTS4xYVFTFv3jycTieXL18GoKSkhNWrV7+S8xLiTSQrIRFydu7cSUlJCUePHmXRokXk5uZis9lob29n4cKFxMTE9OozdOhQcnNzKSgoIDY2lvb2dqxWK5mZmS817siRI9m+fTs//vgjLpeLioqK13SGQrw5pAiJkNXY2IjH48FmswFP9++z2WxcuXKl199kTCYTX3/9NdXV1ZSXl9PY2IhSiu7u7pcaLyMjA4Dx48czZswYbty48epORog3lNyOEyHLZDL9z3NdlFL09PT0anvy5Anz5s3j9u3bTJo0idzcXMxmMy+7beLzz6oKBAKYzfIeToh/S4qQCElhYWFER0djNpu5dOkSAO3t7Vy8eJHp06cH+/T09NDc3ExnZyeffvopiYmJ1NbW4vV6CQQCLzVmaWkpALdv36alpYW4uLhXe1JCvIHkrZwISUlJSSxbtoyvvvqK/Px89u/fj9/vZ+3atUybNi3Yx+VysW/fPmbNmoXdbiciIoLx48czduxYmpubiYiIeOEx7927R1paGiaTib179+r2AEUh+hJ5lIMQQgjdyEpICODgwYOUlZX96WuffPIJqamp/+dEQrwZZCUkhBBCN/LBBCGEELqRIiSEEEI3UoSEEELoRoqQEEII3UgREkIIoZv/AP/kVwligiBHAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a visualization with pandas df\n", "sns.jointplot(data=pandas_tips, x=\"total_bill\", y=\"tip\", hue=\"sex\", hue_order=[\"Female\", \"Male\"])" ] } ], "metadata": { "interpreter": { "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" }, "kernelspec": { "display_name": "Python 3.9.10 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/sklearn.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating sklearn Modin Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Logistic Regression example taken / adapted from https://www.ritchieng.com/pandas-scikit-learn/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import numpy as np\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# From https://www.ritchieng.com/pandas-scikit-learn/\n", "\n", "url = 'http://bit.ly/kaggletrain'\n", "train = pd.read_csv(url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Pclass: passenger class\n", "# Parch: parents and children\n", "feature_cols = ['Pclass', 'Parch']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# you want all rows, and the feature_cols' columns\n", "X 
= train.loc[:, feature_cols]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now we want to create our response vector\n", "y = train.Survived" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 1. import\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "# 2. instantiate model\n", "logreg = LogisticRegression()\n", "\n", "# 3. fit \n", "logreg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "url_test = 'http://bit.ly/kaggletest'\n", "test = pd.read_csv(url_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# missing Survived column because we are predicting\n", "test.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = test.loc[:, feature_cols]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 4. 
predict\n", "new_pred_class = logreg.predict(X_new)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# kaggle wants 2 columns\n", "# new_pred_class\n", "# PassengerId\n", "\n", "# pandas would align them next to each other\n", "# to ensure the first column is PassengerId, use .set_index\n", "kaggle_data = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId')\n", "kaggle_data.to_csv('sub.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save train data to disk using pickle\n", "train.to_pickle('train.pkl')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read data\n", "pd.read_pickle('train.pkl')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# From https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html\n", "\n", "import numpy as np\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import Normalizer\n", "ct = ColumnTransformer(\n", " [(\"norm1\", Normalizer(norm='l1'), [0, 1]),\n", " (\"norm2\", Normalizer(norm='l1'), slice(2, 4))])\n", "X = pd.DataFrame(np.array([[0., 1., 2., 2.],\n", " [1., 1., 0., 1.]]))\n", "# Normalizer scales each row of X to unit norm. 
A separate scaling\n", "# is applied for the two first and two last elements of each\n", "# row independently.\n", "ct.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction import FeatureHasher\n", "from sklearn.preprocessing import MinMaxScaler\n", "X = pd.DataFrame({\n", " \"documents\": [\"First item\", \"second one here\", \"Is this the last?\"],\n", " \"width\": [3, 4, 5],\n", "}) \n", "ct = ColumnTransformer(\n", " [(\"text_preprocess\", FeatureHasher(input_type=\"string\"), \"documents\"),\n", " (\"num_preprocess\", MinMaxScaler(), [\"width\"])])\n", "X_trans = ct.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# From https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html\n", "\n", "import numpy as np\n", "from sklearn.impute import SimpleImputer\n", "imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n", "imp_mean.fit(pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]))\n", "\n", "X = pd.DataFrame([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]])\n", "print(imp_mean.transform(X))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# From https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html\n", "\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "X, y = pd.DataFrame(np.arange(10).reshape((5, 2))), pd.Series(range(5))\n", "X\n", "list(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.33, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(X_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"train_test_split(y, shuffle=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Linear Regression example taken / adapted from https://github.com/chendaniely/2021-07-13-scipy-pandas/blob/main/05-models.ipynb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tips = sns.load_dataset(\"tips\")\n", "tips = pd.DataFrame(tips)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.get_dummies(tips, drop_first=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import linear_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 1. create the model object\n", "lr = linear_model.LinearRegression()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 2. 
fit the model object\n", "lr.fit(X=tips[[\"total_bill\", \"size\"]], y=tips[\"tip\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# look at the coefficients\n", "lr.coef_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# look at the intercept\n", "lr.intercept_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tips_dummy = pd.get_dummies(tips, drop_first=True)[[\"tip\", \"total_bill\", \"smoker_No\"]]\n", "tips_dummy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lr2 = linear_model.LinearRegression()\n", "lr2.fit(X=tips_dummy.iloc[:, 1:], y=tips_dummy[\"tip\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lr2.coef_, lr2.intercept_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_data = tips_dummy[[\"total_bill\", \"smoker_No\"]].tail() # not really new data\n", "new_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# use the model to give predicted tip values\n", "new_data[\"predicted_tips\"] = lr2.predict(new_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(new_data)" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, 
"nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/statsmodels.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating statsmodels Modin Interoperability\n", "### Currently statsmodels is not completely interoperable with Modin. All the examples in this section are taken/ adapted from https://www.statsmodels.org/devel/gettingstarted.html or https://www.statsmodels.org/stable/index.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import statsmodels.api as sm\n", "import pandas\n", "import modin.pandas as pd\n", "from patsy import dmatrices" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Example with sm.OLS()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n", "modin_df = pd.DataFrame(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']\n", "\n", "modin_df = modin_df[vars]\n", "\n", "modin_df[-5:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df = modin_df.dropna()\n", "\n", "modin_df[-5:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=modin_df, return_type='dataframe')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y = pd.DataFrame(y)\n", "X = pd.DataFrame(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mod = sm.OLS(y, X) # Describe model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res = mod.fit() # Fit model\n", 
"\n", "print(res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "sm.ols() is not interoperable with Modin currently." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Example with sm.ols(formula=)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df = pd.DataFrame({\"A\": [10,20,30,40,50], \"B\": [20, 30, 10, 40, 50], \"C\": [32, 234, 23, 23, 42523]})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import statsmodels.formula.api as sm\n", "result = sm.ols(formula=\"A ~ B + C\", data=modin_df).fit()\n", "print(result.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(result.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Replicating statsmodels workflow with pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import statsmodels.api as sm\n", "\n", "df = sm.datasets.get_rdataset(\"Guerry\", \"HistData\").data\n", "pandas_df = pandas.DataFrame(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']\n", "\n", "pandas_df = pandas_df[vars]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df = pandas_df.dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y = pandas.DataFrame(y)\n", "X = pandas.DataFrame(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mod = sm.OLS(y, X) # Describe model" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "res = mod.fit() # Fit model\n", "\n", "print(res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Example with sm.ols(formula=)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df = pd.DataFrame({\"A\": [10,20,30,40,50], \"B\": [20, 30, 10, 40, 50], \"C\": [32, 234, 23, 23, 42523]})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import statsmodels.formula.api as sm\n", "result = sm.ols(formula=\"A ~ B + C\", data=pandas_df).fit()\n", "print(result.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(result.summary())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/tensorflow.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating Tensorflow Modin Interoperability\n", "## All the examples in this section are taken/ adapted from https://www.tensorflow.org/tutorials/load_data/pandas_dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "import modin.pandas as pd\n", "import pandas" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "SHUFFLE_BUFFER = 500\n", "BATCH_SIZE = 2\n", "\n", "csv_file = tf.keras.utils.get_file('heart.csv', 'https://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", "\n", "modin_df = pd.read_csv(csv_file)\n", "modin_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "target = modin_df.pop('target')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "numeric_feature_names = ['age', 'thalach', 'trestbps', 'chol', 'oldpeak']\n", "numeric_features = modin_df[numeric_feature_names]\n", "numeric_features.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf.convert_to_tensor(numeric_features)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "normalizer = tf.keras.layers.Normalization(axis=-1)\n", "normalizer.adapt(numeric_features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Replicating Tensorflow workflow with pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SHUFFLE_BUFFER = 500\n", "BATCH_SIZE = 2\n", "\n", "csv_file = tf.keras.utils.get_file('heart.csv', 'https://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", "\n", "pandas_df = pandas.read_csv(csv_file)\n", "pandas_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "target = pandas_df.pop('target')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "numeric_feature_names = ['age', 'thalach', 'trestbps', 'chol', 'oldpeak']\n", "numeric_features = pandas_df[numeric_feature_names]\n", "numeric_features.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf.convert_to_tensor(numeric_features)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "normalizer = tf.keras.layers.Normalization(axis=-1)\n", "normalizer.adapt(numeric_features)" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/jupyter/integrations/xgboost.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Demonstrating XGBoost Modin Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## All the examples in this section are taken / adapted from https://xgboost.readthedocs.io/en/stable/python/python_intro.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import xgboost as xgb\n", "import modin.pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_train = pd.DataFrame(np.arange(36).reshape((12,3)), columns=['a', 'b', 'c'])\n", "label_train = pd.DataFrame(np.random.randint(2, size=12))\n", "dtrain = xgb.DMatrix(data_train, label=label_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_test = pd.DataFrame(np.arange(12).reshape((4,3)), columns=['a', 'b', 'c'])\n", "label_test = pd.DataFrame(np.random.randint(2, size=4))\n", "dtest = xgb.DMatrix(data_test, label=label_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}\n", "param['nthread'] = 4\n", "param['eval_metric'] = 'auc'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "evallist = [(dtrain, 'train'), (dtest, 'eval')]\n", "num_round = 10\n", "bst = xgb.train(param, dtrain, num_round, evallist)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bst.save_model('0001.model')" ] } ], "metadata": { "interpreter": { "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/modin-scikit-learn-example.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Process STDOUT and STDERR is being redirected to /tmp/raylogs/.\n", "Waiting for redis server at 127.0.0.1:35043 to respond...\n", "Waiting for redis server at 127.0.0.1:49923 to respond...\n", "Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}.\n", "\n", "======================================================================\n", "View the web UI at http://localhost:8889/notebooks/ray_ui93764.ipynb?token=23507892afd3d95e7604e7cd889b30382368ed888e79fc8c\n", "======================================================================\n", "\n" ] } ], "source": [ "%matplotlib inline\n", "\n", "import numpy as np\n", "import modin.pandas as pd\n", "import matplotlib.pyplot as plt\n", "import sklearn" 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATPRICE
000.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
110.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
220.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
330.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
440.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
\n", "
" ], "text/plain": [ " Unnamed: 0 CRIM ZN INDUS CHAS NOX RM AGE DIS RAD \\\n", "0 0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 \n", "1 1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 \n", "2 2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 \n", "3 3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 \n", "4 4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 \n", "\n", " TAX PTRATIO B LSTAT PRICE \n", "0 296.0 15.3 396.90 4.98 24.0 \n", "1 242.0 17.8 396.90 9.14 21.6 \n", "2 242.0 17.8 392.83 4.03 34.7 \n", "3 222.0 18.7 394.63 2.94 33.4 \n", "4 222.0 18.7 396.90 5.33 36.2 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(\"data/boston_housing.csv\")\n", "\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "modin.pandas.dataframe.DataFrame" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features = data.drop(\"PRICE\", axis=1)\n", "labels = data[\"PRICE\"]\n", "\n", "type(features)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", "lm = LinearRegression()\n", "lm.fit(features, labels)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXmcXVWV77+/qlxIhakC5iEUhOAEiiGkSQs2tAIOqAimIYqIttq0tO/52vGlDU9awpNu0Girrwd9KAqKIpMWUyvYQDtgMySEgBHSrULAYgqQYkoBlar1/jjnVE7dOufec4dz77n3ru/nU5+69wx7rzPcvfZea+21ZWY4juM4vUtfuwVwHMdx2osrAsdxnB7HFYHjOE6P44rAcRynx3FF4DiO0+O4InAcx+lxXBF0EJKOkPSHBs7/uqS/baZMCXWYpJel7DtZ0nV1lrtS0oWNSecUlWY+X0nzJT0jqb8Z5fUCrghajKT7JI2FL+rDks6XtGMO9XxA0i/j28zsw2b2uWbXlRUz+56ZvbnV9XaCEgnfgxfC9+IJST+VtH+75SoCYQdoMrw3T0vaIOmDaceb2f1mtqOZTbRSzk7GFUF7ONbMdgQOAhYDp7VZHqcYfCF8L4aAEeC8NstTJB4M783OwKeBb0h6VflBkma1XLIuwBVBGzGzh4FrCRQCAJK2l/RFSfdLeiQ05wwknS9phaTfhb2k30j6s3D7K4GvA68Ne1Gj4fbzJZ0VO/9Dkn4b9kCvlLRnbJ9J+rCk/5I0KumfJSnc9zJJP5P0pKTHJF1cJtobU86bNkoJ6/iopN+H5aySVOmdnC3p4vB6b5e0KFbWnpIul7RJ0r2SPhpufwvwv4ETw3uxTtKRku6KnftTSbfFvv9C0tJK5Yb7+mLP4HFJl0jaNdy3ILy+94fP8jFJn6lwbVOY2RhwCdPfiz5Jp0vaKOlRSd+RtEts/3GS1of3/N/DdyDad5+k5ZLulPSspPMk7S7px+G9/DdJc8NjZ0u6MLyeUUm3Sdo9Sc609y/c9wFJvwzf5c3hvXtrbP++4Tv0tKSfAi/KeG/MzIaBzcCrYvf5FEn3AzfEts0K69pV0rclPRjKMhyT4+2S7giv9VeSDswiR9dhZv7Xwj/gPuCN4ee9gLuAr8b2fxm4EtgV2Am4Cjg73HcE8IfYse8E9iRQ6CcCzwJ7hPs+APyyrO7zgbPCz0cBjwF/BGwP/CPw89ixBlwNDALzgU3AW8J9FwGfCeudDRye8bxpMoXH3hhe63zgP4G/TLlvK4FxYBlQAv4XcG/4uQ9YA3wW2A54CfB74OjYuRfGyhoAniNofErAIwQ98J3CfWPAbhnK/Rhwc/gctwf+H3BRuG9BeH3fCMtcBDwPvDLl+uLPZgfgu8C62P6/AH4byrAj8EPgu+G+V4TP/k3h9fxNeOx2sXfuZmB3gtHGo8DtBKPR2cANwBnhsX9F8M7NAfqBg4GdU2Su9v6NAx8Ky/nvwIOAwv3/AfxDeN9eBzwdf0Zl9RxB+N6Hdf1ZWPZ+sfv8nfC+DcS2zQrPuQa4GJgb3p/Xh9sXh/fikFDG94f3avt2txMtb5faLUCv/YUv2jPhi2/A9cBguE/hj+mlseNfC9wbfp76QaSUfQfwjvDzB6isCM4jMEVE+3YMf1wLwu/G9Ab+EmBF+Pk7wLnAXgkyVDpvmkzhsW+Jff8fwPUp17YSuDn2vQ94CPjT8Id8f9nxpwHfjp17Ydn+XwDHA4cC14VyvgU4ErgzPKZauXcDb4jt2yO8h7NijdFesf23Au9Oub7zCZTTKDBJoOQOjO2/Hvgfse/7xer6W+CSsnszAhwRe+dOju2/HPha7PtfA8Ph578AfhWvu4Z3u/z9+21s35zwfryYQOlvBXaI7f9++TOK7TsivCejwBNhPe8O90X3+SWx46Nts8JnMgnMTSj3a8DnyrZtIFQUvfTn9rT2sNTM/k3S6wl+AC8ieMn
nEfxg1oTWFAiUQ2L0g6Q/Bz5J8OJD0JhnGmIT9ORuj76Y2TOSHifoMd4Xbn44dvyWsHwIepyfA26VtBn4kpl9K3Zs2nlJPBD7vDGUq+qxZjapIIJqT4If/Z4KTWAh/QSNfRo/I1Ss4efNwOsJeu0/C4/Zp0q5+wA/kjQZ2z9B0POOqOVefNHMTpc0H/gJQWN/Z7hvT4L7E7GRoKHbvXxfeG8eIHiWEY/EPo8lfI/k+i6wN/ADSYPAhcBnzGy8XNgM79/UtZvZlvCdjo7ZbGbPll3P3uV1xHjQzPaqsP+BlO17A0+Y2eaEffsA75f017Ft21H5HexK3EfQRszsZwQ9wS+Gmx4j+FEeYGaD4d8uFjjJpiFpHwKzw/8EdjOzQeDXBIoDgsaxEg8S/BCi8nYgMIeMZJD7YTP7kJntSWBK+BelhIxmIP7jnx/KVfXY0JewV3j8AwSjpsHY305m9rZI5ISyIkXwuvDzzwgUwevZpgiqlfsA8Nay/bPNrOo9rISZ3U9gdvqqtvmHpj0vtvWqHynfF/pk9ibDs0yoe9zMzjSzVwF/Arwd+PPy4zK8f5V4CJgbvnPx62mEtPf9AWDXUKkl7fu7suc3x8wualCWjsMVQfv5CvAmSYvMbJLgx/VlSf8NQNKQpKMTztuB4OXfFB73QeDVsf2PAHtJ2i6l3ouAD0o6SNL2wN8Dt5jZfdUElvROSVHvbHMox2SFUyqxXNJcSXsTNH7ljuc4B0s6PnQCfpyg934zgcnlaUmfljQgqV/SqyX9cXjeI8ACTXdE/4qgx/0a4FYzW0/QmB4C/Dw8plq5Xwf+LmwUkTRP0jvqvA/TMLOfEjTwp4abLgI+ETpZdyR4Xheb2VYCs9Yxkt4gqQR8Krw3v6q1XgWO9IUKYvCfIjA/JT3bau9fpWvbCKwGzpS0naTDgWNrlTVjXQ8BPyborMyVVJL0unD3N4APSzpEATtIOkbSTnnIUmRcEbQZM9tEYHP/bLjp0wSOvpslPQX8G0GDVX7eb4AvETjdHgEWAjfFDrkBWA88LOmxhPP/jcC2fDlBD+2lwLsziv3HwC2SniFwbH/MzH6f8dxyriBwyN5B4NSrFDJ5BYFTcjPwPuD4sAc7QdBzPYjAtv4Y8E0giqq5NPz/uKTbAUKzxO3AejN7Idz/H8BGM3s0PKZauV8Nr/86SU8TKKVD6rsNiawC/iZU1N8iMNv8PJTlOQLbPma2AXgvgcP/MYJG9djYddXCi4HLCJTA3QSjo++WH5Th/avGewju1RPAGQS/gbx4H4FCu4fAOfxxADNbTeDM/ieCd+q3BL6NniPy4DtOy5FkwMvN7LftlsVxehkfETiO4/Q4rggcx3F6HDcNOY7j9Dg+InAcx+lxOmJC2Yte9CJbsGBBu8VwHMfpKNasWfOYmc2rdlxHKIIFCxawevXqdovhOI7TUUjaWP0oNw05juP0PK4IHMdxehxXBI7jOD2OKwLHcZwexxWB4zhOj5Nr1JCk+wgWYJkAtprZEgVL+V1MkMP8PuBdKbnCnQIwvHaEVddu4MHRMfYcHGD50fuxdPFQ9RMLTLOvqZX3qJl1tfrZNqu+qJyR0TH6JSbMpv7PKfUxtnUSM+iXOOmQvTlr6cKG6y8/98j953HjPZua+hzi1zPU4t9arjOLQ0WwxMwei237AsFCEedIWkGwctCnK5WzZMkS8/DR1jO8doTTfngXY+MTU9sGSv2cffzCjlUGzb6mVt6jZtbV6mfbrPqSyqnGew+dz5J9dq27/ix1NvM5NFpmHElrzGxJtePaYRp6B3BB+PkCYGkbZHAysOraDTNe0LHxCVZdu6FNEjVOs6+plfeomXW1+tk2q76kcqpx0S0PNFR/ljqb+RwaLbMe8lYERpCrfY2kaIGN3cPFIiBYym73pBMlnSpptaTVmzZtyllMJ4kHR8dq2t4JNPuaWnmPmllXq59ts+qrR74Js4bqz1pnM59DI2XWQ96K4HAz+yPgrcBHYis
DAWCBXSrRNmVm55rZEjNbMm9e1RnSTg7sOThQ0/ZOoNnX1Mp71My6Wv1sm1VfPfL1Sw3Vn7XOZj6HRsqsh1wVQbR2a7ji048IlgV8RNIeAOH/R/OUwamf5Ufvx0Cpf9q2gVI/y4+esWBax9Dsa2rlPWpmXa1+ts2qL6mcapx0yN4N1Z+lzmY+h0bLrIfcoobChan7zOzp8PObgf9DsLTf+4Fzwv9X5CWD0xiRk6qbooaafU2tvEfNrKvVz7ZZ9cXLqTVqqN76k2SPooYiGeL2/FquKe16uiZqSNJLCEYBECic75vZ30najWCx7fnARoLw0ScqleVRQ47jJNHO8OZOiKrLGjWU24ggXMx8UcL2x4E35FWv4zi9QXlDPDI6xmk/vAuorVdeqfxKSqZSJFJRFEFWfGax4zgdSZ4hsJGSGRkdw9imZIbXjkwd001Rda4IHMfpSPJsiLMomW6KqnNF4DhOR5JnQ5xFyXRTVJ0rAsdxOpI8G+IsSmbp4iHOPn4hQ4MDCBgaHCiUo7gWOmKpSsdxnHLyDIFdfvR+iRFB5Upm6eKhjmz4y3FF4DhOx5JXQ9yNc2gq4YrAcZzCUYT0593S28+CKwLHcQpF3vMDnJm4s9hxnELRjenPi44rAsdxCsPw2hFGumiiVqfgisBxnEIQmYTS6MSJWp2C+wgcxykElVbrKg/dLIIzuZtwReA4TiGoZPqJT9RyZ3LzcdOQ4ziFIM30MzQ4kDnrp1MfrggcxykEWVNGtDLr5/DaEQ475wb2XXENh51zw7Tso92Em4YcxykEWWfz7jk4kBhZ1Gxnci+ZoFwROI5TGLLM5s2aB6hRumnhmWq4InAcp6NoVR6gblp4phquCBzHyZU8Qj1bkQeoVSaoIuDOYsdxciPLko/1lpu3E7ebFp6phisCx3FyI83OvvLK9XWXmZdyKaebFp6phpuGHMfJjTR7+ujYOMNrR+pqVFvpxO2VVNQ+InAcJzcq2dM/fvEddZl1esmJ2ypcETiOkxvV7On1mHXyXLS+nF6ZUOaKwHGc3Fi6eIi5c0oVj6k1PUSrnLit8kUUAVcEjuPkyhnHHjCj4S5nZHQsc8+7VU7cXspp5M5ix3FyJT4BLG3RGcHUviypHFrhxO0lX4SPCBzHyZ2li4e4acVRfOXEg2aMDgRY2fFF6Hm30hfRblwROI7TMpLMOuVKIKLdPe8kX0SpXzz7/Naucx67achxeoSirOpVbtY57JwbCpnKoTyn0eCcEs88t5XRsXGgu7KR+ojAcXqAIkfAFDmVQ2TSuvecY5iz3SzGJ6ePX4pgwmoGrggcpwcocgRMp6Ry6GbnsZuGHKcHKHoj1gmpHLo5G6mPCBynB+ilCJi8KLIJq1FcEThOD9DNjVir6BQTVj3kbhqS1A+sBkbM7O2S9gV+AOwGrAHeZ2Yv5C2H4/QyrVrVq9vpBBNWPbTCR/Ax4G5g5/D754Evm9kPJH0dOAX4WgvkcJyeptMbsaKEv3YjuZqGJO0FHAN8M/wu4CjgsvCQC4ClecrgOE7nU+Tw124gbx/BV4C/ASbD77sBo2a2Nfz+ByBRpUs6VdJqSas3bdqUs5iO4xSZIoe/dgO5KQJJbwceNbM19ZxvZuea2RIzWzJv3rwmS+c4TidR9PDXTidPH8FhwHGS3gbMJvARfBUYlDQrHBXsBfjYznGcinRzDH8RyG1EYGanmdleZrYAeDdwg5mdDNwILAsPez9wRV4yOI7THTQz/LVXVh2rhXbMI/g08ElJvyXwGZzXBhkcx+kgmhXD707nZGSWlgS2OCxZssRWr17dbjEcx+lw0jKdDg0OcNOKo9ogUb5IWmNmS6od57mGHMdJpBvj9t3pnIynmHAcZwbdakLxnEvJuCJwHGcG3Ri3P7x2hC0vbJ2x3XMuuWnIcXqOLCafbjOhRCOccuU2OFBi5XEHdLzJq1FcEThOD3H68F187+b7p9YJTltucZeB0tSSjHF2GSi1QsymkzTCAdh
h+1k9rwTATUOO0zMMrx2ZpgQikkw+UnIZaduLTreNcJqNKwLH6RFWXbthhhKIKG8QR7fMHA1U2l503ElcGVcEjtMjVOr9ljeIaQ2kAYv/z3UdFz3kC/NUxhWB49RIp6YoSGvcBTMaxOVH70epP9kOtHnLOB+/+I6OUgjdvLpYM3BnsePUQHn0SZqztYgsP3q/GZEzAk4+dH6y7FWSDmzeMj517VD81c86fWGePHFF4Dg1UCm+vuiNTC3LVa66dgPjk9XTz4yNT3DmVet5bnyyI5WjE+CmIcepgU6PPlm6eIjlR+/HnoMDPDg6xqprNySad2q5ns1bxrtu8lmv4SMCx6mBTs+Ln9W0lXadtdCocuzGXEdFxUcEjlMDRY8+qebIzpo6Iuk6a6UR5dituY6KiisCx6mBIkefZGk803rpI6Nj0xRH/DrroVHl2I25joqMm4Ycp0aKGn2SxZE9OKfE5pRJYeVmoqWLh1i98QkuvPn+qnUPlPrYdYftm2bG6XRfTKfhisBxuoQsjWe1dajKFcdFtzyQqe6tk9ZUG36n+2I6jaqmIUmvkHS9pF+H3w+UdHr+ojmOUwtZ0ig8mZBIrpy44pjIuILh+IQ11WxTdF9Mt5HFR/AN4DRgHMDM7iRYjN5xnAKRpfHM0qOOH9NfQ5a5ZpptiuyL6UaymIbmmNmtmv5CzFzdwXGctpJlwljS7OI45YrjpEP2zuQjgOabbYrqi+lGsiiCxyS9lHDCuaRlwEO5SuU4Tl1UazyjfWdetX7KaSyCH/dQguI4a+lCfnT7CM++kKw4IpLMNj4PoHPIogg+ApwL7C9pBLgXeG+uUjmOUxdZGt/VG5+Ylk7a2NaQJzXUW6oogW2lTJdj+WXrGJ8Ito+MjrH8snWAp50oIlUVgZn9HnijpB2APjN7On+xHMeplSyzhrMsTrPq2g2MjI7RLzFhNvW/EmPjkyy/dFtDf+ZV66eUQMT4hHHmVetdERSQLFFDfy9p0MyeNbOnJc2VdFYrhHMcJztZJmFVWpwmUhxR2GbU+GeOHJrcFjmUNlchbbvTXrJEDb3VzEajL2a2GXhbfiI5Tm9T73oHWeYRVIrs6ZdSncgQ+BLqlcEpNll8BP2Stjez5wEkDQDb5yuW4/Qmta53EPcJ9KWYcOLRPGkTtUT1nn+WcUFU1+BAidGEOQuDA6UMpTitJsuI4HvA9ZJOkXQK8FPggnzFcpzepJYcO+W5hdIa8mef3zo1qkiaaxAtTtNoI13q01Tk0MrjDqDUpxn7Vx53QEN1OPmQxVn8eUl3Am8IN33OzK7NVyzH6U1qybGTpDSSGB0bnzGqSIosuubO9KjwgVI/s0t9qTb+wYESK487YKr8WhbBcdpPplxDZvZj4Mc5y+I4PU8tOXZqscfHcwilzTUYreDIHRufYPtZfZT6NS0aaKDUnzjj1+cQdBapikDSL83scElPM908KMDMbOfcpXOcHuPI/efNCO9My7FT6+Ix1RRHtfJGx8Yp9Ym5c0qMbhmf0cBHjf/I6NjUJDXwpSs7gVQfgZkdHv7fycx2jv3t5ErAcZrP8NoRLl8zMqPXdcLByT34WheP2aWKDyBLeeOTxpztZnHvOcdw04qjpimBeOhppXkKTvGo6CyW1C/pnlYJ4zi9TJLN34Ab79mUeHyUmG3unGxO3mdf2FoxFLU80Vsa5SOL4bUjfOqSdVX9FR5aWlwqKgIzmwA2SJrfInkcp2fJunpYnKWLh1j72TdnUgblqaKT5issXTzETSuO4t5zjkldnSzur4hGAlkmnflaAsUli7N4LrBe0q3As9FGMzsuN6kcpwepZKNPSxcROWSzzf3dpmyyzFdIylRa7q/IGrnkawkUmyyK4G9zl8JxnKopouORP+UNeVaiXnm1+QqRgtlloMTsUl+icxgqm3sqZTV1ikVFRSBpKfAy4K5a5w5Img38nGAW8izgMjM7Q9K+wA+A3YA1wPvM7IV6hHecbiIee19pZBAdU6s
SiPfKK5mh4gpmdGycgVI/Xz7xoMSGPG0U0y/xpXct8sa/Q0j1EUj6F+ATBA325yTVOjJ4HjjKzBYBBwFvkXQo8Hngy2b2MmAzcEpdkjtOF7J08RDLj94vdWUwEZh1agkbBdhhu23x/sNrR+hLKT8p31CliJ+0VdFcCXQWlUYErwMWmdmEpDnAL4DPZS3YzAx4JvxaCv8MOAp4T7j9AmAl8LXaxHac9pHnZKlqzlcjGA1kSQ0dJ1pT4PThuxLTUEPQgKeNMtJGED6DuDuopAheCKOGMLMtUg2Ll4ZI6icw/7wM+Gfgd8ComUVLXf4BSHxjJJ0KnAowf74HLTnFoNakcLVy5lXrM4VhZlcBAQasvHI9T46NJ57bL3H28QtTzVKVIn58ScnOp5Ii2D/MMQTBiPSl4fdoZvGB1QoPFclBkgaBHwH7ZxXMzM4lWBmNJUuW1PreO04uVHKy1tMYxkcXg3NKmfL1R41ymm0+baSQlA00YsKMT1x8B4NzSpT6xPjk9DQSHvHT3VRSBK9sViVmNirpRuC1wKCkWeGoYC8gW7J1xykAtSSFq0b56CKLEij1Bxk+V298InFR+UNfMpdf/e6JmkcMEIwaNm8Zp9QvBgdKPDmWHCnkdB+pisDMNjZSsKR5wHioBAaANxE4im8ElhFEDr0fuKKRehynldSSFC6NeE6eWhmfMD51ybrUXv99j49x8qHzE/MVVcoeWl7H089tTY0UcrqPLOsR1MsewI2hOek24KdmdjXwaeCTkn5LEJF0Xo4yOE5TSYuSyWo6Kc/JUw+VnMQPjo5x1tKFnHzo/KnIo36JEw4e4oxjD8icm2jCjNN+eFfm1dGcziZTGup6MLM7gcUJ238PvCaveh0nT+qNkmlkFFALew4OMLx2hItvfWDamsMX3/oAS/bZdcohHMm+5YWtqaOERnwfTmchqyEErV0sWbLEVq9e3W4xnALRSfnua5kFXOoXO2w3q6Jjt9K5q5YtYuWV61OXibzjjDfXJJuAe885pmZZnGIgaY2ZLal2XNURgaS7mJlV9klgNXCWmT1en4iOUx95h3A2m6yzgMtTMSxYcU1tFYW/0jQlkrQ9qivN7+CJ4nqDLKahHwMTwPfD7+8G5gAPA+cDx+YiWRPopF6jk51mh3DmTbWIorRVvoZqXHhmfNLqyvkf1VstwZzTvWRxFr/RzE4zs7vCv88ArzezzwML8hWvfsoX9o56je786nyaGcLZCir1qocGBxKVAFBXIzwyOsacUvrPOu39L1+LoJJcTveRZUTQL+k1ZnYrgKQ/BqLQg63pp7WXTus1OtlpRghnoySNNiHZiZyWzjnLWr8DpT7Gxiczy9UvsX2pny0p51R6/32GcO+SRRH8JfAtSTsS+I6eAv5S0g7A2XkK1wid1mt0spMlT36eJPkoll+6DsTUwu5JfotqZsqkckv9Spzpm+ZzmDCruAi9v/9OElUVgZndBiyUtEv4/cnY7kvyEqxRitBrdPKh3YnOkkab8YY6Ij4CzdLbTix3wthhu34mxyeZMJuaE3DjPZtS/QcSpAUDStuc0IMDJVYed4CPApzqPgJJ20t6D/AR4GOSPivps/mL1hiNTvxxiku7gwBq6VVnOTZaMjKtYX/2hYlpcwIuXzPCkfvPS50clqCTEveNjo2z/NJ17jdzMjmLrwDeQeAPeDb2V2jc+dWdFCEIoJZRZbVj65lpPDY+wdXrHmJ2BacwBL3/qc8px9QbaeR0F1l8BHuZ2VtylyQH3PnVfRQhCCDJR1Hq0zQfAWQbgdaz0hhUziQ6hcF94WSwfSvMSXC/gZNlRPArSQtzl8RxMlCEIICk0eaqdy5i1bJFNY9AK8k9NDjA4ECpbjnjo5FKIxP3mzlZRgSHAx+QdC/B8pOZ1yNwnGZTlCCAtNFmraOStOsZGhzgphVH1b1IfZSuOmL50fux/LJ100YsEIxk3G/mZFEEb81dCsfJSLtDRyPKHdZH7j+PG+/ZVLMDu9r
1JEVIVUoUBzB3Tokzjp0eDRR9PvOq9VPnetSQE5GqCCTtbGZPAU+3UB7HqUi7QkfLVxJ75rmtUyGjI6Nj0xaJSct9NLx2JLEhPvv4hdO2bz+rssX2mAP34PI1I9OUhwhSDZXnK4pTyWfW7kgsp72kZh+VdLWZvT00CRnTAw/MzF7SCgHBs486+VOpIazXPNMvMWk2NWK46NYHmEiI7ZxT6mN8wqbNRYga9rllSgeCEUN8LkF0bHz/2ccHbr0sjXvS9aXNfHY6i6zZRz0NtdPTlPfSI+INYaUY/3YR+RDSZBscKPH81slMjXtaGVEdTueSVRFkmVB2WJhOAknvlfQPkuY3Q0jHaSdRTzjJ3j42PsHKK9dz+nBjq4nlRSRTWtTR6Nh4aphtOUWIxHLaS5bw0a8BWyQtAj4F/A74bq5SOU4LqBbDPzo2nrhAfBGIlqGsNVoqqXFPK8PDSnuHLFFDW83MJL0D+CczO0/SKXkL5jh50aplI/Nkwox9V1zD4JxSYlK6tIXqo6UsyyOeyp3Pno6lt8iiCJ6WdBrwXuB1kvqA+me5OE4CrYpaqdfxW0QMZjT2USQSJC80c+T+82ZkOL3w5vsZKPUxd06J0S3jHjXUg2RRBCcC7wFOMbOHQ//AqnzFcnqJVi49WW9Kh07h+a3BOgRpYbZp1x+seSC+fOJBrgB6EI8aqoLHV+dP1qiVtGdRyzPad8U1Mxbg7jYqRftUu36PFOoumrl4/dNsC1PejsAs9IyZ7dKYiMWn0xZJ71SyRK2kPYvVG5+YZt+u9ozSUjp0E5Wifapdv0cK9SZVo4bMbCcz29nMdgYGgBMIIom6nkqZLp3mkSVqJe1ZXHTLAzU9oyP3n9egtMWnUrRP0jodWc91upcsPoIpLLAjDUs6A1iRj0jFweOrW0OW/EFpvdiJFNNm9IyG146w8sr12dI2dwHVon2Scg5lPdfpXrKYho6Pfe0DlgDP5SZRgShKpstuJ0v+oH4ptdFPIgqTXH7pusRlJLuRfilTWogo55D7v5yIqs5iSd+Ofd0K3Ad8w8wezVGuabTLWew5WIrDggoLq5Qv5h49o06fK1APgrod6U730TRnsZl9sDkidR7tXiS9CBSo2fxuAAAcaklEQVSlIRmqkLc/CoscGR2jX2JsfCLR9NELxJfvrNWR7hSHVv/usowI9gL+ETgs3PQL4GNm9ofcpCrDk861hyKNiKrJUoSJYpH5qlYzVjlfOfEgPnXJutQySv0Co6rJK00ODxEtNs383TUt6RzwbeBKYM/w76pwm9PlFClqKml5yPgPowgTxaJGtxElAMG1Vipj1bJFrHrntmUxq8lTjgc7FJt2/O6yRA3NM7N4w3++pI/nJZBTHIoWNVVpYZVuadzmzgmyt1QyhUX3IPqfNiEvbUTgwQ7Fph2/uywjgsfD9NP94d97gcdzk8gpDHlnpRxeO8Jh59zAviuu4bBzbmB47UjdZWTpg8+dU6KvUhe6zZT6xRnHBnmCkuL9S/3i2ee3zrhfSccOlPo56ZC9E7d7iGixaUc22CyK4C+AdwEPAw8By4CedSD3EmkNTDMaksgOOjI6Ns3BWYsyiJdRjYFSP8+PT1DUSNJ+iVXLFk3r7cdNYXPnlMCC1Njl9yvNbHbW0oUVzWlOMcnzd5eG5xpyKpJX9EIzVsWqtHLY3DklzODJsW3ZND9+8R0NyZwXWRyBvopYb9Gs313D4aOS/hHSR9xm9tGapXI6jkp2+UaoZAfN+iNIK0PA2s++Gdj2g/pEAZSABHvuMjAV5jphNhX+CkFjn3bNRfPXOPmS1+8ujUrO4ngX/EzgjFoKlrQ38B1gdwKFcq6ZfVXSrsDFwAKCyWnvMrPNtZTtdD6Dc0qJcf6Dc0qZE/2lzfzuk1iw4hr6RKFMQWYk9t6zJDf0We5OnqT6CMzsgugP2Bz/Hm6rxlbgU2b2KuBQ4COSXkWQo+h6M3s5cD09kLPImc7w2hGeeW5r4r5a1tpNS6A
WRcoUSQlAYMYpZ3jtCJ+6ZF3Va26H3djpHbImnav5J2VmDxE4lzGzpyXdDQwB7wCOCA+7APh34NO1lu90Lquu3ZA6GSrNZZVkAimf+d3X4ESuvFmw23RFEI0EssT7+yx3J09qyj5aL5IWAIuBW4DdQyUBQSTS7innnAqcCjB//vz8hXRaRj127bgJJMmHABTWGRxx0++e4PThuzhr6UKg+iS4crNPq+3GTu+QahqS9LSkpyQ9BRwYfY62Z61A0o7A5cDHzWzaeWFa68TukJmda2ZLzGzJvHndn0O+l6jVrh03gSSFnS6/bB2fLLgSiLjolgemPlcKe3Wzj9NKKvkIphakMbNZsc87hYvUVEVSiUAJfM/MfhhufkTSHuH+PYCWZTF1ikG1xVHiZEklMT5hTDZdynyIzEDDa0dS00NkTSftOM0iN9OQJAHnAXeb2T/Edl0JvB84J/x/RV4yOMUk3qhXmwxWHmXT6eGSfao8/0HAl961yJWA01Ly9BEcBrwPuEtSNG7/3wQK4BJJpwAbCWYtOz1G3N79qr/9MVvGZ/bpo7w7cTp+zWGrbBIqrqvb6WaypJioCzP7pZnJzA40s4PCv381s8fN7A1m9nIze6OZPZGXDE5n8PfHHxikVo4Rz7sTp4hrDvdXSGBUvieLCavWVBuO0ygtiRpyWktRFpPJSi2hkTfes6nV4lVkTqkvcTTTCNEcgiI/M6e7cEXQZWSZpdoKGWpVRFlDI4vmI6imBOo19RTtOmuh0zoiTo6mIac9tHsxmWZkFa1Udp8KnEe6iXRq6og8n7+TH64Iuox2JyfLSxFVm4VbROpVWUlzCJqxdkMraHdHxKkPVwRdRjsWtYiTlyKqdynKUp+moo8aWZSmnlNPPnR+5vkSEUlrBnRSL7vdHRGnPlwRdBntTk6WlyKqpyERcOJr9uaMYw9gaHCgoSR0BgwOzAxnTWNocGBqYZj+jOYsEURFldvTO6mX3e6OiFMf7izuMtqdnGz50ftNc1ZDcxRRPfMHDLh63UNcfNsDjE+0zqQUv97ovpffkyQMuPDm+7l63UPTFtTppF52Xs/fyRdfocxpGlG0SNLCK40qovJoqKIxNDhQUfEOrx2pKyneQKmf2aW+xLUbonqLFpXjUUPFoeEVyhwnIssPu7yhnjCb6gk2oxGoJS1FPYigRz43ZcGcSiQtF5mWIbVWxsYn2H5WHwOl/kQl2I7w4Gp4ltTOw30ETkWyOipbYcdeuniIm1YcxX3nHNOU8iLb/dDgACcfOp+hwQFGqyiBLP6XtHs2UKrv5/bk2PjUIvRJFNVf4HQOrgicimRt4Jtpx84SKlmL4zaJwYESvzv7bXzlxIN49vmtXHjz/VMNd6Vzzj5+4bS6Zyc07mn3bHaNEUQRew4OTCnBNLdzEf0FTufgisCpSNYGvlnRIllGIKcP38XoWG3mmzilPrHyuAOm6spSVnQOwPNbt80m3rxlfIZ8afes2mgjifIRh0flOHngisCpSFoD0ydNa/yaFbZabQQyvHaE7918f01lxhkaHGDVO4M0z1nmJijDOeUjpEqNdZp5p7zOSNZoTkE0ShoZHZsxKvCoHKdR3FnsVCQpHBACZ3DcSdmssNU0R3C0fdW1G+rO3yOmr29QzZyS5ASuJh8E92z5petmrMv8YDjKiRzTaVhZ3eWO+HgZ/dI0RdQOJ61HCXU+rgicikQ/6E9dsm5GeofyLJnNiBbpT1mAPnLsNmIL36XMr1BpbsJAqZ8j95/HYefcMK2BqybfFAnGfIv9r6YM4teZNAqJyohkaVf0UBGSHDqN46YhJ5G4w3bVtRtSc/w020mZVk+0vRFb+OjY+DTnc9qSmXPnlDjh4CEuXzMyzVfx8YvvqChfVO6qazdUncAW9frTTEXx60y7x+U1tCN6qJNmPTvp+IjAmUFSLy+tB9sMJ2XctJDW444azDRTVVaSeqxJ8f5JI6BqROVmVY7RaKT83iY5iLPOnWh19FAnzXp
20nFF4MwgzRSRRKMrhiVNRCsnKWXDmVetnzHxq5q5JSJu0io3ZzWS5TQqt9Z0GNVs/knKr5JibqXNPu1aPYqps3DTUM50SvrgOLU0Yo2uGFYtcicpGyfAU2NbZxwbNahZSOux1pvlNGJkdCzV5FSJNJv/8NoRli4e4oSDh6b8EP0Sf/LSXROjtI7cf15LM5W2O8mh0xxcEeRIJ6UPjhheO1JTyuVGTQCVzi9PUTG8doSDzryuoq0+sr1DggM3RlKPdXjtSNPSV1SaCZxGms1/eO0Il68ZmbrmCTNuv/9JTjh4iKHBgakQ17OPX8iN92xqqc1+6eKhqWuNy+GO4s7Ck87lSBT3XU5SWGJRSJM5jUavpVp9UflZk86Vy5N0XmRWGYr5BFZeub6hSWppMlSqPwsiiHZKki2SvzzRX1o59zYpNYfTOXjSuQLQiY60aj30etMLp9mt02Luy+XJYrIp9Ytnn9/KviuumWEbjxrLeCM8MjrG8kvXMQlMVFisoJaGOy4zJPs0Zpf62DppmVJjD1ZIgheNMCv5VyLcZu9Uwk1DOdKJ6QDSZIuG/PWYACqZyJYuHmLH2en9kUieaspTYWs9OjaeWMdNK45iaHBgRoM+PmkVlUBQeNVLTJQ5znOxRe7HxiczKYGBUj+VBuyRYzlLOW6zdyrhiiBHOtGRVknmqEG995xjuGnFUZntwNVizSvl4InuVTXlKZgxqhgbn+DMq9ZPfa9nJDY0OFCxMU6SY2R0bFpgQD0O6EjRPlnBXJUlsslt9k4WXBHkSCc60vKQuZqJLK2RHxwoTdVbLRInrVO/ecv4VINcz0jsyP3nZV5qEqabnaIRSa0KKEqFsXTxUKrMc+eUqjqjI19Fkd83pxi4s9jJnWpO8ySH6kCpn7OPXwhsm/A1OKeEWZCfv6+CY7SWerKce+T+87iwSqK7ND9C1FjX64Cvdm/Sric6xpVAb5PVWewjAid30nrzW17YOmXDTxqFANN8C5u3jPP81km+fOJBTNbQgYl65FE9tfDg6BhnLV3Iew+dPy2O/7CX7jpN3jRpHqxxXkG56bDSCC2+L5ILOmPk6RQLHxHkTCdmZsxD5uG1I4khmpV6rpVCSyuFSpZTHlJaT4hs2j2Ir9Ncqe4saxYPDpRYedwBhX8/nM7BRwQFoFMnlOUh89LFQ+yw/czooMihmzT7upJtPUkJlPpFqW+6PT/JOZ/UQy/1iVJ/si8g7R7E71US5akxqvka4gveOE4rcUWQI52YmTFPmdMa9s1bxhMVTxbnbr+0bfGYZYtY9c5FVR3dSeaWVe9cxKpli2paF7hSNFBS3dVGMEV/N5zuxSeU5Ug3TShrhsxZk7FFDeLyo/dj+WXrKsbcT5rNmDGbxbSStnbC0sVD7LvimkSbf/k9SLsn5QvgRAxluP4ivxtO9+IjghzppgllzZC5Fqfpg6NjgTlpu8p9lXrWRK6WBDDrPaj1XmW5/sE5pY5LUuh0Pq4IcqTbJpQ1SpJJZrBs1bCIqDGtNKGqVrmy+j+y3oMFuyU3+Gnby6N8yj0GpX7xzHNbO8qn5HQHbhrKkWat41sP9Ub+tFrmty/ag8vXjKTmMEozJ/VLNYVIDq8dybTcJmS/Bzf/fnNiXWnbo7Lj2VTjdTz7/NYZUVVJ8jVKJ0ayOfni4aNdSKVJSJV+8Hk3EGlynXDwEDfesymx3nqvpVq9cerNzLlgxTWp++6ro7w030QzM4c24346nUPbs49K+hbwduBRM3t1uG1X4GJgAXAf8C4zS+8+OXVRKfIn7cfeikXI0+S68Z5NqamsmzFCqZbrp17/R6W5DNFEuVpoxWpf9bwbTveTp4/gfOAtZdtWANeb2cuB68PvTpOpJ/Ina9hoIyuu1RuRVG+yuyzlC+r2f5x0yN6p++oJA22FT6kTI9mc/MlNEZjZz4Enyja/A7gg/HwBsDSv+nuZeiJ/sjQQjU42a1cUVaXyjfpHPGctTU9XUX4/kxRo+TYg9yS
FnRjJ5uRPq6OGdjezh8LPDwO7px0o6VRJqyWt3rSpsXVxe416epZZGohGJ5u1K4pq+dH7pS4pUOtyklnPj9+3JAW6/NJ1LL9s3QylCjQ0+qlGJ0ayOfnTtvBRC7zUqZ5qMzvXzJaY2ZJ58+a1ULLOp55U0lkaiEbNCu1Ky7108RAnHzp/hjJoRgOY5b4lKdDxhBXKWjGzuBNTozv50+rw0Uck7WFmD0naA3i0xfV3BM2I3kmbOVvpeKjslG2GM7NWuZrFWUsXsmSfXVOvL89w21rs762w1bfrGTjFpdWK4Erg/cA54f8rWlx/4WlF9E4a1RqI5Ufvlxh6mLdZoZ5GOu2ctAyitdzzpLLTop4ge2qN6FjHaTV5ho9eBBwBvEjSH4AzCBTAJZJOATYC78qr/k6lyOF9zZxslrVxr0cxZjknXj/MtFGm3fN65ElSoKU+gZhmHmqGUvXJYk495KYIzOyklF1vyKvObqDo4X3lveoo8qXW3nrWxrQexVjtnKwrlSXd83rkSVOgSdsaabTbOZp0OhtPMVEwWjGpqFnU2/DU0pjWoxirnZN1Mfmke97IXIi0bKfNosijSafYeNK5gtFJ4X31hpPW0pjWE/de7Zyso6uke17kOPyijyad4uKKoGAUNbwvaUJUvQ1PLY1pPYqx2jlZGu25c0qJ97yoinp47Qh9KSugFUFJOcXGTUMFpGjhfWkmoME5JTZvmZkmOqnhiTsxdxkoUepXJkdpPQ7qauckOW/jDJT6OePYA+oqu9J1x49tplM3ej5JeY+KoKSc4uPZRzuUVkaHpC32PjhQ4vmtk1UzWSY5Z0t9YsfZsxjdMs6egwMcuf+81AykeVCumCSmZGlW3ZWyrSal3q535Jf2fPolvvSuRYXqVDitpe3ZR538aHV0SJqp58mxcb584kFVFVLazNo5281i7Wff3JZol1aMutJ8KBfd8kCmdRGykvZ8Js1cCTiZcEXQgbQ6OqRSJFOWBrWeKJ5uiHZJu+601NX1OnU7KdLMKSbuLO5AWh0d0qiDtN4onjyup5E02rWSdt39TXbqFtWB7XQOrgg6kFaHMDYayVRvFE+zr6fRNNq1knbdJx2yd1Mb7qJGmjmdg5uGOpB25PxpxKZeTxRPHtfTahNUpeuulACv3rq84XfqxaOGOpRuyynTiuvJsiZwt91Xp7fxqKEup9t6gK24nmpOVc/V4/Qq7iNweoZqvopGV2BznE7FRwRO4Wjm+gNxqvkqPFeP06u4InAKRV7rD0RUMkF5PL7Tq7hpyCkU9ZhnmmXS8Xh8p1fxEYFTKPJYfyArzVyBzXE6CVcETqGoxzzTTJNOt0VjOU4W3DTkFIo81h9wHKcyPiJwCkUe6w84jlMZn1nsOI7TpWSdWeymIcdxnB7HFYHjOE6P44rAcRynx3FF4DiO0+O4InAcx+lxOiJqSNImYGO75ajCi4DH2i1EC/Dr7C565Tqhd641fp37mNm8aid0hCLoBCStzhKm1en4dXYXvXKd0DvXWs91umnIcRynx3FF4DiO0+O4Imge57ZbgBbh19ld9Mp1Qu9ca83X6T4Cx3GcHsdHBI7jOD2OKwLHcZwexxVBE5DUL2mtpKvbLUueSLpP0l2S7pDUtelgJQ1KukzSPZLulvTadsvUbCTtFz7H6O8pSR9vt1x5IOkTktZL+rWkiyTNbrdMeSDpY+E1rq/1Wfp6BM3hY8DdwM7tFqQFHGlm3T4p56vAT8xsmaTtgDntFqjZmNkG4CAIOjLACPCjtgqVA5KGgI8CrzKzMUmXAO8Gzm+rYE1G0quBDwGvAV4AfiLpajP7bZbzfUTQIJL2Ao4BvtluWZzGkbQL8DrgPAAze8HMRtsrVe68AfidmRV99n69zAIGJM0iUOoPtlmePHglcIuZbTGzrcDPgOOznuyKoHG+AvwNMNluQVqAAddJWiPp1HYLkxP7ApuAb4fmvm9K2qHdQuXMu4GL2i1EHpjZCPBF4H7gIeBJM7u
uvVLlwq+BP5W0m6Q5wNuAvbOe7IqgASS9HXjUzNa0W5YWcbiZ/RHwVuAjkl7XboFyYBbwR8DXzGwx8Cywor0i5Udo+joOuLTdsuSBpLnAOwgU/J7ADpLe216pmo+Z3Q18HrgO+AlwBzCR9XxXBI1xGHCcpPuAHwBHSbqwvSLlR9i7wsweJbAnv6a9EuXCH4A/mNkt4ffLCBRDt/JW4HYze6TdguTEG4F7zWyTmY0DPwT+pM0y5YKZnWdmB5vZ64DNwH9mPdcVQQOY2WlmtpeZLSAYXt9gZl3X2wCQtIOknaLPwJsJhqNdhZk9DDwgab9w0xuA37RRpLw5iS41C4XcDxwqaY4kETzPu9ssUy5I+m/h//kE/oHvZz3Xo4acrOwO/Cj4LTEL+L6Z/aS9IuXGXwPfC80mvwc+2GZ5ciFU6G8C/qrdsuSFmd0i6TLgdmArsJbuTTVxuaTdgHHgI7UEOXiKCcdxnB7HTUOO4zg9jisCx3GcHscVgeM4To/jisBxHKfHcUXgOI7T47gi6AIkLZVkkvZvtyztRNIzLarnIkl3SvpEK+orEpKOaDTLrqQFkn5dXp6k4yR17SzuIuPzCLqDk4Bfhv/PaLQwSbPCxFU9Q9ZrlvRi4I/N7GXNKK9ohJOuZGYtz51lZlcCV7a6XsdHBB2PpB2Bw4FTCGY3R9t/IOmY2PfzJS0L105YJem2sFf7V+H+IyT9QtKVhDNpJQ2HCebWx5PMSTpF0n9KulXSNyT9U7h9nqTLw7Jvk3RYgrwfkPRDST+R9F+SvhDb90zs8zJJ58dk/5qkmyX9PpT1W+FaAeeXlf/lUN7rJc0Lt700rG9NeI37x8r9uqRbgC+UlTNb0rcVrL+wVtKR4a7rgCEFOfz/tOycaeVJ2jW8h3eGsh8YHpe2faWkC0IZN0o6XtIXQhl+IqkUHneOpN+E538x4R6vlPRdSf8R3uMPxfYtjz37M8NtCyRtkPQdgtnie5eV9xYFazPcTiyjZSjXoAIel/Tn4fbvSHpT2ruWRvhuRO/S+ZL+r6Rfhc98Wbi9T9K/hPL8VNK/RvucBjAz/+vgP+Bk4Lzw86+Ag8PPfwZcEH7eDngAGABOBU4Pt28PrCZIyHUEQYK1fWNl7xr+HyBoIHYjSNx1H7ArUAJ+AfxTeNz3CRLTAcwH7k6Q9wMEs3V3AWYDG4G9w33PxI5bBpwffj6fIJeTCBKIPQUsJOjIrAEOCo8z4OTw82djcl0PvDz8fAhBKpCo3KuB/gQ5PwV8K/y8P0GqgtnAAuDXKc9iWnnAPwJnhJ+PAu6osn0lwciuBCwCtgBvDff9CFgaPoMNbJsMOpggx0pgXfjcXhQ++z0J0oKcG97HvlDW14XXNAkcmlDW7PD8l4fnXQJcHe77OkEK9lcDtwHfCLf/F7AD6e/a1D0keO+i8j4Qe2bnEyTC6wNeBfw29l78a7j9xQQ5dZa1+3fY6X9uGup8TiJYSAWCxvIkgsbxx8BXJW0PvAX4uQULc7wZODDWi9qF4Ef+AnCrmd0bK/ujkv4s/Lx3eNyLgZ+Z2RMAki4FXhEe80bgVQrSUADsLGlHMyu33V9vZk+G5/8G2IegsanEVWZmku4CHjGzu8Lz1xM0LHcQNGYXh8dfCPxQwYjpT4BLY3JtHyv3UjNLytJ4OEGDjZndI2ljeJ1PVZEzXt7hwAlhGTcoSBG8c4XtAD82s/HwOvsJMkkC3BVe59XAc8B5Cmzrafb6K8xsDBiTdCNBgsDDCZTB2vCYHQme6f3ARjO7OaGc/QmStv0XgIKkitHo8BcEimQj8DXgVAULwWw2s2crvGtZk6ENW2Ci+o2k3cNthxPc40ng4fDanAZxRdDBSNqVoEe5UJIRNBwmabmZPSfp34GjgRMJlAQEvbq/NrNry8o6gmBEEP/+RuC1ZrYlLKvaEn99BL3K56oc93zs8wTb3sN4vpPyuqJzJsvOnyT9PbZQplEzOyjlmGdTttdLo+U9D2Bmk5LGLew
GE16nmW2V9BqC5GnLgP9J8A6UU547xgie/dlm9v/iOyQtqFPunwMfIRj9fYZgFLqMQEFA+ru2IGP58ees1KOchnEfQWezDPiume1jZgvMbG/gXiCyXV9MkDDtT9nWs7wW+O8xe/MrlLzwyi4EPbstoU390HD7bcDrJc1VsOLTCbFzriNI2EZYdlrjm8Yjkl4pqY+gUamVPoJ7AvAe4Jdm9hRwr6R3hjJJ0qIMZf2CwOyGpFcQNHYbapQnXsYRwGOhPGnbqxKOcHYxs38FPkFgQkriHQr8HLsRmF9uI3j2fxGWgaQhhRkrK3APsEDSS8PvJ0U7zOwBAtPTy83s9wRmrf9FoCAg+7tWCzcBJ4S+gt3Da3MaxEcEnc1JBItRxLk83P5zgob5uwRmghfC/d8kMDHcrsBWsonA9lzOT4APS7qboAG8GYI1CST9PXAr8ARBQ/FkeM5HgX+WdCfBu/Vz4MM1XM8KAlPHJgJ78o41nAtBr/Y1kk4HHiUYCUHQ6H4t3F4iGB2tq1LWv4Tn3EWQtfIDZvZ8zLyUhZXAt8L7sQV4f5XtWdgJuELBAuwCPply3J3AjQQN9efM7EHgQUmvBP4jvI5ngPdSYQGTcGR5KnCNpC0ESmyn2CG3EIxECfedTaAQIPu7VguXsy01+AMEWUWfrHiGUxXPPurUTGT3D0cEPyJwqnbdwuediqSVBI73GRFF3UDs/duNoENymAXrSDh14iMCpx5WSnojgR3/OmC4zfI4vcXVkgYJouE+50qgcXxE4DiO0+O4s9hxHKfHcUXgOI7T47gicBzH6XFcETiO4/Q4rggcx3F6nP8PE+7o+52RjEYAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.scatter(data[\"RM\"], labels)\n", "plt.xlabel(\"Average number of rooms per dwelling\")\n", "plt.ylabel(\"Housing Price\")\n", "plt.title(\"Relationship between Rooms and Price\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "predicted_prices = lm.predict(features)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXucXHV999/f3QxkF5ANECgshKDwhIpIIlFi01ZJK1G5uAXlUrDYaqm9IYjR4ItHYh8o0VRR+/RGRYuFQgLBCKJGXhK0xodLQhIwEl5V7itKkKxCdiGb3e/zx5yzOTt7zpkzs3NmZmc+79drXzvnzLl85+zs7/v7fa/m7gghhGhfOhotgBBCiMYiRSCEEG2OFIEQQrQ5UgRCCNHmSBEIIUSbI0UghBBtjhSBGMPMvm1mFzZajnbEzO41sw8Fr883s+/W4Z6zzczNbFqNrqfvzxRFiqCFMbMnzWzIzF42s1+a2X+Y2b5Jx7v7u9z9hnrKOFUws2VmNhw8ywEz+5GZvTWPe7n7Te5+SkaZbsxDhuD6+v60CVIErc/p7r4v8CZgPnBF6QFWZEp+F+os+8rgWc4EfgjcbmYWI1NNZthNQkt/f0QR/fHaBHfvB74NvAHGTBFXm9l6YBB4bdQ8ERzz52b2qJm9ZGY/MbM3BfsPM7PVZrbdzJ4ws4sj57zFzDaY2W+CWeTn4+QJrntaZHtacL3wHguCWfeAmW0xs7dHjo2T/QNm9ngg6xNmdn5w7LhZc6k5JOm8Ms9yGLgB+C3gwOAa683sWjP7FbAsuPafBZ9zh5mtNbMjI3K8w8y2mdmvzez/AhZ57wNm9sPI9nFmdreZvRg800+a2TuBTwLnBDP2LcGx+5vZ9Wb2nJn1m9lVZtYZvNdpZv9gZi+Y2ePAqeU+a+QzN9X3R9QYd9dPi/4ATwJ/GLw+AtgK/J9g+17gaeA4YBpQCPZ9KHj/fUA/8GaKg9TRwJEUJw8bgU8BewGvBR4HFgfn/T/g/cHrfYEFCbJ9Crgpsn0q8Gjwuhf4FfDu4H7vCLZnJsi+P/AbYE7w/qHAccHrZcCNkfvMBjw4b5+k82LkHbsOsDewAng62P4AsBv42+C6XcB7gJ8Cvx3suwL4UXD8QcBLwHuD535pcP6HItf7YfB6P+A54DJgerB9UtxnC/Z9Hfi34LMdDDwA/EXw3oeBbcF34QBgXfgsptr3Rz+1/dGKoPVZY2YDFE0Z3wf+PvLef7j7Vnff7cVZbpQPAZ919we9yE/d/SmK/9gz3f3v3H2Xuz8O/DtwbnDeMHC0mR3k7i+7+30Jcv0XcIaZdQfbfwzcHLy+APiWu3/L3Ufd/W5gA0XFMEF2ioPoKPAGM+ty9+fcfWvG51PJeWcHz/IZ4ETgjyLv/dzd/zF4lkMUB91r3P3RQMa/B+YGq4J3A1vd/bbguX8B+EXCPU8DfuHun3P3V9z9JXe/P+5AMzsk
uPYl7r7T3Z8HrmXP3+Zs4Avu/oy7vwhcU/bpNO/3R9QQKYLWp8/de9z9SHf/q2CQCnkm5bwjgJ/F7D8SOCww2QwEg8QngUOC9z8I/C9gm5k9GDX/RHH3nwKPAqcHyuAMisohvMf7Su7xuxRn7BNkd/edwDkUB9/nzOwuMzs25bNVe96q4Fke7O6L3H1jnDyRz/DFiPwvUpwZ9wKHlcjvMeeHJP0d4jiS4sz8uch9/43iyoDS+wJPZbhmU35/RG1pJaeWqJy00rPPAK9L2P+Eux8Te0H3/wHOs6Lz8EzgNjM7MBh0S7kZOI/ihOQngXII7/Gf7v7nWWV397XAWjPrAq6iOMv8PWAn0B059Lcynlcppc/yGeBqd7+p9EAzO4biQBluW3Q75jrnJrwXd89XgYOCVUgpz5XcZ1bCdbPS6O+PqBFaEYgkvgx8zMxOtCJHB2aNB4CXzOwTZtYVOCDfYGZvBjCzC8xspruPAgPBtUYT7nELcArwl+xZDQDcSHGlsDi4/nQze7uZHR53ETM7xMzeY2b7UBwIX47cczPw+2Y2y8z2By7PeN5k+VfgcjM7LrjX/mb2vuC9u4DjzOxMKzqtL6ZEQUX4JnComV1iZnub2X5mdlLw3i+B2cGgibs/B3wX+JyZvcbMOszsdWb2tuD4VcDFZna4mc0Altbos8ZRj++PqBFSBCIWd78VuJriAP0SsAY4wN1HKNqt5wJPAC9Q/KffPzj1ncBWM3sZ+CJwbok5IXqP5yg6B38HWBnZ/wxFZ+snge0UZ5FLSP6+dgAfBX5O0QTzNorKhcC/sBJ4mKKT8ptZzpss7v514DPALWb2G+DHwLuC916g6ExdTtEJfgywPuE6L1F0lp9O0Y/wP8DJwdu3Br9/ZWYPBa//hKIT9ifADuA29pjU/h1YC2wBHgJur8FHjaUe3x9RO6xonhRCCNGuaEUghBBtjhSBEEK0OVIEQgjR5kgRCCFEmzMl8ggOOuggnz17dqPFEEKIKcXGjRtfcPeZ5Y6bEopg9uzZbNiwodFiCCHElMLMsmSPyzQkhBDtjhSBEEK0OVIEQgjR5kgRCCFEmyNFIIQQbc6UiBoSQoh2Ys2mflasfYyfDwxxWE8XSxbPoW9eb273kyIQQogmYs2mfi6//RGGhkcA6B8Y4vLbHwHITRnINCSEEE3EirWPjSmBkKHhEVasfSy3e0oRCCFEE/Hzgfj2C0n7a4EUgRBCNBGH9XRVtL8WSBEIIUQTsWTxHAqdNm5fodNYsnhObveUIhBCiCZjZMRTt2uNFIEQQjQRy+7YymjJvtFgf15IEQghRBMxMDRc0f5aIEUghBBtjhLKhBCiTmTJGJ7RXWDH4MTZ/4zuQm5yaUUghBB1IMwY7h8YwtmTMbxmU/+4405946Gx5yftrwVSBEIIUQeyZgyv27Y99vyk/bVAikAIIepA1oxhZRYLIUSLkjVjWJnFQgjRoixZPIeuQueE/YO7do/zEyxZPIdCR0lmcYcyi4UQYsrTN6+Xa848np6u8dE/OwaHJziNd4+OzyQu3a41UgRCCFEn+ub1ss/eE6P2o07jT97+MKXDvgf78yL3PAIz6wQ2AP3ufpqZHQXcAhwIbATe7+678pZDCCEmQ626hpVzBg8OlxaYIHV/LajHiuAjwKOR7c8A17r70cAO4IN1kEEIIaomaw5AFhrhDC5HrorAzA4HTgW+HGwbsAi4LTjkBqAvTxmEEGKy1LJrWJzTuKvQOeYMtriTUvbXgrxNQ18APg7sF2wfCAy4++5g+1kgv47MQghRA2oZ2x+ak5LMTF2FjlgzUFchv3l7borAzE4Dnnf3jWb29irOvwi4CGDWrFk1lk4I0U5M1r5/WE8X/TGDfrXmnL55vYn3H0rwBSTtrwV5moYWAmeY2ZMUncOLgC8CPWYWKqDDgVgjm7tf5+7z3X3+zJkzcxRTCNHK1MK+X86cU0taKqHM3S9398PdfTZwLnCPu58PrAPeGxx2IfCN
vGQQQohq7ftrNvWzcPk9HLX0LlasfYyzTuylt6cLA3p7urjmzOOrihoqRz2VTkgjylB/ArjFzK4CNgHXN0AGIUSbkNW+HzUf7d9VYOeu3QwHLSL7B4ZYvbE/t8E/SjkfQh7URRG4+73AvcHrx4G31OO+QgiRxb4fmo/ClUNcN7BwFZG3IoB0H0IeKLNYCDHliJptFi6/J9Xen8XUEmc+iiPPCqCNRB3KRFNTq2xOkT/1+luVzt5D5y8Qe78sppasA3wjk77yRIpANC2V/sOLxlHPv1Wa8zfpXqWmlnBFESqG/bsKZZvD5+2wjVLvCZBMQ6JpqWU2p8iXev6tkmbv/QNDmUxFceGkO3ftji39PKO7ULMooazmrFqWs8iKVgSiaWlEpyZRHfX8WyU5f4FxA2dI6cw6TmkNjzgzugt07zUtl1l4JSumalY8k0WKQDQttc7mFPlRz7/VksVzxg2qcQwNj7Dsjq28unt0wuCbdN7A4DCbPnVKzeWFygb3JCWXtL8WyDQkmpZGJNaI6qjn3yps8BImdyUxMDQcO/h2WvxZeU4wKlkxJcmXtL8WaEUgmpZGJNa0ArV0NGa9Vr3/VlHn78Ll91Q0Wx5xp6vQOU5J5D3BqGTFNOLx3ciS9tcCKQLR1NQ7sWYqETdIAzWL3qkmTDPpHnlGwcSZiroKnUwvdLBjcGIkUG/EV1CvCUaSjHHKpzdBafTmuGIxz1HL1Ir58+f7hg0bGi2GEE1D6SANxYFl72kdsWGQvT1drF+6qKJ7pM20eysYPJNkrWW5hixKsV73nawyvGLNI9x439MT9l+wYBZX9R1fkXxmttHd55c7TisCIaYgSc7HJEdoNY7GtIifSlYa9YiCSVuNRAffk4+dyYq1j3Hpys01MZvVasUUZd227RXtrwVSBEJMQaoJy1yzqT9xILpizSPcfP8zjLjTacZ5Jx2RGqYJ2QfzRoYBRwffWie95aXgGvG8FDUkxBQkKcKlIyWwJCm5KzRFhM7IEXduvO9pZh/YNSESqJQsg1Oz9OitddJbXgN2S/UjEEKUp5LiaVGSwjVHU1x+/QNDY/cI7zt76V2x9miA+x7fMRammUSWwalcaGm1z6BSaj1w5zVgL1k8JzbLOc+oJikCIRrEZEoJlMbS9/Z0cdaJvWUbnPcPDLHkti0suXVLWb/BiDt983pZv3QRXzhnbtV5AnGyhg7bepZTqPXAnWvuROkfMs/O9chHIETDKGeqKBdhUup8XLj8HrLEAIbNVrIQ+hWS8gTC+2bJM4jbX89yCpWEcGYhr9yJFWsfm/A3Gh5xlZgQohVJK55WjVMzD2di9L5xFTwn63ytp2M0j4E7jzyXRjiLpQiEaBBJUTmdZqmz5LgIn6v6ji8b5VMNabPzpNn8p+/cmnmwrXc9qamQoNiIGlvyEQjRIJJszEmlBPoHhsacu6URPleseST2eoVOm+h4jNmXRqUz1B2Dw5lt/pXa2evlWG4k7dK8XogpRaXlESZbn2fF2scqntnffP8zY1mncXb8pH1Z7nNYT1fsZ8q6AklbVVRirmmXRkWNqLGlEhNCpFBpeYRalFOIu0YWnlx+akXHZ7lvocPYa1oHO3eNl6Wr0MlZJ/ay8sFnMjmfDXhikvIllbyopnxGu6ASE0LUgCQ7+GWrtgDVNxWJzrD37ypgVqyHH87+rjnz+LH3s0zVwhLFkynuVjoT3b+rwM5duycogfAz3fXwc2QSjtrYt9upUZFaVQrRRCQNMiPuE2zfazb1J5pKotcpjZ0fGBpmx+DwhO5a65cu4tpz5mYKIT/vpCNiY/IvXbmZK9Y8Uvb8kDBv4Inlp7LP3tNSZ/s7BocZjslgK5W3VvbtZslQzptGtKqUIhAihbRBZmh4hEtWbmbh8nu4Ys0j49ojpl0nbtVQet0Vax9jzaZ+Llu1peykO6xKGXddB2667+mqBpFqZ9oOscljk6VdGhU1ole3TENCpJClLWL/wBA33fd04oBdOlhl
GWD7B4a4ZOXm1GMMuPacuWODbNJ1HapKRkpzBte65HUW2qVRkfIIhGgywkHmslVbUjtEpc3aX9ldXDlctmpLpqqeWXHG+yjSrpt1ECn1XRQ6bYJ5qKerwLIzjgPi6/3nOUOfCnkAk6UReQRSBEKUIRx4qonkAQj1Rxjzf8zB+8QOsJXSacZRS+8amxkvWTyHS1dujlVKWQaR0qihgaFhCh3GjO7COEd2nIO8lWfo9abWpTCyoPBRITISzpZrMZvvAEYnL9IYYYjqhqdenGCmyhq+qvDM5qFWUUNZw0elCISokGrj/MvR29PFzld3x9rdszCju0D3XtPoHxii04wR99SWkqWDTZKCi8sBqHd4o6gO5RGItmPNpn6W3bF1bCCd0V3gytOPq/kAFXVapg2elU6x+geGKHRWX294x+DwWLP2Efcxc0LWLN0kmUvNSu2S4dtOaEUgWoI1m/pZcuuW2Lh22NNsHWpr067l6sBsjz+hViSZdZLMQKXKIM6sJBPS1EErAtFWrFj7WKISgKAhy61bwPbU46/FTLZ0dVDNSgCKpRzS5C8l630qDUUMcwDSFGU7Zfi2C7kpAjObDvwA2Du4z23ufqWZHQXcAhwIbATe7+678pJDtAdZBqG4gTatXERWwpDGpJlyOXp7uhjctXvMrFNK3Cz9rBN7E1tMRknLxk1aEZRbJTUivLFS5MOojDwzi18FFrn7CcBc4J1mtgD4DHCtux8N7AA+mKMMok2YzCAUVy6iGqqZEfd0FQASlQDA+QtmTcjUvaovvZcwpIccLlk8J7Z0RZh8lkazZ/g2okTDVCc3ReBFXg42C8GPA4uA24L9NwB9eckgWp+wPv1kQzprkcJfqTIqdBg7d+1Olb2nq8BVfceP1f9Zv3TR2Mw2aTCHYo5BWsho37zeRNNSOYWW1oO4GWhEiYapTq4+AjPrpGj+ORr4J+BnwIC77w4OeRaI/faY2UXARQCzZs3KU0wxRblizSOppR0qZbI27rhEoCRbfqcZ+06flroS6Cp0jmXwxtE3rzexDMVo0Hg+jd5JmHiaOcNXPozKybXonLuPuPtc4HDgLcCxFZx7nbvPd/f5M2fOzE1GMbUIVwBhp66kQbYaJmvjjpspn79gVqwZ5XNnn8BAihLIOstOMg/1dBfKdvJqdhNPtbRLldJaUnZFYGYfAb4KvAR8GZgHLHX372a9ibsPmNk64K1Aj5lNC1YFhwMy3IlUohm9WaJl0moCpVE6AFbicIzK2Gk2JuP8Iw9g/pEHxF4nKQ+hkjDMuFVIodN4+ZU9zuek6KhWLeLWiBINU52yeQRmtsXdTzCzxcBfAP8b+E93f1OZ82YCw4ES6AK+S9FRfCGw2t1vMbN/BR52939Ou5byCNqXvLJ4S+npKrD5ylNS72sUHbdhS8gsMubdzSy8TnQwT8pObqc4f0UNFallHkG4zn43RQWw1SzT2vtQ4IbAT9ABrHL3b5rZT4BbzOwqYBNwfYZriTalXO3+WnHaCYeWvW9Y23/+kQeMG1TSZAx7Fmx46sUJCqTcjLyS3sfR/UctvStWlmpt5FNxUG1mH0YzkkURbDSz7wJHAZeb2X5kqJfl7g9TNCOV7n+cor9AiLLUy8G3btv2TPeNq+2fRcYw5j9OGdS6UXst4/xVTqI9yOIs/iCwFHizuw8CewF/mqtUQgTUy8FXOnCm3beSY6PcfP8zmeWZTAhkLZ3ACsVsD7IoAgdeD1wcbO8DTM9NIiEiLFk8Z1KF2LJSGmmUNmjGHVs68MZRiRN7MiGQtYzzT8pxqEUpbtE8ZDEN/TNFU9Ai4O8oRg+tBt6co1yiTchifx6ZZAOXLJQO0mkx+nHHQno1UqgsrHWy5p1a2cjDctZx+0XrkGVFcJK7/zXwCoC776BoHhJiUmQpBbDsjq01beCSRKfZhFj7pBj9sDNYND6/b14v65cu4snlp3LBgvgEyPNOOiKzPM0S45+0iqk2RFc0J1kUwXAQ+eMwFhZa
j/9N0eJksT9X26SlUuLqDSWZfEbcU2vYXNV3PBcsmDU2a+4044KYsNM0mqWMQ5IyLFfnSEwtspiGvgR8HTjYzK4G3gtckatUoi1otlIAoRIKB9vS8M6OGDNJ6TkhV/UdX9HAH0czhEAqOas9KKsI3P0mM9sI/AHFnII+d380d8lEy9OM5YxLlVB0MK51fP5UoFWzj5udeuduZCkxsQDY6u7/FGy/xsxOcvf7c5NKtATlvsxZZpszuguphdlqTZoSakbFVQ+aYWXSTjQidyOLj+BfgJcj2y8H+4RIJIsjOIsd/MrTk6tv1ppSJRQWuAsdwycfO7MpHLiitWlE7kamEhMeKUjk7qNmphaXU4w8l5px1077Mldy3755vVy6anPNe/mW0mnGWSfumfnGzcpWb+znrBN7Wbdte9nnWO55T8WyDaI+NMJ3lmVAf9zMLmbPKuCvgMdzk0jUnDyXmknXTqq9E/0yp8m14akXufn+Z+oWpjjizuqN/WN1hJIU2bpt22MLt0UH9p7uAi+/snusNWbp81bZBpFGI0yQWUxDHwZ+h2K56GeBkwgaxoipQZ5LzaRrJyUcRb/MSed+8vaHufG+p+seqx59JpXMykrNYDsGhyf0Rw6vvWZTP5et2hL7uS9btUXtFEVsNn2h03I1QWaJGnoeODc3CUTu5LnUTLrGiDtdhc5UR3DSuYPDjUtTCWUqNyuLrgDiwkrjCGf+aUlaWhkIYGLTjZznRIkrAjP7ePD7H83sS6U/+YolakmeHZuSrhE6ftMcwc0YbRPKlJbZW7oCyLpy6TQrW1JbBd3EirWPTVhRDo96rt+LNNNQmCuwgWLf4dIfMUXIs1xBUvZt/8AQl6zczOCu3Vx7ztxxTddDTj525oTm642sYBN9JmkRTdX0SOgqdGZWGK2clyDK01TOYne/Mygtcby7fyw3CUTu5JkUVK7g2o7B4djGLGs29bN6Y/+4FW+WNpR5Eg70pRE9154zt+L+A4VOY5+9pvHroeGx533Zqi2ZlEEzrpRE/WiEszjVR+DuI2a2MLe7i7qRZ1JQeO2Fy+9JrL5Z2tkrqQNYI8ka0ZP0j9ppxqh7oqJNqmYaRXkJYsniOSy5bQvDkaq7DXcWA5vN7A7gVmBnuNPdb89NKjElSZsph529oHy55kbQ01UA0iOswoE9KSO6XFG43ioVSFaUm9BC1NlZnEURTAd+RbEfQYgDUgRiHEkz5ZD+gaEJM51moNBhLDujmMGcxT5bramtWgWSBeUmtA5pzuK8/pZZFMESd38hl7uLlmLJ4jksuXXLhC9xlGZTAh0G57zliLJmn1L7bDWmtjx9NbXK5BaNp6mcxWZ2OvAViv0IRoGz3f1HuUkipjzhgHP57Q8z1MBcgEoYdVj5wDNj/ou8yy7n5atptpLeonqazVl8NfB77r7NzE4CPgu8LTdJRNMRtTnv31XADAYGh1NnsuFAd9ynvsPOXZWFWDaK4VFn2R1bxw3SoR8jjP0P/RvNOrtu18qorUgjekCk5RHsdvdtAEHJ6f1yk0I0HaVJUwNDw+wYHB6rJHrpys3MLmnXGGWqKIGQaCe0cGUQjf1P6kbWLDRLa0sxeRrRnS5tRXCwmX00advdP5+bVKLhlEuaCi39reqUnGo2dzWQaS3q3QMiTRH8O+NXAaXbooWpxLYcHSBDc1IzUegw9p0+LbXBzYzuwrjtpM/fbGGvUdRARlRLWmbxp+spiGgcpSWU3SsPW/75wNCEEMZmoLdkZrxmU39ssk5pA5wkm7sF19CAK1oJNZhpQuqZGFQ6eFfbFvKwnq6qavDkSW9P14TeAVlNKEsWz+HSlZtj83qa1TwkRLVIETQZ9U4Myjp49wRRQzsGhyfUBAqdkllKKNSLQsf4lPxKlWvfvN7Ez6OQTNFqSBE0GfV2UmYZ1AzYfOUpY9txg+qGp16suWzV0l3o4O/PfGNq28ksyjWpJIRCMkWrkZZQ9tGk90BRQ3lR78SgcmUhwmOi
RJ2SoVKotxPVgP27CuPCPkNm7LP3uAG+WuXaiHhuIRpB2oogjBCaA7wZuCPYPh14IE+h2pksiUFrNvXz6Tu3jtnze7oKnHbCoRU1VQ+TpUbcy5Z/3rHz1VgHaSOdw+cvmMVN9z0d+16oNMspqXLKVSGZol0oGzVkZj8A3uTuLwXby4C7yl3YzI4AvgYcQnGcuc7dv2hmBwArgdnAkxRLV+yY1KdoIcrNQuOiXgaGhrkxMigmmT5KB+4wWapchNDg8ChLbt0ytl1pi8ZaM6O7wFV9x7N647OxpSz27ypkUlJZTDxZQjJV9VNMdbL4CA4BdkW2dwX7yrEbuMzdHzKz/YCNZnY38AHge+6+3MyWAkuBT1QmdutSbha6Yu1jmQq3xZk+JhPVE5ZheHX36ARFUm9OfeOhrNnUn1jPyCzbZz352JkV37vciqpVE+xEa5NFEXwNeMDMvh5s9wE3lDvJ3Z8Dngtev2RmjwK9wHuAtweH3QDcixTBONJmoZX4CkqPnayfIc4e3wjWbdvOum3bE98fGBxmIEMYbNo14si6omrmDGQh4iirCNz9ajP7NvB7wa4/dfdNldzEzGYD84D7gUMCJQHwCxJWF2Z2EXARwKxZsyq5XcsRNT1UYo4p9Su0Clmd2+WOq1QxVrKiUoipmEpkDR/tBn7j7l81s5lmdpS7P5HlRDPbF1gNXOLuvzHb057c3d3MYkc1d78OuA5g/vz5zVXEvo4kzULL0VXo5ORjZ6a2j2xFDMb8KbXwEUSpZHBXiKmYSpRVBGZ2JTCfYvTQV4ECcCNQtpexmRUoKoGbIq0tf2lmh7r7c2Z2KPB8tcK3A0mz0A4r1tKH+Kihk4+dyeqN/ZlmsI1uGh9H1oimKEYxmihqkklqGB9VGFnJEmoLCjEVU48sK4I/omjWeQjA3X8eOH9TseLU/3rg0ZKcgzuAC4Hlwe9vVCp0K5A10iRpFuoOTy4/NfG6NyaEVsZeK7vYdeNn17wbKH6epMG8lGvPmTvuGe5plDN+ZRCnMLIQF9EVvaYzsbaREFOBLIpgV9SEY2b7ZLz2QuD9wCNmFubqf5KiAlhlZh8EngLOrlDmKU+WTNdwQE8a/uJMD81Y9K1awryFvnm9XJqhdEVvT1dioxyoTS5AXNOaEXcN/mLKk0URrDKzfwN6zOzPgT8DvlzuJHf/IcWJUhx/kF3E1qNcpmuWAT0u9DHvom+dZpx30hGs27Y9d7/Dp+/cWraPcEg5U0wtyzOr1LNoRdI6lAHg7v8A3EbR1j8H+JS7fylvwRrJmk39LFx+D0eldOCaDOXKSGQZ0Fc+8Mw4udZs6s9tcO4qdHLBglnsN30aN973dF2cz9EqqHHdt8IZRj26NwnR6pRVBGb2GXe/292XuPvH3P1uM/tMPYRrBKUtGvNoUZgUURLuzxKdMjzqYw1gQpnzYmh4hBvve7pheQTR1n1QXJnIHi9E7SirCIB3xOx7V60FaRbSzDa1olx/2ayhh5WsIKYaPV3jO4al9RG+Ys0jua7ghGh1EhWBmf2lmT0CHGtmD0d+ngDym342mHpU/yzXnDpOUcTRYcaaTf1TPnmp9EtY6DCWnXHchOOSlPRNgbkqrxWcEK1OmrP4v4BvA9dQrAchNtkvAAAUEklEQVQU8pK7N0/x+RqTpfpnpSSFiiaZNEqjU5IYcefy2x9JLMecha5CJ2ed2DuWg9DTXeDlV3YzPFqfoNJ99urk6j86fnKhtCXbKvEgRGWkVR/9NfBrM/si8GKk+uhrzOwkd7+/XkLWk1rXoK+2KUppzf9o2ekoQ8MjTC900FXorMo8FOdorWePgZ27Rtjw1IsTWkrGkTWhC1TiQYhKyOIj+Bfg5cj2y8G+lqSc2SYka2RRLXwOffN62fSpUxJjcXcMDjO90JH4fhJpsffrly4ac87mzY33Pc3sDPb9rCYzUIkHISohSx6Bue9J63T3UTNr6RaX5WLFK5nlJ81gq5ltp82Iq2k6X64Mc737
EJdbLZUmh+3fVeClV3czUmLGKu1XLIRIJ8uK4HEzu9jMCsHPR4DH8xasmalklt9p8fP0uP3lVhmVzIizEM7EX3f5t7hizR7/fyhHlozeWlNutRSuVp5Yfir77D1tghIA2Hf6NPkHhKiALDP7DwNfAq6g6Jf7HkF56HalksiipBo5pfuzrDL65vWy4akXK6ojBMUZ8m53ksr1jLiPXXP+kQfkVqai0GmZmupEn2NaTaakv0OWXgRCiD1kySx+3t3PdfeD3f0Qd/9jd2/riqHlEsKiJNnZS/dnXWVU2kzFgHPeckSmynI33vc0l6zcnIsS6O3pYsV7T2Dh6w4oe2z4HMsl91XydxBCJJOWR/Dx4Pc/mtmXSn/qJ2LzUS4hrJpjs64yKo2GceCuh59LrvpUBwzGooIeevrXqcdGn0055VjJ30EIkUyaaejR4PeGeggylaikomXWY7PmL1QSQhlSjSM5K12FjsTewSHhZ0jKgO40Y9Q9s+kn3F/LyqJCtDNpeQR3Br/L9iduN7L2EgjJUrFyyeI5LLltyzgbeqFzYvTLksVz+OiqzdQp3wvYk3S2euOzY4N+h8EfnzSrrE8hOkNPGthH3XkiprdCFuWoaqBCTJ5ERWBmd5JiWXb3M3KRqMmppJdAqaIoq0BKn3bC0w9n0KUUOgyMTA7ZrPQG3c7WbdvOK8OjiYXews/V013AHX49NDzhM1aatV3r5L5aUulkQIhmxjwhlMTM3ha8PBP4LYrtKQHOA37p7pfmL16R+fPn+4YNzWGhSuoB3NvTxfqli2J7CeyZUSe3jkxqyRhet9z9Q3q6CphN3hzUVejkmjOPByZ2+ArfSxr40hRhra7VSKr5HEI0AjPb6O7zyx2XZhr6fnChz5Vc6E4za45RuQFU00tgaHiEm+9/JrXdYtI7lTqLB4aGKXRW7xk2GDfgLlx+T6rDtnSQBsqumGptVqs35RoLCTHVyJJHsI+ZvdbdHwcws6OArO0qW45y5o2kgTpLz92k+2W5f5RqTUOlqw9I/jzhAF864E8vdKQOks04sFdKPSrUClFPsmQWXwrca2b3mtn3gXXAJfmK1RzEZfpW20sgKcO4HKX28HJlIcpxwYJZzOguTNifVJZh/66Jx0Jx5RA34CeZpPoHhlqmX4DyF0SrkSWh7DvAMcBHgIuBOe6+Nm/BGk1SMhNQcS+BrkIn5510RMXlIboLHaxY+9i4AbTShLIovT1dXNV3PFeeftxE81GCnkrSX9WsOVqlX4DyF0SrkaVVZTewBPgbd98CzDKz03KXrMEk2YE/fefWVBt337xezjqxd2wF0GnGWSf2clXf8ePaLWZhcHh0nCJacuuWqktDR2f8K9Y+NsF8NDzisTV+8ijXUOuOb/Uma4VaIaYKWXwEXwU2Am8NtvuBW4Fv5iVUM5Bk790xODxm/kgKHV29sX/MJzDizuqN/cw/ck9pBQN6ugu8MjxSNhkrSrXNYnq6Ciw747jMiVpRknwSM7oLvDI8mhgFFYa4ZnWCTzVawdchREgWH8Hr3P2zwDCAuw/S0IIF9SGrvbd0dpu0klh2x9ZxpqaiMrEJvXlrSaHT+MI5c9l85SnjBq1KbNxJZpArTz9uLLw0jjBJLGkFJHu6EM1DFkWwy8y6CMzCZvY64NVcpWoCKin5HJ3dJlbEHBqOVRBm1LS0dEh3oYMV7z0hdta6ZPGcYvJZhCRncZoZpG9eb9mBXvZ0IZqfLKahK4HvAEeY2U3AQuADeQqVB9WUhYDxMe87X90d2xs4OruttBbQwOAw154zt+atIb3coq307WC70v7K5bJ/VQ9IiOYnMbMYwMwMOBwYBBZQHC7uc/cX6iNekclmFtcqEzTLdZKOmV7oiA2tjMbux52blHGchbi8AEjOTu4OCshF75flOTVj9q8QogaZxQDu7mb2LXc/HrirZtLVmVplgmaZ3SYdAxNLNQAM7trNmk3942bdy+7YOrby6Oku8PpD92P9z16cIM8x
B+/Dk78aTEwgKzVTlWtKPxjjuM7ynOQ4FWJqk8U09JCZvdndH8xdmpyoZSZolkEv7ZjoIA9Fp3Fp5NGru0fHvf+jGCUAMLhrlBXvPYHLVm2JzVyOmqziVhtZmWoRPlqhCFEZWRTBScAFZvYksJPAWuHub8xTsFqSZLfv6S6wcPk9dRsw+ub1smLtYxP8DNFZd9zqJS0EM5S3XJXOpF4AWWjmCJ/SQf/kY2eOK+4XF+IrhBhPFkWwOHcpcibOoVnoNF5+ZXdqTkAelFudVDL7DgfoLCartOv29nQxuGt3rA/DmFjmolmIKwl+031PT1CcKggnRDpp/QimU2xcfzTwCHC9u++ul2C1JGsEUD0GjHJF65LeL3Ual874y5mskq6bVj7bgPMXzGraAbTS1ZMQIp60PIIbgPkUlcC7gM/VRaKc6JvXy/qli3hi+amsX7qIX8eEgUL+A0a5uPqk989fMGtSJQ3K3TcuX+Dac+ZyVV9y0lijqWb1JISYSJpp6PVBtBBmdj3wQCUXNrOvAKcBz7v7G4J9BwArgdnAk8DZ7r6jcrEnT6XdskKyOiLLxeMnXSOvuPusEU9Z7tMszthqV09CiPGkdSh7yN3flLRd9sJmvw+8DHwtogg+C7zo7svNbCkww90/Ue5aeXQoq7ZbVpZzWrmDVTN9trRucOu2bW+4ohKi0dQij+AEM/tNeD2gK9gOo4Zek3Zhd/+Bmc0u2f0e4O3B6xuAe4GyiiAP0mbISTPerPkIrdzBqpk+m7KWhagNaa0qa18ABw5x9+eC178ADkk60MwuAi4CmDVrVg6ixJtC0prTZ81HqEcHq3LmmbzMN83WnUvJbEJMnixF53LBizapxOoJ7n6du8939/kzZ06uK1clpM14s1btnEwHq7iuaHHHxDXNCY8t9/5kUHcuIVqPeiuCX5rZoQDB7+frfP+ypM144yJvjOJAGx20q624mXUAT1NWae8vu2Nr6v2zoGqiQrQe9VYEdwAXBq8vBL5R5/uXJW3GGw2xhPHRKdFBu9oOVuUG+JBqk9IGhoYnvSpQdy4hWo8smcVVYWY3U3QMH2Rmz1IsZ70cWGVmHwSeAs7O6/7V2siXLJ7Dktu2jCvkVui0cfH2ffN6Yyt4Rp2m1dius9rfq01KA2ri1JVdXojWIrcVgbuf5+6HunvB3Q939+vd/Vfu/gfufoy7/6G7x1dTmySTtpGXei5iPBl5OE2TViP7l3Qxy5KUloQybIUQpTTMWZwnWU0scSy7Y+uE3sDDoxMbu+fhNI3rHAawMyhVHVLOPNM3r5cZ3fEtMOXUFUKU0pKKoNrZ+ppN/bEdyOLOPfnYmROafE3Wado3r5d9p0+01g2PTFREpSUzSk01V55+nJy6QohMtKQiqHa2nrZiKK3tv3pj/ziLkQFnnTh52/lATAVQqNykI6euECIruTmLG0m5PrpJpA225Wr7O7Bu2/bU62dxYFdbAykOOXWFEFloyRVBtbPhpMF2RnchU23/NEWS1YGtOH0hRL1pSUVQLUmD8JWnHzduX6WmpzWb+rls1ZZMDmyZdIQQ9aYlTUNp9YLKNWGH8kXMKjE9hbLE9RSG+FVENSadZikNLYSYerSkIphMhcyszenD+/QPDNFpNm52X64SaZSehDDPSqhW8QkhBLSoaagWyV7lir/1zesdMyWFs/04u3+5eyYsFCpiMnkTQgjRkopgssletSr+luWeSS0zK6HZSkMLIaYWLakIJht5U6vib0myRKlFpq9KQwshJkNLKoLJRt5UUvwtjuj+UJa4kg+1CgtVyKkQYjK0pLMYJpdMVS6pK4zQ6R8YytQoPZQlr8getWwUQkyGxOb1zUQezevTSGvQDkx4L1QGvRqAhRBNRC2a17ctaTPshcvviS0v0dvTxfqlixogrRBCTA4pggSSTEtTJUJHCWZCiKy0pLM4T6ZChE6ezeuFEK2HFEGFTIUIHSWYCSEqQYqgQkpDU2d0F9h7WgeXrtwcm4Hc
CKaK+UoI0RxIEVRAWHbi0pWbATh/wSxeGR5lYGi4qUwwU8F8JYRoHqQIMhJnd7/pvqeb0gQzFcxXQojmQVFDGUnqShZHo00wSjATQlSCFEFGKhncm8EEozaVQoisyDSUkaTB3Uq2ZYIRQkw1pAgykmR3P3/BLLWVFEJMaWQayojs7ntQ1rIQrYUUQQXI7q62mEK0IjINiYpQ1rIQrYdWBC1Cvcw1yloWovXQiqAFqGeROWUtC9F6SBG0APU01yhrWYjWoyGKwMzeaWaPmdlPzWxpI2RoJepprplsP2ghRPNRdx+BmXUC/wS8A3gWeNDM7nD3n9RbllahXI/lWqPoKSFai0asCN4C/NTdH3f3XcAtwHsaIEfLIHONEGIyNEIR9ALPRLafDfaNw8wuMrMNZrZh+/btdRNuKiJzjRBiMjRt+Ki7XwdcBzB//vykQp8iQOYaIUS1NGJF0A8cEdk+PNgnhBCiATRCETwIHGNmR5nZXsC5wB0NkEMIIQQNMA25+24z+xtgLdAJfMXdt9ZbDiGEEEUa4iNw928B32rEvYUQQoxHmcVCCNHmSBEIIUSbI0UghBBtjhSBEEK0OVIEQgjR5kgRCCFEmyNFIIQQbY4UgRBCtDlSBEII0eY0bfXRRlGvJvBCCNEsSBFECJvAh/1/wybwgJSBEKJlkWkoQj2bwAshRLMgRRChnk3ghRCiWZAiiJDU7D2vJvBCCNEMSBFEUBN4IUQ7ImdxhNAhrKghIUQ7IUVQgprACyHaDZmGhBCizZEiEEKINkeKQAgh2hwpAiGEaHOkCIQQos0xd2+0DGUxs+3AU42WY5IcBLzQaCGaCD2PPehZjEfPYw+TfRZHuvvMcgdNCUXQCpjZBnef32g5mgU9jz3oWYxHz2MP9XoWMg0JIUSbI0UghBBtjhRB/biu0QI0GXoee9CzGI+exx7q8izkIxBCiDZHKwIhhGhzpAiEEKLNkSLIATP7ipk9b2Y/juw7wMzuNrP/CX7PaKSM9cLMjjCzdWb2EzPbamYfCfa36/OYbmYPmNmW4Hl8Oth/lJndb2Y/NbOVZrZXo2WtF2bWaWabzOybwXY7P4snzewRM9tsZhuCfbn/r0gR5MN/AO8s2bcU+J67HwN8L9huB3YDl7n764EFwF+b2etp3+fxKrDI3U8A5gLvNLMFwGeAa939aGAH8MEGylhvPgI8Gtlu52cBcLK7z43kD+T+vyJFkAPu/gPgxZLd7wFuCF7fAPTVVagG4e7PuftDweuXKP7D99K+z8Pd/eVgsxD8OLAIuC3Y3zbPw8wOB04FvhxsG236LFLI/X9FiqB+HOLuzwWvfwEc0khhGoGZzQbmAffTxs8jMIVsBp4H7gZ+Bgy4++7gkGcpKst24AvAx4HRYPtA2vdZQHFS8F0z22hmFwX7cv9fUYeyBuDubmZtFbdrZvsCq4FL3P03xYlfkXZ7Hu4+Asw1sx7g68CxDRapIZjZacDz7r7RzN7eaHmahN91934zOxi428y2Rd/M639FK4L68UszOxQg+P18g+WpG2ZWoKgEbnL324Pdbfs8Qtx9AFgHvBXoMbNwYnY40N8wwerHQuAMM3sSuIWiSeiLtOezAMDd+4Pfz1OcJLyFOvyvSBHUjzuAC4PXFwLfaKAsdSOw+V4PPOrun4+81a7PY2awEsDMuoB3UPSbrAPeGxzWFs/D3S9398PdfTZwLnCPu59PGz4LADPbx8z2C18DpwA/pg7/K8oszgEzuxl4O8USsr8ErgTWAKuAWRRLap/t7qUO5ZbDzH4X+G/gEfbYgT9J0U/Qjs/jjRQdfp0UJ2Kr3P3vzOy1FGfFBwCbgAvc/dXGSVpfAtPQx9z9tHZ9FsHn/nqwOQ34L3e/2swOJOf/FSkCIYRoc2QaEkKINkeKQAgh2hwpAiGEaHOkCIQQos2RIhBCiDZHikAIwMxGgoqPPzazW82sO+G4b4V5AEK0CgofFQIws5fdfd/g9U3AxmgCXJAYZ+4+
mnQNIaYqWhEIMZH/Bo42s9lm9piZfY1ihucRQb34gwDM7E/M7OGgt8B/BvtmmtlqM3sw+FkY7H9bsOLYHNTe369hn06IElR0TogIQY2bdwHfCXYdA1zo7vcF74fHHQdcAfyOu79gZgcEx3+RYi39H5rZLGAt8NvAx4C/dvf1QQG+V+r1mYQohxSBEEW6gtLQUFwRXA8cBjwVKoESFgG3uvsLAJGU/z8EXh+prvqaYOBfD3w+MDvd7u7P5vQ5hKgYKQIhigy5+9zojmAw31nhdTqABe5eOuNfbmZ3Ae8G1pvZYnffNvF0IeqPfARCVMc9wPuCgmBETEPfBf42PMjM5ga/X+fuj7j7Z4AHadMeBKI5kSIQogrcfStwNfB9M9sChBFGFwPzAyfyT4APB/svCUJTHwaGgW/XXWghElD4qBBCtDlaEQghRJsjRSCEEG2OFIEQQrQ5UgRCCNHmSBEIIUSbI0UghBBtjhSBEEK0Of8ftB+tAZaR+TgAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.scatter(labels, predicted_prices)\n", "plt.xlabel(\"Prices\")\n", "plt.ylabel(\"Predicted Prices\")\n", "plt.title(\"Prices versus Predicted Prices\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "21.831934375295628" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_error = \\\n", " (labels - predicted_prices).apply(lambda x: x ** 2).mean()\n", "\n", "training_error" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Citation: http://bigdata-madesimple.com/how-to-run-linear-regression-in-python-scikit-learn/" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:modin-dev]", "language": "python", "name": "conda-env-modin-dev-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/quickstart.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/img/MODIN_ver2_hrz.png?raw=True)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Getting Started" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To install the most recent stable release for Modin run the following code on your command line:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install \"modin[all]\" " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For further instructions on how to install Modin with conda or for specific platforms or engines, see our detailed [installation guide](https://modin.readthedocs.io/en/latest/getting_started/installation.html).\n", "\n", "Modin acts as a drop-in replacement for pandas so you can simply change a single line of import to speed up your pandas workflows. To use Modin, you simply have to replace the import of pandas with the import of Modin, as follows." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-01-07 07:29:30,173\tINFO services.py:1250 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] } ], "source": [ "#############################################\n", "### For the purpose of timing comparisons ###\n", "#############################################\n", "import time\n", "import ray\n", "# Look at the Ray documentation with respect to the Ray configuration suited to you most.\n", "ray.init()\n", "from IPython.display import Markdown, display\n", "def printmd(string):\n", " display(Markdown(string))" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: NYC taxi trip data\n", "\n", "Link to raw dataset: https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv (**Size: ~200MB**)" ] }, { "cell_type": "code", 
"execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('taxi.csv', )" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This may take a few minutes to download\n", "import urllib.request\n", "dataset_url = \"https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv\"\n", "urllib.request.urlretrieve(dataset_url, \"taxi.csv\") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Faster Data Loading with Modin's ``read_csv``" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to read with pandas: 2.744 seconds\n" ] } ], "source": [ "start = time.time()\n", "\n", "pandas_df = pandas.read_csv(\"taxi.csv\", parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time to read with Modin: 1.35 seconds\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: `read_*` implementation has mismatches with pandas:\n", "Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.\n" ] }, { "data": { "text/markdown": [ "## Modin is 2.03x faster than pandas at `read_csv`!" 
], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "start = time.time()\n", "\n", "modin_df = pd.read_csv(\"taxi.csv\", parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"## Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can quickly check that the result from pandas and Modin is exactly the same." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surcharge
01.02021-01-01 00:30:102021-01-01 00:36:121.02.101.0N142432.08.003.000.50.000.00.311.802.5
11.02021-01-01 00:51:202021-01-01 00:52:191.00.201.0N2381512.03.000.500.50.000.00.34.300.0
21.02021-01-01 00:43:302021-01-01 01:11:061.014.701.0N1321651.042.000.500.58.650.00.351.950.0
31.02021-01-01 00:15:482021-01-01 00:31:010.010.601.0N1381321.029.000.500.56.050.00.336.350.0
42.02021-01-01 00:31:492021-01-01 00:48:211.04.941.0N68331.016.500.500.54.060.00.324.362.5
.........................................................
1369760NaN2021-01-25 08:32:042021-01-25 08:49:32NaN8.80NaNNaN13582NaN21.842.750.50.000.00.325.390.0
1369761NaN2021-01-25 08:34:002021-01-25 09:04:00NaN5.86NaNNaN42161NaN26.672.750.50.000.00.330.220.0
1369762NaN2021-01-25 08:37:002021-01-25 08:53:00NaN4.45NaNNaN14106NaN25.292.750.50.000.00.328.840.0
1369763NaN2021-01-25 08:28:002021-01-25 08:50:00NaN10.04NaNNaN175216NaN28.242.750.50.000.00.331.790.0
1369764NaN2021-01-25 08:38:002021-01-25 08:50:00NaN4.93NaNNaN248168NaN20.762.750.50.000.00.324.310.0
\n", "

1369765 rows × 18 columns

\n", "
" ], "text/plain": [ " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", "0 1.0 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n", "1 1.0 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n", "2 1.0 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n", "3 1.0 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n", "4 2.0 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n", "... ... ... ... ... \n", "1369760 NaN 2021-01-25 08:32:04 2021-01-25 08:49:32 NaN \n", "1369761 NaN 2021-01-25 08:34:00 2021-01-25 09:04:00 NaN \n", "1369762 NaN 2021-01-25 08:37:00 2021-01-25 08:53:00 NaN \n", "1369763 NaN 2021-01-25 08:28:00 2021-01-25 08:50:00 NaN \n", "1369764 NaN 2021-01-25 08:38:00 2021-01-25 08:50:00 NaN \n", "\n", " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", "0 2.10 1.0 N 142 \n", "1 0.20 1.0 N 238 \n", "2 14.70 1.0 N 132 \n", "3 10.60 1.0 N 138 \n", "4 4.94 1.0 N 68 \n", "... ... ... ... ... \n", "1369760 8.80 NaN NaN 135 \n", "1369761 5.86 NaN NaN 42 \n", "1369762 4.45 NaN NaN 14 \n", "1369763 10.04 NaN NaN 175 \n", "1369764 4.93 NaN NaN 248 \n", "\n", " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", "0 43 2.0 8.00 3.00 0.5 0.00 \n", "1 151 2.0 3.00 0.50 0.5 0.00 \n", "2 165 1.0 42.00 0.50 0.5 8.65 \n", "3 132 1.0 29.00 0.50 0.5 6.05 \n", "4 33 1.0 16.50 0.50 0.5 4.06 \n", "... ... ... ... ... ... ... \n", "1369760 82 NaN 21.84 2.75 0.5 0.00 \n", "1369761 161 NaN 26.67 2.75 0.5 0.00 \n", "1369762 106 NaN 25.29 2.75 0.5 0.00 \n", "1369763 216 NaN 28.24 2.75 0.5 0.00 \n", "1369764 168 NaN 20.76 2.75 0.5 0.00 \n", "\n", " tolls_amount improvement_surcharge total_amount \\\n", "0 0.0 0.3 11.80 \n", "1 0.0 0.3 4.30 \n", "2 0.0 0.3 51.95 \n", "3 0.0 0.3 36.35 \n", "4 0.0 0.3 24.36 \n", "... ... ... ... 
\n", "1369760 0.0 0.3 25.39 \n", "1369761 0.0 0.3 30.22 \n", "1369762 0.0 0.3 28.84 \n", "1369763 0.0 0.3 31.79 \n", "1369764 0.0 0.3 24.31 \n", "\n", " congestion_surcharge \n", "0 2.5 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 2.5 \n", "... ... \n", "1369760 0.0 \n", "1369761 0.0 \n", "1369762 0.0 \n", "1369763 0.0 \n", "1369764 0.0 \n", "\n", "[1369765 rows x 18 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surcharge
01.02021-01-01 00:30:102021-01-01 00:36:121.02.101.0N142432.08.003.000.50.000.00.311.802.5
11.02021-01-01 00:51:202021-01-01 00:52:191.00.201.0N2381512.03.000.500.50.000.00.34.300.0
21.02021-01-01 00:43:302021-01-01 01:11:061.014.701.0N1321651.042.000.500.58.650.00.351.950.0
31.02021-01-01 00:15:482021-01-01 00:31:010.010.601.0N1381321.029.000.500.56.050.00.336.350.0
42.02021-01-01 00:31:492021-01-01 00:48:211.04.941.0N68331.016.500.500.54.060.00.324.362.5
.........................................................
1369760NaN2021-01-25 08:32:042021-01-25 08:49:32NaN8.80NaNNaN13582NaN21.842.750.50.000.00.325.390.0
1369761NaN2021-01-25 08:34:002021-01-25 09:04:00NaN5.86NaNNaN42161NaN26.672.750.50.000.00.330.220.0
1369762NaN2021-01-25 08:37:002021-01-25 08:53:00NaN4.45NaNNaN14106NaN25.292.750.50.000.00.328.840.0
1369763NaN2021-01-25 08:28:002021-01-25 08:50:00NaN10.04NaNNaN175216NaN28.242.750.50.000.00.331.790.0
1369764NaN2021-01-25 08:38:002021-01-25 08:50:00NaN4.93NaNNaN248168NaN20.762.750.50.000.00.324.310.0
\n", "

1369765 rows x 18 columns

\n", "
" ], "text/plain": [ " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", "0 1.0 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n", "1 1.0 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n", "2 1.0 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n", "3 1.0 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n", "4 2.0 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n", "... ... ... ... ... \n", "1369760 NaN 2021-01-25 08:32:04 2021-01-25 08:49:32 NaN \n", "1369761 NaN 2021-01-25 08:34:00 2021-01-25 09:04:00 NaN \n", "1369762 NaN 2021-01-25 08:37:00 2021-01-25 08:53:00 NaN \n", "1369763 NaN 2021-01-25 08:28:00 2021-01-25 08:50:00 NaN \n", "1369764 NaN 2021-01-25 08:38:00 2021-01-25 08:50:00 NaN \n", "\n", " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", "0 2.10 1.0 N 142 \n", "1 0.20 1.0 N 238 \n", "2 14.70 1.0 N 132 \n", "3 10.60 1.0 N 138 \n", "4 4.94 1.0 N 68 \n", "... ... ... ... ... \n", "1369760 8.80 NaN NaN 135 \n", "1369761 5.86 NaN NaN 42 \n", "1369762 4.45 NaN NaN 14 \n", "1369763 10.04 NaN NaN 175 \n", "1369764 4.93 NaN NaN 248 \n", "\n", " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", "0 43 2.0 8.00 3.00 0.5 0.00 \n", "1 151 2.0 3.00 0.50 0.5 0.00 \n", "2 165 1.0 42.00 0.50 0.5 8.65 \n", "3 132 1.0 29.00 0.50 0.5 6.05 \n", "4 33 1.0 16.50 0.50 0.5 4.06 \n", "... ... ... ... ... ... ... \n", "1369760 82 NaN 21.84 2.75 0.5 0.00 \n", "1369761 161 NaN 26.67 2.75 0.5 0.00 \n", "1369762 106 NaN 25.29 2.75 0.5 0.00 \n", "1369763 216 NaN 28.24 2.75 0.5 0.00 \n", "1369764 168 NaN 20.76 2.75 0.5 0.00 \n", "\n", " tolls_amount improvement_surcharge total_amount \\\n", "0 0.0 0.3 11.80 \n", "1 0.0 0.3 4.30 \n", "2 0.0 0.3 51.95 \n", "3 0.0 0.3 36.35 \n", "4 0.0 0.3 24.36 \n", "... ... ... ... 
\n", "1369760 0.0 0.3 25.39 \n", "1369761 0.0 0.3 30.22 \n", "1369762 0.0 0.3 28.84 \n", "1369763 0.0 0.3 31.79 \n", "1369764 0.0 0.3 24.31 \n", "\n", " congestion_surcharge \n", "0 2.5 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 2.5 \n", "... ... \n", "1369760 0.0 \n", "1369761 0.0 \n", "1369762 0.0 \n", "1369763 0.0 \n", "1369764 0.0 \n", "\n", "[1369765 rows x 18 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Faster Append with Modin's ``concat``" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our previous ``read_csv`` example operated on a relatively small dataframe. In the following example, we duplicate the same taxi dataset 100 times and then concatenate them together.\n", "\n", "Please note that this quickstart notebook is assumed to be run on a machine that has enough memory in order to be able to perform the operations both with pandas and Modin in a single pipeline (which at least doubles the amount of required memory). If your machine doesn't have enough resources to execute every cell of the notebook and you see an OOM issue, you most likely need to reduce ``N_copies`` in the cell below." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time to concat with pandas: 34.144 seconds\n" ] } ], "source": [ "N_copies= 100\n", "start = time.time()\n", "\n", "big_pandas_df = pandas.concat([pandas_df for _ in range(N_copies)])\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to concat with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time to concat with Modin: 0.564 seconds\n" ] }, { "data": { "text/markdown": [ "### Modin is 60.57x faster than pandas at `concat`!" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "start = time.time()\n", "\n", "big_modin_df = pd.concat([modin_df for _ in range(N_copies)])\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to concat with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `concat`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result dataset is around 19GB in size." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2m\u001b[36m(apply_list_of_funcs pid=73415)\u001b[0m \n", "\u001b[2m\u001b[36m(apply_list_of_funcs pid=73416)\u001b[0m \n", "\n", "Int64Index: 136976500 entries, 0 to 1369764\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- --------------------- ------------------ ----- \n", " 0 VendorID 127141300 non-null float64\n", " 1 tpep_pickup_datetime 136976500 non-null datetime64[ns]\n", " 2 tpep_dropoff_datetime 136976500 non-null datetime64[ns]\n", " 3 passenger_count 127141300 non-null float64\n", " 4 trip_distance 136976500 non-null float64\n", " 5 RatecodeID 127141300 non-null float64\n", " 6 store_and_fwd_flag 127141300 non-null object\n", " 7 PULocationID 136976500 non-null int64\n", " 8 DOLocationID 136976500 non-null int64\n", " 9 payment_type 127141300 non-null float64\n", " 10 fare_amount 136976500 non-null float64\n", " 11 extra 136976500 non-null float64\n", " 12 mta_tax 136976500 non-null float64\n", " 13 tip_amount 136976500 non-null float64\n", " 14 tolls_amount 136976500 non-null float64\n", " 15 improvement_surcharge 136976500 non-null float64\n", " 16 total_amount 136976500 non-null float64\n", " 17 congestion_surcharge 136976500 non-null float64\n", "dtypes: float64(13), datetime64[ns](2), int64(2), object(1)\n", "memory usage: 19.4 GB\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: Distributing object. This may take some time.\n" ] } ], "source": [ "big_modin_df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Faster ``apply`` over a single column\n", "\n", "The performance benefits of Modin become apparent when we operate on large gigabyte-scale datasets. For example, let's say that we want to round the values in a single column via the ``apply`` operation. 
" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time to apply with pandas: 43.969 seconds\n" ] } ], "source": [ "start = time.time()\n", "rounded_trip_distance_pandas = big_pandas_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to apply with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time to apply with Modin: 1.225 seconds\n" ] }, { "data": { "text/markdown": [ "### Modin is 35.88x faster than pandas at `apply` on one column!" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "start = time.time()\n", "\n", "rounded_trip_distance_modin = big_modin_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to apply with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `apply` on one column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Summary\n", "\n", "Hopefully, this tutorial demonstrated how Modin delivers significant speedups on pandas operations without the need for any extra effort. Throughout this example, we moved from working with 100MBs of data to 20GBs of data, all without having to change anything or manually optimize our code to achieve the level of scalable performance that Modin provides.\n", "\n", "Note that in this quickstart example, we've only shown ``read_csv``, ``concat``, and ``apply``, but these are not the only pandas operations that Modin optimizes for. 
In fact, Modin covers [more than 90% of the pandas API](https://github.com/modin-project/modin/blob/main/README.md#pandas-api-coverage), yielding considerable speedups for many common operations." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/spreadsheet/requirements.txt ================================================ ray==1.1.0 git+https://github.com/modin-project/modin git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 ================================================ FILE: examples/spreadsheet/tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../tutorial/tutorial_notebooks/img/MODIN_ver2_hrz.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## modin.spreadsheet\n", "`modin.spreadsheet` is a Jupyter notebook widget that allows users to interact with Modin DataFrames in a spreadsheet-like fashion while taking advantage of the underlying capabilities of Modin. The widget makes it quick and easy to explore, sort, filter, edit data and export reproducible code. \n", "\n", "This tutorial will showcase how to use `modin.spreadsheet`. Before starting, please install the required packages using `pip install -r requirements.txt` in the current directory. Then just run the cells; no editing required!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Please install the required packages using `pip install -r requirements.txt` in the current directory\n", "# For all ways to install Modin see the official documentation at:\n", "# https://modin.readthedocs.io/en/latest/getting_started/installation.html\n", "import modin.pandas as pd\n", "import modin.spreadsheet as mss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a Modin DataFrame\n", "The following cells create a DataFrame using a NYC taxi dataset." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "columns_names = [\n", " \"trip_id\", \"vendor_id\", \"pickup_datetime\", \"dropoff_datetime\", \"store_and_fwd_flag\",\n", " \"rate_code_id\", \"pickup_longitude\", \"pickup_latitude\", \"dropoff_longitude\", \"dropoff_latitude\",\n", " \"passenger_count\", \"trip_distance\", \"fare_amount\", \"extra\", \"mta_tax\", \"tip_amount\",\n", " \"tolls_amount\", \"ehail_fee\", \"improvement_surcharge\", \"total_amount\", \"payment_type\",\n", " \"trip_type\", \"pickup\", \"dropoff\", \"cab_type\", \"precipitation\", \"snow_depth\", \"snowfall\",\n", " \"max_temperature\", \"min_temperature\", \"average_wind_speed\", \"pickup_nyct2010_gid\",\n", " \"pickup_ctlabel\", \"pickup_borocode\", \"pickup_boroname\", \"pickup_ct2010\",\n", " \"pickup_boroct2010\", \"pickup_cdeligibil\", \"pickup_ntacode\", \"pickup_ntaname\", \"pickup_puma\",\n", " \"dropoff_nyct2010_gid\", \"dropoff_ctlabel\", \"dropoff_borocode\", \"dropoff_boroname\",\n", " \"dropoff_ct2010\", \"dropoff_boroct2010\", \"dropoff_cdeligibil\", \"dropoff_ntacode\",\n", " \"dropoff_ntaname\", \"dropoff_puma\",\n", " ]\n", "parse_dates=[\"pickup_datetime\", \"dropoff_datetime\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('s3://modin-datasets/trips_data.csv', names=columns_names,\n", " header=None, 
parse_dates=parse_dates)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Generate a spreadsheet widget with the DataFrame\n", "`mss.from_dataframe` takes in a DataFrame, optional configuration options, and returns a `SpreadsheetWidget`, which contains all the logic for displaying the spreadsheet view of the DataFrame. The object returned will not be rendered unless displayed." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spreadsheet = mss.from_dataframe(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Displaying the Spreadsheet\n", "The widget is displayed when the widget is returned by an input cell or passed to the `display` function e.g. `display(spreadsheet)`. When displayed, the SpreadsheetWidget will generate a transformation history cell that contains a record of the transformations applied to the DataFrame unless the cell already exists or the feature is disabled.\n", "\n", "### Basic Usage\n", "`from_dataframe` creates a copy of the input DataFrame, so changes do not alter the original DataFrame.\n", "\n", "**Filter** - Each column can be filtered according to its datatype using the filter button to the right of the column header. Any number of columns can be filtered simultaneously.\\\n", "**Sort** - Each column can be sorted by clicking on the column header. Assumptions on the order of the data should only be made according to the latest sort i.e. 
the 2nd last sort may not be in order even if grouped by the duplicates in the last sorted column.\\\n", "**Cell Edit** - Double click on a cell to edit its value.\\\n", "**Add Row**(toolbar) - Click on the `Add Row` button in the toolbar to duplicate the last row in the DataFrame.\\\n", "**Remove Row**(toolbar) - Select row(s) on the spreadsheet and click the `Remove Row` button in the toolbar to remove them.\\\n", "**Reset Filters**(toolbar) - Click on the `Reset Filters` button in the toolbar to remove all filters on the data.\\\n", "**Reset Sort**(toolbar) - Click on the `Reset Sort` button in the toolbar to remove any sorting on the data.\n", "\n", "### Transformation History and Reproducible Code\n", "The widget records the history of transformations, such as filtering, that occur on the spreadsheet. These transformations are updated in the `spreadsheet transformation history` cell as they happen and can be easily copied for reproducibility. The history can be cleared using the `Clear History` button in the toolbar.\n", "\n", "**Try making some changes to the spreadsheet!**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "spreadsheet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exporting Changes\n", "`to_dataframe` takes in a `SpreadsheetWidget` and returns a copy of the DataFrame reflecting the current state of the UI on the widget. 
Specifically, any filters, edits, or sorts will be applied to the returned DataFrame.\n", "\n", "**Export a DataFrame after making some changes on the spreadsheet UI**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "changed_df = mss.to_dataframe(spreadsheet)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "changed_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SpreadsheetWidget API\n", "The API on `SpreadsheetWidget` allows users to replicate some of the GUI functionality, and it also provides other functionality, such as applying the transformation history to another DataFrame or getting the DataFrame that matches the spreadsheet state (like `to_dataframe`)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Duplicates the `Reset Filters` button\n", "spreadsheet.reset_filters()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Duplicates the `Reset Sort` button\n", "spreadsheet.reset_sort()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "qgrid6f69f373-ae0e-423e-8e26-429f52e1669d": true }, "outputs": [], "source": [ "# Duplicates the `Clear History` button\n", "spreadsheet.clear_history()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gets the modified DataFrame that matches the changes to the spreadsheet\n", "# This is the same functionality as `mss.to_dataframe`\n", "spreadsheet.get_changed_df()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retrieving and Applying Transformation History \n", "The transformation history can be retrieved as a list of code snippets using the `get_history` API. The `apply_history` API will apply the transformations to the input DataFrame and return the resultant DataFrame."
] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "spreadsheet.get_history()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "another_df = df.copy()\n", "spreadsheet.apply_history(another_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Additional Example\n", "Here is another example of how to use `from_dataframe` with configuration options." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mss.from_dataframe(df, show_toolbar=False, grid_options={'forceFitColumns': False, 'editable': False, 'highlightSelectedCell': True})" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: examples/tutorial/README.md ================================================ # Modin tutorial Jupyter Notebooks Tutorial for how to use different features of Modin. 
================================================ FILE: examples/tutorial/jupyter/README.md ================================================ # Jupyter notebook examples to run with Modin Currently we provide tutorial notebooks for the following execution backends: - [PandasOnRay](https://modin.readthedocs.io/en/latest/development/using_pandas_on_ray.html) - [PandasOnDask](https://modin.readthedocs.io/en/latest/development/using_pandas_on_dask.html) - [PandasOnMPI through unidist](https://modin.readthedocs.io/en/latest/development/using_pandas_on_mpi.html) ## Creating a development environment To get the required dependencies for the `PandasOnRay`, `PandasOnDask` and `PandasOnUnidist` Jupyter Notebooks, you should create a development environment with `pip` using the `requirements.txt` file located in the respective directory: ```bash pip install -r execution/pandas_on_ray/requirements.txt ``` to install dependencies needed to run notebooks with Modin on `PandasOnRay` execution or ```bash pip install -r execution/pandas_on_dask/requirements.txt ``` to install dependencies needed to run notebooks with Modin on `PandasOnDask` execution or ```bash pip install -r execution/pandas_on_unidist/requirements.txt ``` to install dependencies needed to run notebooks with Modin on `PandasOnUnidist` execution. **Note:** Sometimes pip is installing every version of a package. If you encounter that issue, please install each package listed in the `requirements.txt` file individually with `pip install <package>`. ## Run Jupyter Notebooks A Jupyter Notebook server can be run from the current directory as follows: ```bash jupyter notebook ``` Navigate to a concrete notebook (for example, to the `execution/pandas_on_ray/local/exercise_1.ipynb`).
**Note:** Since there are some specifics regarding the run of jupyter notebooks with the `Unidist` engine, refer to [PandasOnUnidist](https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/execution/pandas_on_unidist/README.md) document to get more information on the matter. ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/Dockerfile ================================================ FROM continuumio/miniconda3 RUN conda install -c conda-forge psutil setproctitle RUN pip install -r requirements-dev.txt ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/cluster/exercise_5.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 5: Setting up cluster environment\n", "\n", "**GOAL**: Learn how to set up a Dask cluster for Modin, connect Modin to a Dask cluster and run pandas queries on a cluster.\n", "\n", "**NOTE**: This exercise has extra requirements. Read instructions carefully before attempting. \n", "\n", "**This exercise instructs users on how to start a 500+ core Dask cluster, and it is not shut down until the end of exercise.**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Often in practice we have a need to exceed the capabilities of a single machine. Modin works and performs well \n", "in both local mode and in a cluster environment. The key advantage of Modin is that your python code does not \n", "change between local development and cluster execution. Users are not required to think about how many workers \n", "exist or how to distribute and partition their data; Modin handles all of this seamlessly and transparently.\n", "\n", "![Cluster](../../../img/modin_cluster.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Extra requirements for AWS authentication\n", "\n", "First of all, install the necessary dependencies in your environment:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install dask_cloudprovider[aws]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next step is to setup your AWS credentials, namely, set ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``\n", "and ``AWS_SESSION_TOKEN`` (Optional) (refer to [AWS CLI environment variables](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html) to get more insight on this):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ[\"AWS_ACCESS_KEY_ID\"] = \"\"\n", "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"\"\n", "os.environ[\"AWS_SESSION_TOKEN\"] = 
\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Starting and connecting to the cluster\n", "\n", "This example starts 1 scheduler node (m5.24xlarge) and 6 worker nodes (m5.24xlarge), 576 total CPUs. Keep in mind the scheduler node manages cluster operation but doesn't perform any execution.\n", "\n", "You can check the [Amazon EC2 pricing](https://aws.amazon.com/ec2/pricing/on-demand/) page.\n", "\n", "Dask cluster can be deployed in different ways (refer to [Dask documentaion](https://docs.dask.org/en/latest/deploying.html) to get more information about it), but in this tutorial we will use the ``EC2Cluster`` from [dask_cloudprovider](https://cloudprovider.dask.org/en/latest/) to create and initialize a Dask cluster on Amazon Web Service (AWS).\n", "\n", "**Note**: EC2Cluster uses a docker container to run the scheduler and each of the workers. Probably you need to use another docker image depending on your python version and requirements. You can find more docker-images on [daskdev](https://hub.docker.com/u/daskdev) page.\n", "\n", "In the next cell you can see how the EC2Cluster is being created. Set your ``key_name`` and modify AWS settings as required before running it." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from dask_cloudprovider.aws import EC2Cluster\n", "\n", "n_workers = 6\n", "cluster = EC2Cluster(\n", " # AWS parameters\n", " key_name = \"\", # set your keyname\n", " region = \"us-west-2\",\n", " availability_zone = [\"us-west-2a\"],\n", " ami = \"ami-0387d929287ab193e\",\n", " instance_type = \"m5.24xlarge\",\n", " vpc = \"vpc-002bd14c63f227832\",\n", " subnet_id = \"subnet-09860dafd79720938\",\n", " filesystem_size = 200, # in GB\n", "\n", " # DASK parameters\n", " n_workers = n_workers,\n", " docker_image = \"daskdev/dask:latest\",\n", " debug = True,\n", " security=False,\n", ")\n", "\n", "scheduler_adress = cluster.scheduler_address\n", "print(f\"Scheduler IP address of Dask cluster: {scheduler_adress}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After creating the cluster you need to connect to it. To do this you should put the ``EC2Cluster`` instance or the scheduler IP address in ``distributed.Client``.\n", "\n", "When you connect to the cluster, the workers may not be initialized yet, so you need to wait for them using ``client.wait_for_workers``.\n", "\n", "Then you can call ``client.ncores()`` and check which workers are available and how many threads are used for each of them." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from distributed import Client\n", "\n", "client = Client(cluster)\n", "# Or use an IP address connection if the cluster instance is unavailable:\n", "# client = Client(f\"{scheduler_adress}:8687\")\n", "\n", "client.wait_for_workers(n_workers)\n", "client.ncores()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After successful initialization of the cluster, you need to configure it.\n", "\n", "You can use plugins to install any requirements into workers:\n", "* [InstallPlugin](https://distributed.dask.org/en/stable/plugins.html#distributed.diagnostics.plugin.InstallPlugin)\n", "* [PipInstall](https://distributed.dask.org/en/stable/plugins.html#distributed.diagnostics.plugin.PipInstall)\n", "* [CondaInstall](https://distributed.dask.org/en/stable/plugins.html#distributed.diagnostics.plugin.CondaInstall).\n", "\n", "You have to install Modin package on each worker using ``PipInstall`` plugin." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from dask.distributed import PipInstall\n", "\n", "client.register_plugin(PipInstall(packages=[\"modin\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you need an additional workers configuration, you can create your own [WorkerPlugin](https://distributed.dask.org/en/stable/plugins.html#worker-plugins) or function that will be executed on each worker upon calling ``client.run()``.\n", "\n", "**NOTE**: Dask cluster does not check if this plugin or function has been called before. Therefore, you need to take this into account when using them.\n", "\n", "In this tutorial a CSV file will be read, so you need to download it to each of the workers and local machine with the same global path." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from dask.distributed import Worker\n", "import os\n", "import urllib\n", "\n", "def dataset_upload(file_url, file_path):\n", " try:\n", " dir_name = os.path.dirname(file_path)\n", " if not os.path.exists(dir_name):\n", " os.makedirs(dir_name)\n", " if os.path.exists(file_path):\n", " return \"File has already existed.\"\n", " else:\n", " urllib.request.urlretrieve(file_url, file_path)\n", " return \"OK\"\n", " except Exception as ex:\n", " return str(ex)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the directory where it should be downloaded (the local directory will be used by default):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "directory_path = \"./\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then you need to run the `dataset_upload` function on all workers. As a result, you will get a dictionary containing the result of the function execution for each worker:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_path = os.path.join(os.path.abspath(directory_path), \"taxi.csv\")\n", "client.run(dataset_upload, \"https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv\", file_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You also have to execute this function on the local machine:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_upload(\"https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv\", file_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Congratulations! The cluster is now fully configured and we can start running pandas queries."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Executing in a cluster environment\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As in local mode, Modin on a cluster uses Ray as the execution engine by default, so no additional action is required to start using it. Alternatively, if you need to use another engine, it should be specified either by setting the Modin config or by setting the Modin environment variable before the first operation with Modin, as shown below. Also, note that the full list of Modin configs and corresponding environment variables can be found in the [Modin Configuration Settings](https://modin.readthedocs.io/en/stable/flow/modin/config.html#modin-configs-list) section of the Modin documentation." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Modin engine can be specified either by config\n", "import modin.config as cfg\n", "cfg.Engine.put(\"dask\")\n", "\n", "# or by setting the environment variable\n", "# import os\n", "# os.environ[\"MODIN_ENGINE\"] = \"dask\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now you can use Modin on the Dask cluster.\n", "\n", "Let's read the downloaded CSV file and execute such pandas operations as count, groupby and map:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import time\n", "\n", "t0 = time.perf_counter()\n", "\n", "df = pd.read_csv(file_path, quoting=3)\n", "df_count = df.count()\n", "df_groupby_count = df.groupby(\"passenger_count\").count()\n", "df_map = df.map(str)\n", "\n", "t1 = time.perf_counter()\n", "print(f\"Full script time is {(t1 - t0):.3f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Shutting down the cluster\n", "\n", "Now that we have finished computation, we can shut down the cluster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source":
[ "cluster.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### This ends the cluster exercise" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/local/exercise_1.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 1: How to use Modin\n", "\n", "**GOAL**: Learn how to import Modin to accelerate and scale pandas workflows." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modin is a drop-in replacement for pandas that distributes the computation \n", "across all of the cores in your machine or in a cluster.\n", "In practical terms, this means that you can continue using the same pandas scripts\n", "as before and expect the behavior and results to be the same. The only thing that needs\n", "to change is the import statement. Normally, you would change:\n", "\n", "```python\n", "import pandas as pd\n", "```\n", "\n", "to:\n", "\n", "```python\n", "import modin.pandas as pd\n", "```\n", "\n", "Changing this line of code will allow you to use all of the cores in your machine to do computation on your data. One of the major performance bottlenecks of pandas is that it only uses a single core for any given computation. Modin exposes an API that is identical to pandas, allowing you to continue interacting with your data as you would with pandas. There are no additional commands required to use Modin locally. Partitioning, scheduling, data transfer, and other related concerns are all handled by Modin under the hood." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "

pandas on a multicore laptop\n", " \n", " Modin on a multicore laptop\n", " \n", "\n", "
\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for exercise: setting Modin engine\n", "\n", "Modin uses Ray as an execution engine by default so no additional action is required to start to use it. Alternatively, if you need to use another engine, it should be specified either by setting the Modin config or by setting Modin environment variable before the first operation with Modin as it is shown below. Also, note that the full list of Modin configs and corresponding environment variables can be found in the [Modin Configuration Settings](https://modin.readthedocs.io/en/stable/flow/modin/config.html#modin-configs-list) section of the Modin documentation." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Modin engine can be specified either by config\n", "import modin.config as cfg\n", "cfg.Engine.put(\"dask\")\n", "\n", "# or by setting the environment variable\n", "# import os\n", "# os.environ[\"MODIN_ENGINE\"] = \"dask\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for exercise: Dataframe constructor\n", "\n", "Often when playing around in pandas, it is useful to create a DataFrame with the constructor. That is where we will start.\n", "\n", "```python\n", "import numpy as np\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "```\n", "\n", "When creating a dataframe from a non-distributed object, it will take extra time to partition the data. When this is happening, you will see this message:\n", "\n", "```\n", "UserWarning: Distributing object. 
This may take some time.\n", "```\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Note: Do not change this code!\n", "import numpy as np\n", "import pandas\n", "import sys\n", "import modin" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Implement your answer here. You are also free to play with the size\n", "# and shape of the DataFrame, but beware of exceeding your memory!\n", "\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "\n", "# ***** Do not change the code below! It verifies that \n", "# ***** the exercise has been done correctly. *****\n", "\n", "try:\n", " assert df is not None\n", " assert frame_data is not None\n", " assert isinstance(frame_data, np.ndarray)\n", "except:\n", " raise AssertionError(\"Don't change too much of the original code!\")\n", "assert \"modin.pandas\" in sys.modules, \"Not quite correct. Remember the single line of code change (See above)\"\n", "\n", "import modin.pandas\n", "assert pd == modin.pandas, \"Remember the single line of code change (See above)\"\n", "assert hasattr(df, \"_query_compiler\"), \"Make sure that `df` is a modin.pandas DataFrame.\"\n", "\n", "print(\"Success! You only need to change one line of code!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have created a toy example for playing around with the DataFrame, let's print it out in different ways." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for Exercise: Data Interaction and Printing\n", "\n", "When interacting with data, it is very important to look at different parts of the data (e.g. `df.head()`). Here we will show that you can print the modin.pandas DataFrame in the same ways you would with pandas." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the first 10 lines.\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the DataFrame.\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Free cell for custom interaction (Play around here!)\n", "df.add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 2](./exercise_2.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/local/exercise_2.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 2: Speed improvements\n", "\n", "**GOAL**: Learn about common functionality that Modin speeds up by using all of your machine's cores." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for Exercise: `read_csv` speedups\n", "\n", "The most commonly used data ingestion method used in pandas is CSV files (link to pandas survey). This concept is designed to give an idea of the kinds of speedups possible, even on a non-distributed filesystem. Modin also supports other file formats for parallel and distributed reads, which can be found in the documentation. We will import both Modin and pandas so that the speedups are evident.\n", "\n", "**Note: Rerunning the `read_csv` cells many times may result in degraded performance, depending on the memory of the machine**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import time\n", "from IPython.display import Markdown, display\n", "\n", "def printmd(string):\n", " display(Markdown(string))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: 2015 NYC taxi trip data\n", "\n", "\n", "We will be using a version of this data already in S3, originally posted in this blog post: https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes\n", "\n", "**Size: ~1.8GB**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modin execution engine setting:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.config as cfg\n", "cfg.Engine.put(\"dask\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_df = pandas.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Expect pandas to take >3 minutes on EC2, longer locally\n", "\n", "This is a good time to chat with your neighbor\n", "Dicussion topics\n", "- Do you work with a large amount of data daily?\n", "- How big is your data?\n", "- What’s the common use case of your data?\n", "- Do you use any big data analytics tools?\n", "- Do you use any interactive analytics tool?\n", "- What’s are some drawbacks of your current interative analytic tools today?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `modin.pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df = pd.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Reduces\n", "\n", "In pandas, a reduce would be something along the lines of a `sum` or `count`. It computes some summary statistics about the rows or columns. 
We will be using `count`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_count = pandas_df.count()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to count with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_count = modin_df.count()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to count with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `count`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Map operations\n", "\n", "In pandas, map operations are operations that do a single pass over the data and do not change its shape. Operations like `isnull` and `applymap` are included in this. We will be using `isnull`." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_isnull = pandas_df.isnull()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to isnull with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_isnull = modin_df.isnull()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to isnull with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `isnull`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_isnull" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_isnull" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Apply over a single column\n", "\n", "Sometimes we want to compute some summary statistics on a single column from our dataset." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "rounded_trip_distance_pandas = pandas_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "rounded_trip_distance_modin = modin_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `apply` on one column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_modin" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Add a column\n", "\n", "It is common to need to add a new column to an existing dataframe, here we show that this is significantly faster in Modin due to metadata management and an efficient zero copy implementation." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "pandas_df[\"rounded_trip_distance\"] = rounded_trip_distance_pandas\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df[\"rounded_trip_distance\"] = rounded_trip_distance_modin\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas add a column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 3](./exercise_3.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/local/exercise_3.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 3: Not Implemented\n", "\n", "**GOAL**: Learn what happens when a function is not yet supported in Modin as well as how to extend Modin's functionality using the DataFrame Algebra." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When functionality has not yet been implemented, we default to pandas\n", "\n", "![](../../../img/convert_to_pandas.png)\n", "\n", "We convert a Modin dataframe to pandas to do the operation, then convert it back once it is finished. These operations will have a high overhead due to the communication involved and will take longer than pandas.\n", "\n", "When this is happening, a warning will be given to the user to inform them that this operation will take longer than usual. For example, `DataFrame.mask` is not yet implemented. In this case, when a user tries to use it, they will see this warning:\n", "\n", "```\n", "UserWarning: `DataFrame.mask` defaulting to pandas implementation.\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Default to pandas\n", "\n", "In this section of the exercise we will see first-hand how the runtime is affected by operations that are not implemented." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import numpy as np\n", "import time\n", "import modin.config as cfg\n", "cfg.Engine.put(\"dask\")\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n", "df = pd.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df = pandas.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_start = time.time()\n", "\n", "print(df.mask(df < 50))\n", "\n", "modin_end = time.time()\n", "print(\"Modin mask took {} seconds.\".format(round(modin_end - modin_start, 4)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_start = time.time()\n", "\n", "print(pandas_df.mask(pandas_df < 50))\n", "\n", "pandas_end = time.time()\n", "print(\"pandas mask took {} seconds.\".format(round(pandas_end - pandas_start, 4)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Register custom functions\n", "\n", "Modin's user-facing API is pandas, but it is possible that we do not yet support your favorite or most-needed functionalities. Your user-defined function may also be able to be executed more efficiently if you pre-define the type of function it is (e.g. map, reduce, etc.) using the DataFrame Algebra. To solve either case, it is possible to register a custom function to be applied to your data." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Registering a custom function for all query compilers\n", "\n", "To register a custom function for a query compiler, we first need to import it:\n", "\n", "```python\n", "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "```\n", "\n", "The `PandasQueryCompiler` is responsible for defining and compiling the queries that can be operated on by Modin, and is specific to the pandas storage format. Any queries defined here must also both be compatible with and result in a `pandas.DataFrame`. Many functionalities are very simply implemented, as you can see in the current code: [Link](https://github.com/modin-project/modin/blob/7a8158873e77cb5f1a5a3b89be4ddac89f576269/modin/core/storage_formats/pandas/query_compiler.py#L216).\n", "\n", "If we want to register a new function, we need to understand what kind of function it is. In our example, we will try to implement a `kurtosis` on the unary negation of the values in the dataframe, which is a map (unary negation of each cell) followed by a reduce.
So we next want to import the function type so we can use it in our definition:\n", "\n", "```python\n", "from modin.core.dataframe.algebra import TreeReduce\n", "```\n", "\n", "Then we can just use the `TreeReduce.register` `classmethod` and assign it to the `PandasQueryCompiler`:\n", "\n", "```python\n", "PandasQueryCompiler.neg_kurtosis = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value, pandas.DataFrame.kurtosis)\n", "```\n", "\n", "We include `**kwargs` to the `lambda` function since the query compiler will pass all keyword arguments to both the map and reduce functions.\n", "\n", "Finally, we want a handle to it from the `DataFrame`, so we need to create a way to do that:\n", "\n", "```python\n", "def neg_kurtosis_func(self, **kwargs):\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " return self.__constructor__(query_compiler=self._query_compiler.neg_kurtosis(**kwargs))\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func\n", "```\n", "\n", "And then you can use it like you usually would:\n", "\n", "```python\n", "df.neg_kurtosis_custom()\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "from modin.core.dataframe.algebra import TreeReduce" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PandasQueryCompiler.neg_kurtosis_custom = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value,\n", " pandas.DataFrame.kurtosis)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pandas._libs import lib\n", "# The function signature came from the pandas documentation:\n", "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurtosis.html\n", "def neg_kurtosis_func(self, axis=lib.no_default, skipna=True, level=None, numeric_only=None, 
**kwargs):\n", " # We need to specify the axis for the query compiler\n", " if axis in [None, lib.no_default]:\n", " axis = 0\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " # Reduce dimension is used for reduces\n", " # We also pass all keyword arguments here to ensure correctness\n", " return self._reduce_dimension(\n", " self._query_compiler.neg_kurtosis_custom(\n", " axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs\n", " )\n", " )\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Speed improvements\n", "If we were to try and replicate this functionality using the pandas API, we would need to call `df.applymap` with our unary negation function, and subsequently `df.kurtosis` on the result of the first call. Let's see how this compares with our new, custom function!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(pandas_df.applymap(lambda cell_value: ~cell_value).kurtosis())\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"pandas unary negation kurtosis took {} seconds.\".format(pandas_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(df.applymap(lambda x: ~x).kurtosis())\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Modin unary negation kurtosis took {} seconds.\".format(modin_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "custom_start = time.time()\n", "\n", "print(df.neg_kurtosis_custom())\n", "\n", "custom_end = time.time()\n", "modin_custom_duration = custom_end - custom_start\n", "print(\"Modin neg_kurtosis_custom took {} seconds.\".format(modin_custom_duration))" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown, display\n", "\n", "display(Markdown(\"### As expected, Modin is {}x faster than pandas when chaining the functions; however we see that our custom function is even faster than that - beating pandas by {}x, and Modin (when chaining the functions) by {}x!\".format(round(pandas_duration / modin_duration, 2), round(pandas_duration / modin_custom_duration, 2), round(modin_duration / modin_custom_duration, 2))))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Congratulations! You have just implemented new DataFrame functionality!\n", "\n", "## Consider opening a pull request: https://github.com/modin-project/modin/pulls\n", "\n", "For a complete list of what is implemented, see the [Supported APIs](https://modin.readthedocs.io/en/latest/supported_apis/index.html) section." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test your knowledge: Add a custom function for another tree reduce: finding `DataFrame.mad` after squaring all of the values\n", "\n", "See the pandas documentation for the correct signature: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mad.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_mad_custom_start = time.time()\n", "\n", "# Implement your function here! 
Put the result of your custom squared `mad` in the variable `modin_mad_custom`\n", "# Hint: Look at the kurtosis walkthrough above\n", "\n", "modin_mad_custom = ...\n", "print(modin_mad_custom)\n", "\n", "modin_mad_custom_end = time.time()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Evaluation code, do not change!\n", "modin_mad_start = time.time()\n", "modin_mad = df.applymap(lambda x: x**2).mad()\n", "print(modin_mad)\n", "modin_mad_end = time.time()\n", "\n", "assert modin_mad_end - modin_mad_start > modin_mad_custom_end - modin_mad_custom_start, \\\n", " \"Your implementation was too slow, or you used the chaining functions approach. Try again\"\n", "assert modin_mad._to_pandas().equals(modin_mad_custom._to_pandas()), \"Your result did not match the result of chaining the functions, try again\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now that you are able to create custom functions, you know enough to contribute to Modin!\n", "\n", "**Please move on to [Exercise 4](./exercise_4.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/local/exercise_4.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "99f41d2d", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n", "\n" ] }, { "cell_type": "markdown", "id": "fdda1c9c", "metadata": {}, "source": [ "# Exercise 4: Experimental Features\n", "\n", "**GOAL**: Explore some of the experimental features being added to Modin." ] }, { "cell_type": "markdown", "id": "e7bf87a5", "metadata": {}, "source": [ "### Concept for exercise: Spreadsheet\n", "\n", "For those who have worked with Excel, the Spreadsheet API will definitely feel familiar! The Spreadsheet API is a Jupyter notebook widget that allows us to interact with Modin DataFrames in a spreadsheet-like fashion while taking advantage of the underlying capabilities of Modin. The widget makes it quick and easy to explore, sort, filter, and edit data as well as export the changes as reproducible code.\n", "\n", "Let's look back at a subset of the 2015 NYC Taxi Data from Exercise 2, and see how the Spreadsheet API can make it easy to play with the data!" ] }, { "cell_type": "code", "execution_count": null, "id": "5d5c4a3e", "metadata": {}, "outputs": [], "source": [ "!jupyter nbextension enable --py --sys-prefix modin_spreadsheet" ] }, { "cell_type": "code", "execution_count": null, "id": "dc8d5903", "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import modin.experimental.spreadsheet as mss\n", "from modin.config import Engine\n", "Engine.put(\"dask\")\n", "\n", "s3_path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"\n", "modin_df = pd.read_csv(s3_path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3, nrows=1000)" ] }, { "cell_type": "code", "execution_count": null, "id": "145e7bbe", "metadata": {}, "outputs": [], "source": [ "spreadsheet = mss.from_dataframe(modin_df)\n", "spreadsheet" ] }, { "cell_type": "markdown", "id": "3c18b7f2", "metadata": {}, "source": [ "### Thank you for participating!" 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/requirements.txt ================================================ fsspec>=2022.11.0 jupyterlab ipywidgets modin[dask] modin[spreadsheet] ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import os import sys import nbformat MODIN_DIR = os.path.abspath( os.path.join(os.path.dirname(__file__), *[".." 
for _ in range(6)]) ) sys.path.insert(0, MODIN_DIR) from examples.tutorial.jupyter.execution.test.utils import ( # noqa: E402 _execute_notebook, _replace_str, download_taxi_dataset, test_dataset_path, ) local_notebooks_dir = "examples/tutorial/jupyter/execution/pandas_on_dask/local" # in this notebook user should replace 'import pandas as pd' with # 'import modin.pandas as pd' to make notebook work def test_exercise_1(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_1_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_1.ipynb"), as_version=nbformat.NO_CONVERT, ) _replace_str(nb, "import pandas as pd", "import modin.pandas as pd") nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) # this notebook works "as is" but for testing purposes we can use smaller dataset def test_exercise_2(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_2.ipynb"), as_version=nbformat.NO_CONVERT, ) new_cell = f'path = "{test_dataset_path}"\n' + download_taxi_dataset _replace_str( nb, 'path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"', new_cell, ) nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) # in this notebook user should add custom mad implementation # to make notebook work def test_exercise_3(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_3_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_3.ipynb"), as_version=nbformat.NO_CONVERT, ) user_mad_implementation = """PandasQueryCompiler.sq_mad_custom = TreeReduce.register(lambda cell_value, **kwargs: cell_value ** 2, pandas.DataFrame.mad) def sq_mad_func(self, axis=None, skipna=True, level=None, **kwargs): if axis is None: axis = 0 return self._reduce_dimension( self._query_compiler.sq_mad_custom( axis=axis, skipna=skipna, level=level, **kwargs ) ) 
pd.DataFrame.sq_mad_custom = sq_mad_func modin_mad_custom = df.sq_mad_custom() """ _replace_str(nb, "modin_mad_custom = ...", user_mad_implementation) nbformat.write(nb, modified_notebook_path) # need to update example, `.mad` doesn't exist # _execute_notebook(modified_notebook_path) # this notebook works "as is" but for testing purposes we can use smaller dataset def test_exercise_4(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_4_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_4.ipynb"), as_version=nbformat.NO_CONVERT, ) s3_path_cell = f's3_path = "{test_dataset_path}"\n' + download_taxi_dataset _replace_str( nb, 's3_path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"', s3_path_cell, ) nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/Dockerfile ================================================ FROM continuumio/miniconda3 RUN conda install -c conda-forge psutil setproctitle RUN pip install -r requirements-dev.txt ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/cluster/README.md ================================================ ![LOGO](../../../img/MODIN_ver2_hrz.png)

Scale your pandas workflows on a Ray cluster

**NOTE**: Before starting the exercise, please read the full instructions in the [Modin documentation](https://modin.readthedocs.io/en/latest/getting_started/using_modin/using_modin_cluster.html). The basic steps to run the script on a remote Ray cluster are: Step 1. Install the necessary dependencies ```bash pip install boto3 ``` Step 2. Set up your AWS credentials. ```bash aws configure ``` Step 3. Modify the configuration file and start up the Ray cluster. ```bash ray up modin-cluster.yaml ``` Step 4. Submit your script to the remote cluster. ```bash ray submit modin-cluster.yaml exercise_5.py ``` Step 5. Shut down the Ray remote cluster. ```bash ray down ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/cluster/exercise_5.py ================================================ import time import ray import modin.pandas as pd ray.init(address="auto") cpu_count = ray.cluster_resources()["CPU"] assert cpu_count == 576, f"Expected 576 CPUs, but found {cpu_count}" file_path = "big_yellow.csv" t0 = time.perf_counter() df = pd.read_csv(file_path, quoting=3) df_count = df.count() df_groupby_count = df.groupby("passenger_count").count() df_map = df.map(str) t1 = time.perf_counter() print(f"Full script time is {(t1 - t0):.3f}") # noqa: T201 ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/cluster/modin-cluster.yaml ================================================ # A unique identifier for the head node and workers of this cluster. cluster_name: modin_init # The maximum number of worker nodes to launch in addition to the head # node. max_workers: 5 # The autoscaler will scale up the cluster faster with higher upscaling speed. # E.g., if the task requires adding more nodes then autoscaler will gradually # scale up the cluster in chunks of upscaling_speed*currently_running_nodes. # This number should be > 0.
upscaling_speed: 1.0 # This executes all commands on all nodes in the docker container, # and opens all the necessary ports to support the Ray cluster. # Empty string means disabled. docker: # image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup image: rayproject/ray:latest-cpu # use this one if you don't need ML dependencies, it's faster to pull container_name: "ray_container" # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image # if no cached version is present. pull_before_run: True run_options: # Extra options to pass into "docker run" - --ulimit nofile=65536:65536 # Example of running a GPU head with CPU workers # head_image: "rayproject/ray-ml:latest-gpu" # Allow Ray to automatically detect GPUs # worker_image: "rayproject/ray-ml:latest-cpu" # worker_run_options: [] # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 # Cloud-provider specific configuration. provider: type: aws region: us-west-2 # Availability zone(s), comma-separated, that nodes may be launched in. # Nodes will be launched in the first listed availability zone and will # be tried in the subsequent availability zones if launching fails. availability_zone: us-west-2a,us-west-2b # Whether to allow node reuse. If set to False, nodes will be terminated # instead of stopped. cache_stopped_nodes: False # If not present, the default is True. # How Ray will authenticate with newly launched nodes. auth: ssh_user: ubuntu # By default Ray creates a new private keypair, but you can also use your own. # If you do so, make sure to also set "KeyName" in the head and worker node # configurations below. # ssh_private_key: /path/to/your/key.pem # Tell the autoscaler the allowed node types and the resources they provide. # The key is the name of the node type, which is just for debugging purposes. 
# The node config specifies the launch config and physical instance type. available_node_types: ray.head.default: # The node type's CPU and GPU resources are auto-detected based on AWS instance type. # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler. # You can also set custom resources. # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set # resources: {"CPU": 1, "GPU": 1, "custom": 5} resources: {} # Provider-specific config for this node type, e.g. instance type. By default # Ray will auto-configure unspecified fields such as SubnetId and KeyName. # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances node_config: InstanceType: m5.24xlarge # Default AMI for us-west-2. # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py # for default images for other zones. ImageId: ami-0387d929287ab193e # You can provision additional disk space with a conf as follows BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: VolumeSize: 500 VolumeType: gp3 # Additional options in the boto docs. ray.worker.default: # The minimum number of worker nodes of this type to launch. # This number should be >= 0. min_workers: 5 # The maximum number of worker nodes of this type to launch. # This takes precedence over min_workers. max_workers: 5 # The node type's CPU and GPU resources are auto-detected based on AWS instance type. # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler. # You can also set custom resources. # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set # resources: {"CPU": 1, "GPU": 1, "custom": 5} resources: {} # Provider-specific config for this node type, e.g. instance type. 
By default # Ray will auto-configure unspecified fields such as SubnetId and KeyName. # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances node_config: InstanceType: m5.24xlarge # Default AMI for us-west-2. # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py # for default images for other zones. ImageId: ami-0387d929287ab193e # You can provision additional disk space with a conf as follows BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: VolumeSize: 500 VolumeType: gp3 # Run workers on spot by default. Comment this out to use on-demand. # NOTE: If relying on spot instances, it is best to specify multiple different instance # types to avoid interruption when one instance type is experiencing heightened demand. # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/ # InstanceMarketOptions: # MarketType: spot # Additional options can be found in the boto docs, e.g. # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE # Additional options in the boto docs. # Specify the node type of the head node (as configured above). head_node_type: ray.head.default # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", } # Files or directories to copy from the head node to the worker nodes. The format is a # list of paths. The same path on the head node will be copied to the worker node. # This behavior is a subset of the file_mounts behavior. In the vast majority of cases # you should just use file_mounts. Only use this if you know what you're doing! 
cluster_synced_files: [] # Whether changes to directories in file_mounts or cluster_synced_files in the head node # should sync to the worker node continuously file_mounts_sync_continuously: False # Patterns for files to exclude when running rsync up or rsync down rsync_exclude: - "**/.git" - "**/.git/**" # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided # as a value, the behavior will match git's behavior for finding and using .gitignore files. rsync_filter: - ".gitignore" # List of commands that will be run before `setup_commands`. If docker is # enabled, these commands will run outside the container and before docker # is setup. initialization_commands: [] # List of shell commands to run to set up nodes. setup_commands: # Note: if you're developing Ray, you probably want to create a Docker image that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image # that has the "nightly" (e.g. 
"rayproject/ray-ml:nightly-gpu") or uncomment the following line: # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" - conda create -n "modin" -c conda-forge modin "ray-default">=2.10.0,<3 -y - conda activate modin && pip install -U fsspec>=2022.11.0 boto3 - echo "conda activate modin" >> ~/.bashrc - wget https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv - printf "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee\n" > big_yellow.csv - tail -n +2 yellow_tripdata_2015-01.csv{,}{,}{,}{,}{,}{,} >> big_yellow.csv - echo 'export MODIN_RAY_CLUSTER=True' >> ~/.bashrc # Custom commands that will be run on the head node after common setup. head_setup_commands: - echo 'export MODIN_REDIS_ADDRESS="localhost:6379"' >> ~/.bashrc # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - ray stop - echo 'export MEMORY_STORE_SIZE=$(awk "/MemFree/ { printf \"%d \\n\", \$2*1024}" /proc/meminfo)' >> ~/.bashrc - echo 'export TMPDIR="$(dirname $(mktemp tmp.XXXXXXXXXX -ut))"' >> ~/.bashrc - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR --dashboard-host=0.0.0.0 # Command to start ray on worker nodes. You don't need to change this. 
worker_start_ray_commands: - ray stop - echo 'export MEMORY_STORE_SIZE=$(awk "/MemFree/ { printf \"%d \\n\", \$2*1024}" /proc/meminfo)' >> ~/.bashrc - echo 'export TMPDIR="$(dirname $(mktemp tmp.XXXXXXXXXX -ut))"' >> ~/.bashrc - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/local/exercise_1.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 1: How to use Modin\n", "\n", "**GOAL**: Learn how to import Modin to accelerate and scale pandas workflows." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modin is a drop-in replacement for pandas that distributes the computation \n", "across all of the cores in your machine or in a cluster.\n", "In practical terms, this means that you can continue using the same pandas scripts\n", "as before and expect the behavior and results to be the same. The only thing that needs\n", "to change is the import statement. Normally, you would change:\n", "\n", "```python\n", "import pandas as pd\n", "```\n", "\n", "to:\n", "\n", "```python\n", "import modin.pandas as pd\n", "```\n", "\n", "Changing this line of code will allow you to use all of the cores in your machine to do computation on your data. One of the major performance bottlenecks of pandas is that it only uses a single core for any given computation. Modin exposes an API that is identical to pandas, allowing you to continue interacting with your data as you would with pandas. There are no additional commands required to use Modin locally. Partitioning, scheduling, data transfer, and other related concerns are all handled by Modin under the hood." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "

pandas on a multicore laptop\n", " \n", " Modin on a multicore laptop\n", " \n", "\n", "
\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for exercise: Dataframe constructor\n", "\n", "Often when playing around in pandas, it is useful to create a DataFrame with the constructor. That is where we will start.\n", "\n", "```python\n", "import numpy as np\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "```\n", "\n", "When creating a dataframe from a non-distributed object, it will take extra time to partition the data. When this is happening, you will see this message:\n", "\n", "```\n", "UserWarning: Distributing object. This may take some time.\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Note: Do not change this code!\n", "import numpy as np\n", "import pandas\n", "import sys\n", "import modin" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Implement your answer here. You are also free to play with the size\n", "# and shape of the DataFrame, but beware of exceeding your memory!\n", "\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "\n", "# ***** Do not change the code below! It verifies that \n", "# ***** the exercise has been done correctly. *****\n", "\n", "try:\n", " assert df is not None\n", " assert frame_data is not None\n", " assert isinstance(frame_data, np.ndarray)\n", "except:\n", " raise AssertionError(\"Don't change too much of the original code!\")\n", "assert \"modin.pandas\" in sys.modules, \"Not quite correct. 
Remember the single line of code change (See above)\"\n", "\n", "import modin.pandas\n", "assert pd == modin.pandas, \"Remember the single line of code change (See above)\"\n", "assert hasattr(df, \"_query_compiler\"), \"Make sure that `df` is a modin.pandas DataFrame.\"\n", "\n", "print(\"Success! You only need to change one line of code!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have created a toy example for playing around with the DataFrame, let's print it out in different ways." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for Exercise: Data Interaction and Printing\n", "\n", "When interacting with data, it is very important to look at different parts of the data (e.g. `df.head()`). Here we will show that you can print the modin.pandas DataFrame in the same ways you would pandas." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the first 10 lines.\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the DataFrame.\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Free cell for custom interaction (Play around here!)\n", "df.add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 2](./exercise_2.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE:
examples/tutorial/jupyter/execution/pandas_on_ray/local/exercise_2.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 2: Speed improvements\n", "\n", "**GOAL**: Learn about common functionality that Modin speeds up by using all of your machine's cores." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for Exercise: `read_csv` speedups\n", "\n", "The most commonly used data ingestion method used in pandas is CSV files (link to pandas survey). This concept is designed to give an idea of the kinds of speedups possible, even on a non-distributed filesystem. Modin also supports other file formats for parallel and distributed reads, which can be found in the documentation.\n", "\n", "![](../../../img/read_csv_perf.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will import both Modin and pandas so that the speedups are evident.\n", "\n", "**Note: Rerunning the `read_csv` cells many times may result in degraded performance, depending on the memory of the machine**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import time\n", "from IPython.display import Markdown, display\n", "\n", "def printmd(string):\n", " display(Markdown(string))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: 2015 NYC taxi trip data\n", "\n", "We will be using a version of this data already in S3, originally posted in this blog post: https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes\n", "\n", "**Size: ~1.8GB**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Optional:** Note that the dataset takes a while to download. 
To speed things up a bit, if you prefer to download this file once locally, you can run the following code in the notebook:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# [Optional] Download data locally. This may take a few minutes to download.\n", "# import urllib.request\n", "# url_path = \"https://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"\n", "# urllib.request.urlretrieve(url_path, \"taxi.csv\")\n", "# path = \"taxi.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_df = pandas.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Expect pandas to take >3 minutes on EC2, longer locally\n", "\n", "This is a good time to chat with your neighbor\n", "Discussion topics\n", "- Do you work with a large amount of data daily?\n", "- How big is your data?\n", "- What’s the common use case of your data?\n", "- Do you use any big data analytics tools?\n", "- Do you use any interactive analytics tool?\n", "- What are some drawbacks of your current interactive analytics tools today?" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `modin.pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df = pd.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Reduces\n", "\n", "In pandas, a reduce would be something along the lines of a `sum` or `count`. It computes some summary statistics about the rows or columns. We will be using `count`." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_count = pandas_df.count()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to count with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_count = modin_df.count()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to count with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `count`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Map operations\n", "\n", "In pandas, map operations are operations that do a single pass over the data and do not change its shape. Operations like `isnull` and `applymap` are included in this. We will be using `isnull`." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_isnull = pandas_df.isnull()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to isnull with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_isnull = modin_df.isnull()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to isnull with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `isnull`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_isnull" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_isnull" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Apply over a single column\n", "\n", "Sometimes we want to compute some summary statistics on a single column from our dataset." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "rounded_trip_distance_pandas = pandas_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "rounded_trip_distance_modin = modin_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `apply` on one column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_modin" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Add a column\n", "\n", "It is common to need to add a new column to an existing dataframe, here we show that this is significantly faster in Modin due to metadata management and an efficient zero copy implementation." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "pandas_df[\"rounded_trip_distance\"] = rounded_trip_distance_pandas\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df[\"rounded_trip_distance\"] = rounded_trip_distance_modin\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas add a column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 3](./exercise_3.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/local/exercise_3.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 3: Not Implemented\n", "\n", "**GOAL**: Learn what happens when a function is not yet supported in Modin as well as how to extend Modin's functionality using the DataFrame Algebra." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When functionality has not yet been implemented, we default to pandas\n", "\n", "![](../../../img/convert_to_pandas.png)\n", "\n", "We convert a Modin dataframe to pandas to do the operation, then convert it back once it is finished. These operations will have a high overhead due to the communication involved and will take longer than pandas.\n", "\n", "When this is happening, a warning will be given to the user to inform them that this operation will take longer than usual. For example, `DataFrame.mask` is not yet implemented. In this case, when a user tries to use it, they will see this warning:\n", "\n", "```\n", "UserWarning: `DataFrame.mask` defaulting to pandas implementation.\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Default to pandas\n", "\n", "In this section of the exercise we will see first-hand how the runtime is affected by operations that are not implemented." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import numpy as np\n", "import time\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n", "df = pd.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df = pandas.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_start = time.time()\n", "\n", "print(df.mask(df < 50))\n", "\n", "modin_end = time.time()\n", "print(\"Modin mask took {} seconds.\".format(round(modin_end - modin_start, 4)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_start = time.time()\n", "\n", "print(pandas_df.mask(pandas_df < 50))\n", "\n", "pandas_end = time.time()\n", "print(\"pandas mask took {} seconds.\".format(round(pandas_end - pandas_start, 4)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Register custom functions\n", "\n", "Modin's user-facing API is pandas, but it is possible that we do not yet support your favorite or most-needed functionalities. Your user-defined function may also be able to be executed more efficiently if you pre-define the type of function it is (e.g. map, reduce, etc.) using the DataFrame Algebra. To solve either case, it is possible to register a custom function to be applied to your data." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Registering a custom function for all query compilers\n", "\n", "To register a custom function for a query compiler, we first need to import it:\n", "\n", "```python\n", "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "```\n", "\n", "The `PandasQueryCompiler` is responsible for defining and compiling the queries that can be operated on by Modin, and is specific to the pandas storage format. Any queries defined here must also both be compatible with and result in a `pandas.DataFrame`. Many functionalities are very simply implemented, as you can see in the current code: [Link](https://github.com/modin-project/modin/blob/7a8158873e77cb5f1a5a3b89be4ddac89f576269/modin/core/storage_formats/pandas/query_compiler.py#L216).\n", "\n", "If we want to register a new function, we need to understand what kind of function it is. In our example, we will try to implement a `kurtosis` on the unary negation of the values in the dataframe, which is a map (unary negation of each cell) followed by a reduce. 
So we next want to import the function type so we can use it in our definition:\n", "\n", "```python\n", "from modin.core.dataframe.algebra import TreeReduce\n", "```\n", "\n", "Then we can just use the `TreeReduce.register` `classmethod` and assign it to the `PandasQueryCompiler`:\n", "\n", "```python\n", "PandasQueryCompiler.neg_kurtosis = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value, pandas.DataFrame.kurtosis)\n", "```\n", "\n", "We include `**kwargs` to the `lambda` function since the query compiler will pass all keyword arguments to both the map and reduce functions.\n", "\n", "Finally, we want a handle to it from the `DataFrame`, so we need to create a way to do that:\n", "\n", "```python\n", "def neg_kurtosis_func(self, **kwargs):\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " return self.__constructor__(query_compiler=self._query_compiler.neg_kurtosis(**kwargs))\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func\n", "```\n", "\n", "And then you can use it like you usually would:\n", "\n", "```python\n", "df.neg_kurtosis_custom()\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "from modin.core.dataframe.algebra import TreeReduce" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PandasQueryCompiler.neg_kurtosis_custom = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value,\n", " pandas.DataFrame.kurtosis)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pandas._libs import lib\n", "# The function signature came from the pandas documentation:\n", "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurtosis.html\n", "def neg_kurtosis_func(self, axis=lib.no_default, skipna=True, level=None, numeric_only=None, 
**kwargs):\n", " # We need to specify the axis for the query compiler\n", " if axis in [None, lib.no_default]:\n", " axis = 0\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " # Reduce dimension is used for reduces\n", " # We also pass all keyword arguments here to ensure correctness\n", " return self._reduce_dimension(\n", " self._query_compiler.neg_kurtosis_custom(\n", " axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs\n", " )\n", " )\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Speed improvements\n", "If we were to try and replicate this functionality using the pandas API, we would need to call `df.applymap` with our unary negation function, and subsequently `df.kurtosis` on the result of the first call. Let's see how this compares with our new, custom function!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(pandas_df.applymap(lambda cell_value: ~cell_value).kurtosis())\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"pandas unary negation kurtosis took {} seconds.\".format(pandas_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(df.applymap(lambda x: ~x).kurtosis())\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Modin unary negation kurtosis took {} seconds.\".format(modin_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "custom_start = time.time()\n", "\n", "print(df.neg_kurtosis_custom())\n", "\n", "custom_end = time.time()\n", "modin_custom_duration = custom_end - custom_start\n", "print(\"Modin neg_kurtosis_custom took {} seconds.\".format(modin_custom_duration))" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown, display\n", "\n", "display(Markdown(\"### As expected, Modin is {}x faster than pandas when chaining the functions; however we see that our custom function is even faster than that - beating pandas by {}x, and Modin (when chaining the functions) by {}x!\".format(round(pandas_duration / modin_duration, 2), round(pandas_duration / modin_custom_duration, 2), round(modin_duration / modin_custom_duration, 2))))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Congratulations! You have just implemented new DataFrame functionality!\n", "\n", "## Consider opening a pull request: https://github.com/modin-project/modin/pulls\n", "\n", "For a complete list of what is implemented, see the [Supported APIs](https://modin.readthedocs.io/en/latest/supported_apis/index.html) section." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test your knowledge: Add a custom function for another tree reduce: finding `DataFrame.mad` after squaring all of the values\n", "\n", "See the pandas documentation for the correct signature: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mad.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_mad_custom_start = time.time()\n", "\n", "# Implement your function here! 
Put the result of your custom squared `mad` in the variable `modin_mad_custom`\n", "# Hint: Look at the kurtosis walkthrough above\n", "\n", "modin_mad_custom = ...\n", "print(modin_mad_custom)\n", "\n", "modin_mad_custom_end = time.time()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Evaluation code, do not change!\n", "modin_mad_start = time.time()\n", "modin_mad = df.applymap(lambda x: x**2).mad()\n", "print(modin_mad)\n", "modin_mad_end = time.time()\n", "\n", "assert modin_mad_end - modin_mad_start > modin_mad_custom_end - modin_mad_custom_start, \\\n", " \"Your implementation was too slow, or you used the chaining functions approach. Try again\"\n", "assert modin_mad._to_pandas().equals(modin_mad_custom._to_pandas()), \"Your result did not match the result of chaining the functions, try again\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now that you are able to create custom functions, you know enough to contribute to Modin!\n", "\n", "**Please move on to [Exercise 4](./exercise_4.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/local/exercise_4.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "99f41d2d", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n", "\n" ] }, { "cell_type": "markdown", "id": "fdda1c9c", "metadata": {}, "source": [ "# Exercise 4: Experimental Features\n", "\n", "**GOAL**: Explore some of the experimental features being added to Modin." ] }, { "cell_type": "markdown", "id": "9b487c51", "metadata": {}, "source": [ "### Concept for exercise: Progress Bar\n", "\n", "\n", "Sometimes when running long functions on DataFrames, it can be hard to tell how much progress has been made, as well as how much longer the function will run. A progress bar allows users to see the estimated progress and completion time of each line they run, in environments such as a shell or Jupyter notebook.\n", "\n", "To enable Modin's Progress Bar, add the following lines of code after importing `modin.pandas`:\n", "```python\n", "from modin.config import ProgressBar\n", "ProgressBar.enable()\n", "```\n", "\n", "In this exercise, we'll see how the progress bar can improve our experience running dataframe queries!" ] }, { "cell_type": "code", "execution_count": null, "id": "f95d4874", "metadata": { "scrolled": false }, "outputs": [], "source": [ "import modin.pandas as pd\n", "import numpy as np\n", "from modin.config import ProgressBar\n", "ProgressBar.enable()\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n", "df = pd.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "markdown", "id": "6905bc6b", "metadata": {}, "source": [ "On longer functions, it's nice to be able to see an estimate of how much longer things will take!" ] }, { "cell_type": "code", "execution_count": null, "id": "236ec8e2", "metadata": {}, "outputs": [], "source": [ "df = df.applymap(lambda x: ~x)\n", "df" ] }, { "cell_type": "markdown", "id": "e7bf87a5", "metadata": {}, "source": [ "### Concept for exercise: Spreadsheet\n", "\n", "For those who have worked with Excel, the Spreadsheet API will definitely feel familiar! 
The Spreadsheet API is a Jupyter notebook widget that allows us to interact with Modin DataFrames in a spreadsheet-like fashion while taking advantage of the underlying capabilities of Modin. The widget makes it quick and easy to explore, sort, filter, and edit data as well as export the changes as reproducible code.\n", "\n", "Let's look back at a subset of the 2015 NYC Taxi Data from Exercise 2, and see how the Spreadsheet API can make it easy to play with the data!" ] }, { "cell_type": "code", "execution_count": null, "id": "5d5c4a3e", "metadata": {}, "outputs": [], "source": [ "!jupyter nbextension enable --py --sys-prefix modin_spreadsheet\n", "ProgressBar.disable()" ] }, { "cell_type": "code", "execution_count": null, "id": "dc8d5903", "metadata": {}, "outputs": [], "source": [ "import modin.experimental.spreadsheet as mss\n", "\n", "s3_path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"\n", "modin_df = pd.read_csv(s3_path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3, nrows=1000)" ] }, { "cell_type": "code", "execution_count": null, "id": "145e7bbe", "metadata": {}, "outputs": [], "source": [ "spreadsheet = mss.from_dataframe(modin_df)\n", "spreadsheet" ] }, { "cell_type": "markdown", "id": "3c18b7f2", "metadata": {}, "source": [ "### Thank you for participating!" 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt ================================================ fsspec>=2022.11.0 jupyterlab ipywidgets tqdm>=4.60.0 modin[ray] modin[spreadsheet] ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import os import sys import nbformat MODIN_DIR = os.path.abspath( os.path.join(os.path.dirname(__file__), *[".." 
for _ in range(6)]) ) sys.path.insert(0, MODIN_DIR) from examples.tutorial.jupyter.execution.test.utils import ( # noqa: E402 _execute_notebook, _find_code_cell_idx, _replace_str, download_taxi_dataset, test_dataset_path, ) local_notebooks_dir = "examples/tutorial/jupyter/execution/pandas_on_ray/local" # in this notebook user should replace 'import pandas as pd' with # 'import modin.pandas as pd' to make notebook work def test_exercise_1(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_1_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_1.ipynb"), as_version=nbformat.NO_CONVERT, ) _replace_str(nb, "import pandas as pd", "import modin.pandas as pd") nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) # this notebook works "as is" but for testing purposes we can use smaller dataset def test_exercise_2(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_2.ipynb"), as_version=nbformat.NO_CONVERT, ) _replace_str( nb, 'path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"', '# path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"', ) new_optional_cell = f'path = "{test_dataset_path}"\n' + download_taxi_dataset optional_cell_idx = _find_code_cell_idx(nb, "[Optional] Download data locally.") nb["cells"][optional_cell_idx]["source"] = new_optional_cell nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) # in this notebook user should add custom mad implementation # to make notebook work def test_exercise_3(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_3_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_3.ipynb"), as_version=nbformat.NO_CONVERT, ) user_mad_implementation = """PandasQueryCompiler.sq_mad_custom = TreeReduce.register(lambda cell_value, **kwargs: cell_value ** 2, 
pandas.DataFrame.mad) def sq_mad_func(self, axis=None, skipna=True, level=None, **kwargs): if axis is None: axis = 0 return self._reduce_dimension( self._query_compiler.sq_mad_custom( axis=axis, skipna=skipna, level=level, **kwargs ) ) pd.DataFrame.sq_mad_custom = sq_mad_func modin_mad_custom = df.sq_mad_custom() """ _replace_str(nb, "modin_mad_custom = ...", user_mad_implementation) nbformat.write(nb, modified_notebook_path) # need to update example, `.mad` doesn't exist # _execute_notebook(modified_notebook_path) # this notebook works "as is" but for testing purposes we can use smaller dataset def test_exercise_4(): modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_4_test.ipynb") nb = nbformat.read( os.path.join(local_notebooks_dir, "exercise_4.ipynb"), as_version=nbformat.NO_CONVERT, ) s3_path_cell = f's3_path = "{test_dataset_path}"\n' + download_taxi_dataset _replace_str( nb, 's3_path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"', s3_path_cell, ) nbformat.write(nb, modified_notebook_path) _execute_notebook(modified_notebook_path) ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/Dockerfile ================================================ FROM continuumio/miniconda3 RUN conda env create -f jupyter_unidist_env.yml RUN conda install -c conda-forge psutil setproctitle ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/README.md ================================================ # Jupyter notebook examples to run with PandasOnUnidist Currently, Modin supports `PandasOnUnidist` execution only with MPI backend of [unidist](https://github.com/modin-project/unidist). There are some specifics on how to run a jupyter notebook with MPI, namely, you should use `mpiexec` command. ```bash mpiexec -n 1 jupyter notebook ``` **Important** MPI is not reliable yet to work in interactive environment such as jupyter notebooks. 
Thus, some things may not work. For example, if you are experiencing the error `The kernel appears to have died. It will restart automatically.`, you may want to modify the `kernel.json` file or create a new one in order to fix the problem. For simplicity, you can just run the `setup_kernel.py` script located in this directory. This will install a new MPI-enabled kernel, which you can then select using the dropdown menu in your browser. Otherwise, you can follow the steps below: 1. First, locate the `kernel.json` file with the `jupyter kernelspec list` command. The output should generally look like this. ```bash jupyter kernelspec list Available kernels: python3 $PREFIX/share/jupyter/kernels/python3 ``` The `kernel.json` file should be located in the `python3` folder. 2. Second, make a copy of the `python3` folder, say as a `python3mpi` folder. ```bash cp -r $PREFIX/share/jupyter/kernels/python3 $PREFIX/share/jupyter/kernels/python3mpi ``` 3. Third, modify the `kernel.json` file in the `python3mpi` folder to add the `mpiexec -n 1` command (like "mpiexec", "-n", "1") to the beginning of the launched command (`argv`). 4. Fourth, change `display_name` in the `kernel.json` file to something like `Python 3 (ipykernel) with MPI`. That way you can specifically select the MPI-enabled Python kernel using the dropdown menu in your browser. ## Run Jupyter Notebooks with PandasOnUnidist After the `setup_kernel.py` script is run or the steps above are done, you can run a jupyter notebook with `PandasOnUnidist` in the normal way.
```bash jupyter notebook ``` ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/jupyter_unidist_env.yml ================================================ name: jupyter_modin_on_unidist channels: - conda-forge dependencies: - pip - fsspec>=2022.11.0 - jupyterlab - ipywidgets - modin-mpi - pip: - modin[spreadsheet] ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/local/exercise_1.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 1: How to use Modin\n", "\n", "**GOAL**: Learn how to import Modin to accelerate and scale pandas workflows." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modin is a drop-in replacement for pandas that distributes the computation \n", "across all of the cores in your machine or in a cluster.\n", "In practical terms, this means that you can continue using the same pandas scripts\n", "as before and expect the behavior and results to be the same. The only thing that needs\n", "to change is the import statement. Normally, you would change:\n", "\n", "```python\n", "import pandas as pd\n", "```\n", "\n", "to:\n", "\n", "```python\n", "import modin.pandas as pd\n", "```\n", "\n", "Changing this line of code will allow you to use all of the cores in your machine to do computation on your data. One of the major performance bottlenecks of pandas is that it only uses a single core for any given computation. Modin exposes an API that is identical to pandas, allowing you to continue interacting with your data as you would with pandas. There are no additional commands required to use Modin locally. Partitioning, scheduling, data transfer, and other related concerns are all handled by Modin under the hood." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "

pandas on a multicore laptop\n", " \n", " Modin on a multicore laptop\n", " \n", "\n", "
\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for exercise: setting Modin engine\n", "\n", "Modin uses Ray as an execution engine by default so no additional action is required to start to use it. Alternatively, if you need to use another engine, it should be specified either by setting the Modin config or by setting Modin environment variable before the first operation with Modin as it is shown below. Also, note that the full list of Modin configs and corresponding environment variables can be found in the [Modin Configuration Settings](https://modin.readthedocs.io/en/stable/flow/modin/config.html#modin-configs-list) section of the Modin documentation.\n", "\n", "One of the execution engines that Modin uses is Unidist. Currently, Modin only supports MPI through unidist, so it should be specified either by setting the Unidist config or by setting Unidist environment variable. The full list of Unidist configs and corresponding environment variables can be found in the [Unidist Configuration Settings](https://unidist.readthedocs.io/en/latest/flow/unidist/config.html#unidist-configuration-settings-list) section of the Unidist documentation." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Modin engine and Unidist backend can be specified either by config\n", "import modin.config as modin_cfg\n", "import unidist.config as unidist_cfg\n", "modin_cfg.Engine.put(\"unidist\")\n", "unidist_cfg.Backend.put(\"mpi\")\n", "\n", "# or by setting the environment variable\n", "# import os\n", "# os.environ[\"MODIN_ENGINE\"] = \"unidist\"\n", "# os.environ[\"UNIDIST_BACKEND\"] = \"mpi\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for exercise: Dataframe constructor\n", "\n", "Often when playing around in pandas, it is useful to create a DataFrame with the constructor. 
That is where we will start.\n", "\n", "```python\n", "import numpy as np\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "```\n", "\n", "When creating a dataframe from a non-distributed object, it will take extra time to partition the data. When this is happening, you will see this message:\n", "\n", "```\n", "UserWarning: Distributing object. This may take some time.\n", "```\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Note: Do not change this code!\n", "import numpy as np\n", "import pandas\n", "import sys\n", "import modin" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "pandas.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "modin.__version__" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Implement your answer here. You are also free to play with the size\n", "# and shape of the DataFrame, but beware of exceeding your memory!\n", "\n", "# import pandas as pd\n", "import pandas as pd\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**5, 2**5))\n", "df = pd.DataFrame(frame_data)\n", "\n", "# ***** Do not change the code below! It verifies that \n", "# ***** the exercise has been done correctly. *****\n", "\n", "try:\n", " assert df is not None\n", " assert frame_data is not None\n", " assert isinstance(frame_data, np.ndarray)\n", "except:\n", " raise AssertionError(\"Don't change too much of the original code!\")\n", "assert \"modin.pandas\" in sys.modules, \"Not quite correct. 
Remember the single line of code change (See above)\"\n", "\n", "import modin.pandas\n", "assert pd == modin.pandas, \"Remember the single line of code change (See above)\"\n", "assert hasattr(df, \"_query_compiler\"), \"Make sure that `df` is a modin.pandas DataFrame.\"\n", "\n", "print(\"Success! You only need to change one line of code!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have created a toy example for playing around with the DataFrame, let's print it out in different ways." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concept for Exercise: Data Interaction and Printing\n", "\n", "When interacting with data, it is very important to look at different parts of the data (e.g. `df.head()`). Here we will show that you can print the modin.pandas DataFrame in the same ways you would pandas." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Print the first 10 lines.\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Print the DataFrame.\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "# Free cell for custom interaction (Play around here!)\n", "df.add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "python" } }, "outputs": [], "source": [ "df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 2](./exercise_2.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel) with MPI", "language": "python", "name": "python3mpi" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/local/exercise_2.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 2: Speed improvements\n", "\n", "**GOAL**: Learn about common functionality that Modin speeds up by using all of your machine's cores." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for Exercise: `read_csv` speedups\n", "\n", "The most commonly used data ingestion method used in pandas is CSV files (link to pandas survey). This concept is designed to give an idea of the kinds of speedups possible, even on a non-distributed filesystem. Modin also supports other file formats for parallel and distributed reads, which can be found in the documentation. We will import both Modin and pandas so that the speedups are evident.\n", "\n", "**Note: Rerunning the `read_csv` cells many times may result in degraded performance, depending on the memory of the machine**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import time\n", "from IPython.display import Markdown, display\n", "\n", "def printmd(string):\n", " display(Markdown(string))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset: 2015 NYC taxi trip data\n", "\n", "\n", "We will be using a version of this data already in S3, originally posted in this blog post: https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes\n", "\n", "**Size: ~1.8GB**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modin execution engine setting:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.config as modin_cfg\n", "import unidist.config as unidist_cfg\n", "modin_cfg.Engine.put(\"unidist\")\n", "unidist_cfg.Backend.put(\"mpi\")" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "## `pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_df = pandas.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Expect pandas to take >3 minutes on EC2, longer locally\n", "\n", "This is a good time to chat with your neighbor\n", "Dicussion topics\n", "- Do you work with a large amount of data daily?\n", "- How big is your data?\n", "- What’s the common use case of your data?\n", "- Do you use any big data analytics tools?\n", "- Do you use any interactive analytics tool?\n", "- What’s are some drawbacks of your current interative analytic tools today?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `modin.pandas.read_csv`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df = pd.read_csv(path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Reduces\n", "\n", "In pandas, a reduce would be something along the lines of a `sum` or `count`. It computes some summary statistics about the rows or columns. We will be using `count`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_count = pandas_df.count()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to count with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_count = modin_df.count()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to count with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `count`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Map operations\n", "\n", "In pandas, map operations are operations that do a single pass over the data and do not change its shape. Operations like `isnull` and `applymap` are included in this. We will be using `isnull`." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "pandas_isnull = pandas_df.isnull()\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "\n", "print(\"Time to isnull with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_isnull = modin_df.isnull()\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to isnull with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `isnull`!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_isnull" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_isnull" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Apply over a single column\n", "\n", "Sometimes we want to compute some summary statistics on a single column from our dataset." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "rounded_trip_distance_pandas = pandas_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "rounded_trip_distance_modin = modin_df[\"trip_distance\"].apply(round)\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas at `apply` on one column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rounded_trip_distance_modin" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Add a column\n", "\n", "It is common to need to add a new column to an existing dataframe, here we show that this is significantly faster in Modin due to metadata management and an efficient zero copy implementation." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "pandas_df[\"rounded_trip_distance\"] = rounded_trip_distance_pandas\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "modin_df[\"rounded_trip_distance\"] = rounded_trip_distance_modin\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n", "\n", "printmd(\"### Modin is {}x faster than pandas add a column!\".format(round(pandas_duration / modin_duration, 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Are they equal?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Please move on to [Exercise 3](./exercise_3.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel) with MPI", "language": "python", "name": "python3mpi" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/local/exercise_3.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 3: Not Implemented\n", "\n", "**GOAL**: Learn what happens when a function is not yet supported in Modin as well as how to extend Modin's functionality using the DataFrame Algebra." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When functionality has not yet been implemented, we default to pandas\n", "\n", "![](../../../img/convert_to_pandas.png)\n", "\n", "We convert a Modin dataframe to pandas to do the operation, then convert it back once it is finished. These operations will have a high overhead due to the communication involved and will take longer than pandas.\n", "\n", "When this is happening, a warning will be given to the user to inform them that this operation will take longer than usual. For example, `DataFrame.mask` is not yet implemented. In this case, when a user tries to use it, they will see this warning:\n", "\n", "```\n", "UserWarning: `DataFrame.mask` defaulting to pandas implementation.\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Default to pandas\n", "\n", "In this section of the exercise we will see first-hand how the runtime is affected by operations that are not implemented." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import pandas\n", "import numpy as np\n", "import time\n", "import modin.config as modin_cfg\n", "import unidist.config as unidist_cfg\n", "modin_cfg.Engine.put(\"unidist\")\n", "unidist_cfg.Backend.put(\"mpi\")\n", "\n", "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n", "df = pd.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_df = pandas.DataFrame(frame_data).add_prefix(\"col\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_start = time.time()\n", "\n", "print(df.mask(df < 50))\n", "\n", "modin_end = time.time()\n", "print(\"Modin mask took {} seconds.\".format(round(modin_end - modin_start, 4)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pandas_start = time.time()\n", "\n", "print(pandas_df.mask(pandas_df < 50))\n", "\n", "pandas_end = time.time()\n", "print(\"pandas mask took {} seconds.\".format(round(pandas_end - pandas_start, 4)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concept for exercise: Register custom functions\n", "\n", "Modin's user-facing API is pandas, but it is possible that we do not yet support your favorite or most-needed functionalities. Your user-defined function may also be able to be executed more efficiently if you pre-define the type of function it is (e.g. map, reduce, etc.) using the DataFrame Algebra. To solve either case, it is possible to register a custom function to be applied to your data." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Registering a custom function for all query compilers\n", "\n", "To register a custom function for a query compiler, we first need to import it:\n", "\n", "```python\n", "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "```\n", "\n", "The `PandasQueryCompiler` is responsible for defining and compiling the queries that can be operated on by Modin, and is specific to the pandas storage format. Any queries defined here must also both be compatible with and result in a `pandas.DataFrame`. Many functionalities are very simply implemented, as you can see in the current code: [Link](https://github.com/modin-project/modin/blob/7a8158873e77cb5f1a5a3b89be4ddac89f576269/modin/core/storage_formats/pandas/query_compiler.py#L216).\n", "\n", "If we want to register a new function, we need to understand what kind of function it is. In our example, we will try to implement a `kurtosis` on the unary negation of the values in the dataframe, which is a map (unary negation of each cell) followed by a reduce. 
So we next want to import the function type so we can use it in our definition:\n", "\n", "```python\n", "from modin.core.dataframe.algebra import TreeReduce\n", "```\n", "\n", "Then we can just use the `TreeReduce.register` `classmethod` and assign it to the `PandasQueryCompiler`:\n", "\n", "```python\n", "PandasQueryCompiler.neg_kurtosis = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value, pandas.DataFrame.kurtosis)\n", "```\n", "\n", "We include `**kwargs` to the `lambda` function since the query compiler will pass all keyword arguments to both the map and reduce functions.\n", "\n", "Finally, we want a handle to it from the `DataFrame`, so we need to create a way to do that:\n", "\n", "```python\n", "def neg_kurtosis_func(self, **kwargs):\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " return self.__constructor__(query_compiler=self._query_compiler.neg_kurtosis(**kwargs))\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func\n", "```\n", "\n", "And then you can use it like you usually would:\n", "\n", "```python\n", "df.neg_kurtosis_custom()\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler\n", "from modin.core.dataframe.algebra import TreeReduce" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PandasQueryCompiler.neg_kurtosis_custom = TreeReduce.register(lambda cell_value, **kwargs: ~cell_value,\n", " pandas.DataFrame.kurtosis)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pandas._libs import lib\n", "# The function signature came from the pandas documentation:\n", "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurtosis.html\n", "def neg_kurtosis_func(self, axis=lib.no_default, skipna=True, level=None, numeric_only=None, 
**kwargs):\n", " # We need to specify the axis for the query compiler\n", " if axis in [None, lib.no_default]:\n", " axis = 0\n", " # The constructor allows you to pass in a query compiler as a keyword argument\n", " # Reduce dimension is used for reduces\n", " # We also pass all keyword arguments here to ensure correctness\n", " return self._reduce_dimension(\n", " self._query_compiler.neg_kurtosis_custom(\n", " axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs\n", " )\n", " )\n", "\n", "pd.DataFrame.neg_kurtosis_custom = neg_kurtosis_func" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Speed improvements\n", "If we were to try and replicate this functionality using the pandas API, we would need to call `df.applymap` with our unary negation function, and subsequently `df.kurtosis` on the result of the first call. Let's see how this compares with our new, custom function!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(pandas_df.applymap(lambda cell_value: ~cell_value).kurtosis())\n", "\n", "end = time.time()\n", "pandas_duration = end - start\n", "print(\"pandas unary negation kurtosis took {} seconds.\".format(pandas_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "\n", "print(df.applymap(lambda x: ~x).kurtosis())\n", "\n", "end = time.time()\n", "modin_duration = end - start\n", "print(\"Modin unary negation kurtosis took {} seconds.\".format(modin_duration))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "custom_start = time.time()\n", "\n", "print(df.neg_kurtosis_custom())\n", "\n", "custom_end = time.time()\n", "modin_custom_duration = custom_end - custom_start\n", "print(\"Modin neg_kurtosis_custom took {} seconds.\".format(modin_custom_duration))" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown, display\n", "\n", "display(Markdown(\"### As expected, Modin is {}x faster than pandas when chaining the functions; however we see that our custom function is even faster than that - beating pandas by {}x, and Modin (when chaining the functions) by {}x!\".format(round(pandas_duration / modin_duration, 2), round(pandas_duration / modin_custom_duration, 2), round(modin_duration / modin_custom_duration, 2))))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Congratulations! You have just implemented new DataFrame functionality!\n", "\n", "## Consider opening a pull request: https://github.com/modin-project/modin/pulls\n", "\n", "For a complete list of what is implemented, see the [Supported APIs](https://modin.readthedocs.io/en/latest/supported_apis/index.html) section." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test your knowledge: Add a custom function for another tree reduce: finding `DataFrame.mad` after squaring all of the values\n", "\n", "See the pandas documentation for the correct signature: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mad.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modin_mad_custom_start = time.time()\n", "\n", "# Implement your function here! 
Put the result of your custom squared `mad` in the variable `modin_mad_custom`\n", "# Hint: Look at the kurtosis walkthrough above\n", "\n", "modin_mad_custom = ...\n", "print(modin_mad_custom)\n", "\n", "modin_mad_custom_end = time.time()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Evaluation code, do not change!\n", "modin_mad_start = time.time()\n", "modin_mad = df.applymap(lambda x: x**2).mad()\n", "print(modin_mad)\n", "modin_mad_end = time.time()\n", "\n", "assert modin_mad_end - modin_mad_start > modin_mad_custom_end - modin_mad_custom_start, \\\n", " \"Your implementation was too slow, or you used the chaining functions approach. Try again\"\n", "assert modin_mad._to_pandas().equals(modin_mad_custom._to_pandas()), \"Your result did not match the result of chaining the functions, try again\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now that you are able to create custom functions, you know enough to contribute to Modin!\n", "\n", "**Please move on to [Exercise 4](./exercise_4.ipynb) when you are ready**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel) with MPI", "language": "python", "name": "python3mpi" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/local/exercise_4.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "99f41d2d", "metadata": {}, "source": [ "![LOGO](../../../img/MODIN_ver2_hrz.png)\n", "\n", "

Scale your pandas workflows by changing one line of code

\n", "\n" ] }, { "cell_type": "markdown", "id": "fdda1c9c", "metadata": {}, "source": [ "# Exercise 4: Experimental Features\n", "\n", "**GOAL**: Explore some of the experimental features being added to Modin." ] }, { "cell_type": "markdown", "id": "e7bf87a5", "metadata": {}, "source": [ "### Concept for exercise: Spreadsheet\n", "\n", "For those who have worked with Excel, the Spreadsheet API will definitely feel familiar! The Spreadsheet API is a Jupyter notebook widget that allows us to interact with Modin DataFrames in a spreadsheet-like fashion while taking advantage of the underlying capabilities of Modin. The widget makes it quick and easy to explore, sort, filter, and edit data as well as export the changes as reproducible code.\n", "\n", "Let's look back at a subset of the 2015 NYC Taxi Data from Exercise 2, and see how the Spreadsheet API can make it easy to play with the data!" ] }, { "cell_type": "code", "execution_count": null, "id": "5d5c4a3e", "metadata": {}, "outputs": [], "source": [ "!jupyter nbextension enable --py --sys-prefix modin_spreadsheet" ] }, { "cell_type": "code", "execution_count": null, "id": "dc8d5903", "metadata": {}, "outputs": [], "source": [ "import modin.pandas as pd\n", "import modin.experimental.spreadsheet as mss\n", "import modin.config as modin_cfg\n", "import unidist.config as unidist_cfg\n", "modin_cfg.Engine.put(\"unidist\")\n", "unidist_cfg.Backend.put(\"mpi\")\n", "\n", "s3_path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\"\n", "modin_df = pd.read_csv(s3_path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3, nrows=1000)" ] }, { "cell_type": "code", "execution_count": null, "id": "145e7bbe", "metadata": {}, "outputs": [], "source": [ "spreadsheet = mss.from_dataframe(modin_df)\n", "spreadsheet" ] }, { "cell_type": "markdown", "id": "3c18b7f2", "metadata": {}, "source": [ "### Thank you for participating!" 
def custom_make_ipkernel_cmd(*args, **kwargs):
    """
    Build the Popen command list for an IPython kernel launched through MPI.

    Parameters
    ----------
    *args : iterable
        Positional arguments forwarded unchanged to `default_make_ipkernel_cmd`.
    **kwargs : dict
        Keyword arguments forwarded unchanged to `default_make_ipkernel_cmd`.

    Returns
    -------
    list
        The command produced by `default_make_ipkernel_cmd`, prefixed with
        ``mpiexec -n 1``.

    Notes
    -----
    The parameters of the function should be kept in sync with the ones
    of the original function.
    """
    # The MPI launcher arguments come first so the kernel command is run by mpiexec.
    return ["mpiexec", "-n", "1"] + default_make_ipkernel_cmd(*args, **kwargs)
# in this notebook user should replace 'import pandas as pd' with
# 'import modin.pandas as pd' to make notebook work
def test_exercise_1():
    """Execute a copy of exercise 1 whose pandas import is switched to ``modin.pandas``."""
    original_path = os.path.join(local_notebooks_dir, "exercise_1.ipynb")
    modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_1_test.ipynb")

    notebook = nbformat.read(original_path, as_version=nbformat.NO_CONVERT)
    _replace_str(notebook, "import pandas as pd", "import modin.pandas as pd")

    # The patched copy is written next to the original and then executed.
    nbformat.write(notebook, modified_notebook_path)
    _execute_notebook(modified_notebook_path)


# this notebook works "as is" but for testing purposes we can use smaller dataset
def test_exercise_2():
    """Execute a copy of exercise 2 that reads the test dataset instead of the S3 file."""
    original_path = os.path.join(local_notebooks_dir, "exercise_2.ipynb")
    modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb")

    notebook = nbformat.read(original_path, as_version=nbformat.NO_CONVERT)

    # Point ``path`` at the test dataset and append the download snippet.
    local_dataset_cell = f'path = "{test_dataset_path}"\n' + download_taxi_dataset
    _replace_str(
        notebook,
        'path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"',
        local_dataset_cell,
    )

    nbformat.write(notebook, modified_notebook_path)
    _execute_notebook(modified_notebook_path)
# this notebook works "as is" but for testing purposes we can use smaller dataset
def test_exercise_4():
    """Execute a copy of exercise 4 that reads the test dataset instead of the S3 file."""
    original_path = os.path.join(local_notebooks_dir, "exercise_4.ipynb")
    modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_4_test.ipynb")

    notebook = nbformat.read(original_path, as_version=nbformat.NO_CONVERT)

    # Point ``s3_path`` at the test dataset and append the download snippet.
    s3_path_cell = f's3_path = "{test_dataset_path}"\n' + download_taxi_dataset
    _replace_str(
        notebook,
        's3_path = "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv"',
        s3_path_cell,
    )

    nbformat.write(notebook, modified_notebook_path)
    _execute_notebook(modified_notebook_path)
import nbformat from nbconvert.preprocessors import ExecutePreprocessor test_dataset_path = "taxi.csv" download_taxi_dataset = f"""import os import urllib.request if not os.path.exists("{test_dataset_path}"): url_path = "https://modin-datasets.intel.com/testing/yellow_tripdata_2015-01.csv" urllib.request.urlretrieve(url_path, "{test_dataset_path}") """ # Default kernel name for ``ExecutePreprocessor`` to be created _default_kernel_name = "python3" def set_kernel(kernel_name): """ Set custom kernel for ``ExecutePreprocessor`` to be created. Parameters ---------- kernel_name : str Kernel name. """ global _default_kernel_name _default_kernel_name = kernel_name def make_execute_preprocessor(): """ Make ``ExecutePreprocessor`` with the `_default_kernel_name`. Returns ------- nbconvert.preprocessors.ExecutePreprocessor Execute processor entity. Notes ----- Note that `_default_kernel_name` can be changed for the concrete executions (e.g., ``PandasOnUnidist`` with MPI backend). """ return ExecutePreprocessor(timeout=600, kernel_name=_default_kernel_name) def _execute_notebook(notebook): """ Execute a jupyter notebook. Parameters ---------- notebook : file-like or str File-like object or path to the notebook to execute. """ nb = nbformat.read(notebook, as_version=nbformat.NO_CONVERT) ep = make_execute_preprocessor() ep.preprocess(nb) def _find_code_cell_idx(nb, identifier): """ Find code cell index by provided ``identifier``. Parameters ---------- nb : dict Dictionary representation of the notebook to look for. identifier : str Unique string which target code cell should contain. Returns ------- int Code cell index by provided ``identifier``. Notes ----- Assertion will be raised if ``identifier`` is found in several code cells or isn't found at all. 
""" import_cell_idx = [ idx for idx, cell in enumerate(nb["cells"]) if cell["cell_type"] == "code" and identifier in cell["source"] ] assert len(import_cell_idx) == 1 return import_cell_idx[0] def _replace_str(nb, original_str, str_to_replace): """ Replace ``original_str`` with ``str_to_replace`` in the provided notebook. Parameters ---------- nb : dict Dictionary representation of the notebook which requires replacement. original_str : str Original string which should be replaced. str_to_replace : str String to replace original string. Notes ----- Assertion will be raised if ``original_str`` is found in several code cells or isn't found at all. """ import_cell_idx = _find_code_cell_idx(nb, original_str) nb["cells"][import_cell_idx]["source"] = nb["cells"][import_cell_idx][ "source" ].replace(original_str, str_to_replace) ================================================ FILE: modin/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import warnings from typing import Any, Optional, Tuple, Type, Union from . 
def custom_formatwarning(
    message: Union[Warning, str],
    category: Type[Warning],
    filename: str,
    lineno: int,
    line: Optional[str] = None,
) -> str:
    """
    Format a warning as ``<category name>: <message>`` followed by a newline.

    The ``filename``, ``lineno`` and ``line`` arguments are accepted but not used.
    """
    # ignore everything except the message
    return f"{category.__name__}: {message}\n"


warnings.formatwarning = custom_formatwarning
# Filter numpy version warnings because they are not relevant
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="Large object of size")


def set_execution(engine: Any = None, storage_format: Any = None) -> Tuple[Any, Any]:
    """
    Set the _pair_ of execution engine and storage format simultaneously.

    This is needed because there might be cases where switching one by one would be
    impossible, as not all pairs of values are meaningful.

    The method returns the pair of old values, so it is easy to switch back.
    An element of the pair is ``None`` when the corresponding argument was ``None``.
    """
    from .config import Backend, Engine, Execution, StorageFormat

    previous_engine = None
    previous_storage_format = None

    # defer callbacks until both entities are set
    if engine is not None:
        previous_engine = Engine._put_nocallback(engine)
    if storage_format is not None:
        previous_storage_format = StorageFormat._put_nocallback(storage_format)

    # execute callbacks if something was changed
    if previous_engine is not None:
        Engine._check_callbacks(previous_engine)
    if previous_storage_format is not None:
        StorageFormat._check_callbacks(previous_storage_format)

    # Update the backend to the one matching the current engine and storage format.
    previous_backend = Backend.get()
    current_execution = Execution(
        engine=Engine.get(), storage_format=StorageFormat.get()
    )
    Backend._put_nocallback(Backend.get_backend_for_execution(current_execution))
    Backend._check_callbacks(previous_backend)

    return previous_engine, previous_storage_format
def main() -> None:
    """
    Parse the ``python -m modin`` command line and run the requested action.

    The only option is ``--versions``, which calls ``modin.utils.show_versions``.
    Without it the function returns after parsing.
    """
    cli = argparse.ArgumentParser(
        "python -m modin",
        description="Drop-in pandas replacement; refer to https://modin.readthedocs.io/ for details.",
    )
    cli.add_argument(
        "--versions",
        action="store_true",
        default=False,
        help="Show versions of all known components",
    )
    options = cli.parse_args()
    if not options.versions:
        return

    # Imported only when the flag is given.
    from modin.utils import show_versions

    show_versions()
def get_keywords() -> Dict[str, str]:
    """Return the git-archive keyword strings used to look up version information."""
    # these strings will be replaced by git during git-archive.
    # setup.py/versioneer.py will grep for the variable names, so they must
    # each be defined on a line of their own. _version.py will just call
    # get_keywords().
    git_refnames = "$Format:%d$"
    git_full = "$Format:%H$"
    git_date = "$Format:%ci$"
    return {"refnames": git_refnames, "full": git_full, "date": git_date}


class VersioneerConfig:
    """Container for Versioneer configuration parameters."""

    VCS: str
    style: str
    tag_prefix: str
    parentdir_prefix: str
    versionfile_source: str
    verbose: bool


def get_config() -> VersioneerConfig:
    """Create, populate and return the VersioneerConfig() object."""
    # these strings are filled in when 'setup.py versioneer' creates
    # _version.py
    config = VersioneerConfig()
    config.VCS = "git"
    config.style = "pep440"
    config.tag_prefix = ""
    config.parentdir_prefix = "modin-"
    config.versionfile_source = "modin/_version.py"
    config.verbose = False
    return config


class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""


LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}


def register_vcs_handler(vcs: str, method: str) -> Callable:  # decorator
    """Create decorator to mark a method as the handler of a VCS."""

    def decorate(f: Callable) -> Callable:
        """Store f in HANDLERS[vcs][method]."""
        HANDLERS.setdefault(vcs, {})[method] = f
        return f

    return decorate


def run_command(
    commands: List[str],
    args: List[str],
    cwd: Optional[str] = None,
    verbose: bool = False,
    hide_stderr: bool = False,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
    """
    Call the given command(s).

    Each entry of ``commands`` is tried in turn with ``args`` appended, and the
    first one that can be started is used.

    Returns ``(None, None)`` when no command could be started,
    ``(None, returncode)`` when the process exits with a non-zero code, and
    ``(stdout, returncode)`` with the stripped, decoded output otherwise.
    """
    assert isinstance(commands, list)
    process = None

    popen_kwargs: Dict[str, Any] = {}
    if sys.platform == "win32":
        # This hides the console window if pythonw.exe is used
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        popen_kwargs["startupinfo"] = startupinfo

    stderr_target = subprocess.PIPE if hide_stderr else None
    for command in commands:
        argv = [command] + args
        dispcmd = str(argv)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen(
                argv,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=stderr_target,
                **popen_kwargs,
            )
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                # This executable does not exist; try the next candidate.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(exc)
            return None, None
        break
    else:
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    stdout = process.communicate()[0].strip().decode()
    returncode = process.returncode
    if returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, returncode
    return stdout, returncode
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
    """
    Extract version information from the given file.

    The file is scanned line by line for the ``git_refnames``, ``git_full`` and
    ``git_date`` assignments. The double-quoted value of each one found is stored
    under ``"refnames"``, ``"full"`` and ``"date"`` respectively. A file that
    cannot be read yields whatever was collected so far.
    """
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    assignments = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    keywords: Dict[str, str] = {}
    try:
        with open(versionfile_abs, "r") as fobj:
            for line in fobj:
                stripped = line.strip()
                for prefix, key in assignments:
                    if not stripped.startswith(prefix):
                        continue
                    mo = re.search(r'=\s*"(.*)"', line)
                    if mo:
                        keywords[key] = mo.group(1)
    except OSError:
        pass
    return keywords


@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
    keywords: Dict[str, str],
    tag_prefix: str,
    verbose: bool,
) -> Dict[str, Any]:
    """
    Get version information from git keywords.

    Raises ``NotThisMethod`` when ``keywords`` has no ``"refnames"`` entry or when
    the refnames are still the unexpanded ``$Format`` placeholder.
    """
    if "refnames" not in keywords:
        raise NotThisMethod("Short version file found")

    date = keywords.get("date")
    if date is not None:
        # Use only the last line. Previous lines may contain GPG signature
        # information.
        last_line = date.splitlines()[-1]

        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = last_line.strip().replace(" ", "T", 1).replace(" ", "", 1)

    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")

    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {r for r in refs if re.search(r"\d", r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))

    full_revision = keywords["full"]
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if not ref.startswith(tag_prefix):
            continue
        candidate = ref[len(tag_prefix) :]
        # Filter out refs that exactly match prefix or that don't start
        # with a number once the prefix is stripped (mostly a concern
        # when prefix is '')
        if not re.match(r"\d", candidate):
            continue
        if verbose:
            print("picking %s" % candidate)
        return {
            "version": candidate,
            "full-revisionid": full_revision.strip(),
            "dirty": False,
            "error": None,
            "date": date,
        }

    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {
        "version": "0+unknown",
        "full-revisionid": full_revision.strip(),
        "dirty": False,
        "error": "no suitable tags",
        "date": None,
    }
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner( GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. 
def plus_or_dot(pieces: Dict[str, Any]) -> str:
    """
    Return a + if we don't already have one, else return a .

    A missing ``"closest-tag"`` entry and one stored as ``None`` are both
    treated as "no plus sign yet", so ``"+"`` is returned for them.
    """
    # ``pieces.get("closest-tag", "")`` alone would hand back ``None`` when the
    # key is present with a ``None`` value, and ``"+" in None`` raises TypeError.
    closest_tag = pieces.get("closest-tag") or ""
    return "." if "+" in closest_tag else "+"


def render_pep440(pieces: Dict[str, Any]) -> str:
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_branch(pieces: Dict[str, Any]) -> str:
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .

    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0"
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns the release segments before the post-release and the
    post-release version number (or None if no post-release segment is
    present). A bare ".post" suffix without a number counts as post-release 0.
    """
    vc = str.split(ver, ".post")
    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        parts = ["0.post%d" % pieces["distance"]]
        if pieces["dirty"]:
            parts.append(".dev0")
        parts.append("+g%s" % pieces["short"])
        return "".join(parts)

    parts = [tag]
    if pieces["distance"] or pieces["dirty"]:
        parts.append(".post%d" % pieces["distance"])
        if pieces["dirty"]:
            parts.append(".dev0")
        parts.append(plus_or_dot(pieces))
        parts.append("g%s" % pieces["short"])
    return "".join(parts)


def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .

    The ".dev0" means not master branch.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        parts = ["0.post%d" % pieces["distance"]]
        if pieces["branch"] != "master":
            parts.append(".dev0")
        parts.append("+g%s" % pieces["short"])
        if pieces["dirty"]:
            parts.append(".dirty")
        return "".join(parts)

    parts = [tag]
    if pieces["distance"] or pieces["dirty"]:
        parts.append(".post%d" % pieces["distance"])
        if pieces["branch"] != "master":
            parts.append(".dev0")
        parts.append(plus_or_dot(pieces))
        parts.append("g%s" % pieces["short"])
        if pieces["dirty"]:
            parts.append(".dirty")
    return "".join(parts)


def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        parts = ["0.post%d" % pieces["distance"]]
        if pieces["dirty"]:
            parts.append(".dev0")
        return "".join(parts)

    parts = [tag]
    if pieces["distance"] or pieces["dirty"]:
        parts.append(".post%d" % pieces["distance"])
        if pieces["dirty"]:
            parts.append(".dev0")
    return "".join(parts)


def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        parts = [tag]
        if pieces["distance"]:
            parts.append("-%d-g%s" % (pieces["distance"], pieces["short"]))
    else:
        # exception #1
        parts = [pieces["short"]]
    if pieces["dirty"]:
        parts.append("-dirty")
    return "".join(parts)
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """
    Render the given version pieces into the requested style.

    When ``pieces["error"]`` is set, an "unknown" version record carrying that
    error is returned and no rendering is attempted. An empty style or
    ``"default"`` selects ``"pep440"``. An unrecognised style raises ``ValueError``.
    """
    failure = pieces["error"]
    if failure:
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": failure,
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    if style == "pep440":
        version = render_pep440(pieces)
    elif style == "pep440-branch":
        version = render_pep440_branch(pieces)
    elif style == "pep440-pre":
        version = render_pep440_pre(pieces)
    elif style == "pep440-post":
        version = render_pep440_post(pieces)
    elif style == "pep440-post-branch":
        version = render_pep440_post_branch(pieces)
    elif style == "pep440-old":
        version = render_pep440_old(pieces)
    elif style == "git-describe":
        version = render_git_describe(pieces)
    elif style == "git-describe-long":
        version = render_git_describe_long(pieces)
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": version,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }


def _unknown_version(error: str) -> Dict[str, Any]:
    """Build the "0+unknown" version record carrying the given error text."""
    return {
        "version": "0+unknown",
        "full-revisionid": None,
        "dirty": None,
        "error": error,
        "date": None,
    }


def get_versions() -> Dict[str, Any]:
    """
    Get version information or return default if unable to do so.

    The sources are tried in order: expanded git-archive keywords, the git
    checkout containing this file, then the name of a parent directory.
    """
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
    # case we can only use expanded keywords.
    config = get_config()
    verbose = config.verbose

    try:
        return git_versions_from_keywords(get_keywords(), config.tag_prefix, verbose)
    except NotThisMethod:
        pass

    try:
        root = os.path.realpath(__file__)
        # versionfile_source is the relative path from the top of the source
        # tree (where the .git directory might live) to this file. Invert
        # this to find the root from __file__.
        for _ in config.versionfile_source.split("/"):
            root = os.path.dirname(root)
    except NameError:
        return _unknown_version("unable to find root of source tree")

    try:
        vcs_pieces = git_pieces_from_vcs(config.tag_prefix, root, verbose)
        return render(vcs_pieces, config.style)
    except NotThisMethod:
        pass

    try:
        if config.parentdir_prefix:
            return versions_from_parentdir(config.parentdir_prefix, root, verbose)
    except NotThisMethod:
        pass

    return _unknown_version("unable to compute version")
"""Module houses config entities which can be used for Modin behavior tuning.""" from modin.config.envvars import ( AsvDataSizeConfig, AsvImplementation, AsyncReadMode, AutoSwitchBackend, Backend, BackendJoinConsiderAllBackends, BackendMergeCastInPlace, BenchmarkMode, CIAWSAccessKeyID, CIAWSSecretAccessKey, CpuCount, DaskThreadsPerWorker, DocModule, DynamicPartitioning, Engine, EnvironmentVariable, Execution, GithubCI, GpuCount, IsDebug, IsExperimental, IsRayCluster, LazyExecution, LogFileSize, LogMemoryInterval, LogMode, Memory, MetricsMode, MinColumnPartitionSize, MinPartitionSize, MinRowPartitionSize, ModinNumpy, NativePandasDeepCopy, NativePandasMaxRows, NativePandasTransferThreshold, NPartitions, PersistentPickle, ProgressBar, RangePartitioning, RayInitCustomResources, RayRedisAddress, RayRedisPassword, RayTaskCustomResources, ReadSqlEngine, ShowBackendSwitchProgress, StorageFormat, TestDatasetSize, TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, ) from modin.config.pubsub import Parameter, ValueSource, context __all__ = [ "EnvironmentVariable", "Parameter", "ValueSource", "context", # General settings "IsDebug", "Engine", "StorageFormat", "CpuCount", "GpuCount", "Memory", "Backend", "BackendJoinConsiderAllBackends", "BackendMergeCastInPlace", "Execution", "AutoSwitchBackend", "ShowBackendSwitchProgress", # Ray specific "IsRayCluster", "RayRedisAddress", "RayRedisPassword", "RayInitCustomResources", "RayTaskCustomResources", "LazyExecution", # Dask specific "DaskThreadsPerWorker", # Native Pandas Specific "NativePandasMaxRows", "NativePandasTransferThreshold", "NativePandasDeepCopy", # Partitioning "NPartitions", "MinPartitionSize", "MinRowPartitionSize", "MinColumnPartitionSize", # ASV specific "TestDatasetSize", "AsvImplementation", "AsvDataSizeConfig", # Specific features "ProgressBar", "BenchmarkMode", "PersistentPickle", "ModinNumpy", "RangePartitioning", "AsyncReadMode", "ReadSqlEngine", "IsExperimental", "DynamicPartitioning", # For tests 
def export_config_help(filename: str) -> None:
    """
    Export all configs help messages to the CSV file.

    One row is written for every non-abstract ``Parameter`` subclass listed
    in ``modin.config.__all__``, ordered by the sorted names of that list.

    Parameters
    ----------
    filename : str
        Name of the file to export configs data.
    """
    # Defaults that are random or depend on the exporting machine are
    # documented with a fixed description instead of a computed value.
    described_defaults = {
        "RayRedisPassword": "random string",
        "CpuCount": "multiprocessing.cpu_count()",
        "NPartitions": "equals to MODIN_CPUS env",
    }

    configs_data = []
    for objname in sorted(cfg.__all__):
        obj = getattr(cfg, objname)
        if not (
            isinstance(obj, type)
            and issubclass(obj, cfg.Parameter)
            and not obj.is_abstract
        ):
            continue

        config_name = obj.__name__
        # Compute the real default only when no fixed description exists.
        # `dict.get(key, obj._get_default())` would evaluate the default
        # eagerly, even for the configs whose value is replaced anyway.
        if config_name in described_defaults:
            default_value = described_defaults[config_name]
        else:
            default_value = obj._get_default()

        configs_data.append(
            {
                "Config Name": config_name,
                "Env. Variable Name": getattr(
                    obj, "varname", "not backed by environment"
                ),
                "Default Value": default_value,
                # `Notes` `-` underlining can't be correctly parsed inside csv table by sphinx
                "Description": dedent(obj.__doc__ or "").replace(
                    "Notes\n-----", "Notes:\n"
                ),
                "Options": obj.choices,
            }
        )

    pandas.DataFrame(
        configs_data,
        columns=[
            "Config Name",
            "Env. Variable Name",
            "Default Value",
            "Description",
            "Options",
        ],
    ).to_csv(filename, index=False)
class EnvironmentVariable(Parameter, type=str, abstract=True):
    """Base class for environment variables-based configuration."""

    varname: Optional[str] = None

    @classmethod
    def _get_value_from_config(cls) -> Any:
        """
        Read the value from environment variable.

        Returns
        -------
        Any
            Config raw value if it's set, otherwise `_UNSET`.
        """
        name = cls.varname
        if name is None:
            raise TypeError("varname should not be None")

        raw = os.environ.get(name)
        if raw is None:
            # the variable is not present in the environment
            return _UNSET

        type_params = _TYPE_PARAMS[cls.type]
        if type_params.verify(raw):
            return type_params.decode(raw)
        # TODO: use and test a better error message, like "Invalid value
        # for {cls.varname}: {raw}"
        raise ValueError(f"Unsupported raw value: {raw}")

    @classmethod
    def get_help(cls) -> str:
        """
        Build the user-facing help text for this config.

        The text names the environment variable, shows the class docstring
        (or "Unknown" when there is none), the help of the value type and,
        if the config restricts its values, the list of choices.

        Returns
        -------
        str
        """
        description = dedent(cls.__doc__ or "Unknown").strip()
        text = f"{cls.varname}: {description}\n\tProvide {_TYPE_PARAMS[cls.type].help}"
        if cls.choices:
            examples = ", ".join(str(c) for c in cls.choices)
            text += f" (valid examples are: {examples})"
        return text
) # further we assume that there are only two valid sources for the variables: 'GOT_FROM_CFG' and 'DEFAULT', # as otherwise we wouldn't ended-up in this branch at all, because all other ways of setting a value # changes the '._value' attribute from '_UNSET' to something meaningful from modin.error_message import ErrorMessage if cls._value_source == ValueSource.GOT_FROM_CFG_SOURCE: ErrorMessage.catch_bugs_and_request_email( failure_condition=sibling._value_source != ValueSource.DEFAULT ) sibling._value = cls._value sibling._value_source = ValueSource.GOT_FROM_CFG_SOURCE elif sibling._value_source == ValueSource.GOT_FROM_CFG_SOURCE: ErrorMessage.catch_bugs_and_request_email( failure_condition=cls._value_source != ValueSource.DEFAULT ) cls._value = sibling._value cls._value_source = ValueSource.GOT_FROM_CFG_SOURCE else: ErrorMessage.catch_bugs_and_request_email( failure_condition=cls._value_source != ValueSource.DEFAULT or sibling._value_source != ValueSource.DEFAULT ) # propagating 'cls' default value to the sibling sibling._value = cls._value return super().get() @classmethod def put(cls, value: Any) -> None: """ Set a new value to this parameter as well as to its sibling. 
class Engine(
    EnvironmentVariableDisallowingExecutionAndBackendBothSet,
    type=str,
):
    """Distribution engine to run queries by."""

    varname = "MODIN_ENGINE"
    choices = ("Ray", "Dask", "Python", "Unidist", "Native")

    # engines that don't require initialization, useful for unit tests
    NOINIT_ENGINES = {"Python", "Native"}

    has_custom_engine = False

    @classmethod
    def _get_default(cls) -> str:
        """
        Choose the default engine from the packages that can be imported.

        Ray is probed first, then Dask (together with ``distributed``), then
        unidist. The first one that imports wins, provided its version is
        recent enough.

        Returns
        -------
        str

        Raises
        ------
        ImportError
            If an importable engine is older than the minimum supported
            version, or if no engine can be imported at all.
        """
        from modin.utils import MIN_DASK_VERSION, MIN_RAY_VERSION, MIN_UNIDIST_VERSION

        # If there's a custom engine, we don't need to check for any engine
        # dependencies. Return the default "Python" engine.
        if IsDebug.get() or cls.has_custom_engine:
            return "Python"

        try:
            import ray
        except ImportError:
            pass
        else:
            if version.parse(ray.__version__) >= MIN_RAY_VERSION:
                return "Ray"
            raise ImportError(
                f'Please `pip install "modin[ray]"` to install compatible Ray version (>={MIN_RAY_VERSION}).'
            )

        try:
            import dask
            import distributed
        except ImportError:
            pass
        else:
            if (
                version.parse(dask.__version__) >= MIN_DASK_VERSION
                and version.parse(distributed.__version__) >= MIN_DASK_VERSION
            ):
                return "Dask"
            raise ImportError(
                f'Please `pip install "modin[dask]"` to install compatible Dask version (>={MIN_DASK_VERSION}).'
            )

        try:
            import unidist
        except ImportError:
            pass
        else:
            if version.parse(unidist.__version__) >= MIN_UNIDIST_VERSION:
                return "Unidist"
            raise ImportError(
                f'Please `pip install "unidist[mpi]"` to install compatible unidist on MPI version (>={MIN_UNIDIST_VERSION}).'
            )

        raise ImportError(
            "Please refer to installation documentation page to install an engine"
        )

    @classmethod
    @doc(Parameter.add_option.__doc__)
    def add_option(cls, choice: Any) -> Any:
        added = super().add_option(choice)
        # a user-provided engine is treated as one that needs no initialization
        cls.NOINIT_ENGINES.add(added)
        cls.has_custom_engine = True
        return added

    @classmethod
    def put(cls, value: str) -> None:
        """
        Set the engine by switching to the backend that matches it.

        Parameters
        ----------
        value : str
            Engine value to set.
        """
        normalized = cls.normalize(value)
        target = Execution(engine=normalized, storage_format=StorageFormat.get())
        # Backend.put() will set Engine.
        Backend.put(Backend.get_backend_for_execution(target))

    @classmethod
    def get(cls) -> str:
        """
        Return the engine, resolving and caching it on first use.

        Returns
        -------
        str
            Engine value.
        """
        # We have to override get() because Engine may need to get its value
        # from the OS's environment variables for Backend or Engine.
        cls._warn_if_deprecated()

        # A value that was already resolved or set takes precedence.
        if cls._value is not _UNSET:
            return cls._value

        engine_from_config = cls._get_value_from_config()
        backend_from_config = Backend._get_value_from_config()

        # Precedence: the configured Engine, then the engine implied by the
        # configured Backend, then the default Engine value.
        if engine_from_config is not _UNSET:
            cls._value = engine_from_config
        elif backend_from_config is not _UNSET:
            cls._value = Backend.get_execution_for_backend(backend_from_config).engine
        else:
            cls._value = cls._get_default()
        return cls._value
class Backend(EnvironmentVariableDisallowingExecutionAndBackendBothSet, type=str):
    """
    An alias for execution, i.e. the combination of StorageFormat and Engine.

    Setting backend may change StorageFormat and/or Engine to the corresponding
    respective values, and setting Engine or StorageFormat may change Backend.

    Modin's built-in backends include:

        - "Ray" <-> (StorageFormat="Pandas", Engine="Ray")
        - "Dask" <-> (StorageFormat="Pandas", Engine="Dask")
        - "Python_Test" <-> (StorageFormat="Pandas", Engine="Python")
            - This execution mode is meant for testing only.
        - "Unidist" <-> (StorageFormat="Pandas", Engine="Unidist")
        - "Pandas" <-> (StorageFormat="Native", Engine="Native")
    """

    # Two-way registry between backend names and executions. Both mappings
    # are only filled in by `register_backend`.
    _BACKEND_TO_EXECUTION: dict[str, Execution] = {}
    _EXECUTION_TO_BACKEND: dict[Execution, str] = {}
    varname: str = "MODIN_BACKEND"
    # `choices` holds the *active* backends, which may be a subset of the
    # registered ones (see `set_active_backends` and `activate`).
    choices: tuple[str, ...] = ("Ray", "Dask", "Python_Test", "Unidist", "Pandas")

    @classmethod
    def put(cls, value: str) -> None:
        """
        Set the backend value.

        The backend is resolved to its execution, which is then applied
        through ``set_execution``.

        Parameters
        ----------
        value : str
            Backend value to set.
        """
        execution = cls.get_execution_for_backend(value)
        set_execution(execution.engine, execution.storage_format)

    @classmethod
    def _get_default(cls) -> str:
        """
        Get the default backend value.

        Returns
        -------
        str
            Default backend value.
        """
        return cls._EXECUTION_TO_BACKEND[
            Execution(StorageFormat._get_default(), Engine._get_default())
        ]

    @classmethod
    def register_backend(cls: type["Backend"], name: str, execution: Execution) -> None:
        """
        Register a new backend.

        Parameters
        ----------
        name : str
            Backend name.
        execution : Execution
            Execution that corresponds to the backend.

        Raises
        ------
        ValueError
            If the name or the execution is already registered. Nothing is
            modified in that case.
        """
        name = cls.normalize(name)
        # Validate before touching any state, so that a rejected registration
        # does not leave `name` behind in `choices`.
        if name in cls._BACKEND_TO_EXECUTION:
            raise ValueError(
                f"Backend '{name}' is already registered with the execution {cls._BACKEND_TO_EXECUTION[name]}."
            )
        if execution in cls._EXECUTION_TO_BACKEND:
            raise ValueError(
                f"{execution} is already registered with the backend {cls._EXECUTION_TO_BACKEND[execution]}."
            )
        # `add_option` on this class always raises, so use the parent's.
        super().add_option(name)
        cls._BACKEND_TO_EXECUTION[name] = execution
        cls._EXECUTION_TO_BACKEND[execution] = name

    @classmethod
    def add_option(cls, choice: str) -> NoReturn:
        """
        Raise an exception for trying to add an option to Backend directly.

        Parameters
        ----------
        choice : str
            Choice to add. Unused.

        Raises
        ------
        ValueError
            Always.
        """
        raise ValueError(
            "Cannot add an option to Backend directly. Use Backend.register_backend instead."
        )

    @classmethod
    def set_active_backends(cls, new_choices: tuple) -> None:
        """
        Set the active backends available for manual and automatic switching.

        Other backends may have been registered, and those backends remain
        registered, but the set of engines that can be used is dynamically
        modified.

        Parameters
        ----------
        new_choices : tuple
            Backends to make active. They replace the current active set.

        Raises
        ------
        ValueError
            Raises a ValueError when the set of new_choices are not already registered
        """
        registered_backends = cls._BACKEND_TO_EXECUTION
        for i in new_choices:
            if i not in registered_backends:
                raise ValueError(
                    f"Active backend choices {new_choices} are not all registered."
                )
        cls.choices = new_choices

    @classmethod
    def activate(cls, backend: str) -> None:
        """
        Activate a backend that was previously registered.

        This is a no-op if the backend is already active.

        Parameters
        ----------
        backend : str
            Backend to activate.

        Raises
        ------
        ValueError
            Raises a ValueError if backend was not previously registered.
        """
        if backend not in cls._BACKEND_TO_EXECUTION:
            raise ValueError(f"Unknown backend '{backend}' is not registered.")
        # Keep the documented no-op: do not append a duplicate entry when the
        # backend is already among the active choices.
        if backend not in cls.choices:
            cls.choices = (*cls.choices, backend)

    @classmethod
    def get_active_backends(cls) -> tuple[str, ...]:
        """
        Get the active backends available for manual and automatic switching.

        Returns
        -------
        tuple[str, ...]
            returns the active set of backends for switching
        """
        return cls.choices

    @classmethod
    def get_backend_for_execution(cls, execution: Execution) -> str:
        """
        Get the backend for the execution.

        Parameters
        ----------
        execution : Execution
            Execution to get the backend for.

        Returns
        -------
        str
            Backend for the execution.

        Raises
        ------
        ValueError
            If no backend is registered for the execution.
        """
        if execution not in cls._EXECUTION_TO_BACKEND:
            raise ValueError(
                f"{execution} has no known backend. Please register a "
                + "backend for it with Backend.register_backend()"
            )
        return cls._EXECUTION_TO_BACKEND[execution]

    @classmethod
    def get_execution_for_backend(cls, backend: str) -> Execution:
        """
        Get the execution for the given backend.

        Parameters
        ----------
        backend : str
            Backend to get the execution for.

        Returns
        -------
        execution : Execution
            The execution for the given backend

        Raises
        ------
        TypeError
            If `backend` is not a string.
        ValueError
            If the backend is unknown, registered but not active, or has no
            registered execution.
        """
        if not isinstance(backend, str):
            raise TypeError(
                "Backend value should be a string, but instead it is "
                + f"{repr(backend)} of type {type(backend)}."
            )
        normalized_value = cls.normalize(backend)
        if normalized_value not in cls.choices:
            # Tell apart "registered but deactivated" from "never registered"
            # to give an actionable message.
            if normalized_value in cls._BACKEND_TO_EXECUTION:
                raise ValueError(
                    f"Backend '{backend}' is not currently active. Activate it first with Backend.activate('{backend}')."
                )
            backend_choice_string = ", ".join(f"'{choice}'" for choice in cls.choices)
            raise ValueError(
                f"Unknown backend '{backend}'. Available backends are: "
                + backend_choice_string
            )
        if normalized_value not in cls._BACKEND_TO_EXECUTION:
            raise ValueError(
                f"Backend '{backend}' has no known execution. Please "
                + "register an execution for it with Backend.register_backend()."
            )
        return cls._BACKEND_TO_EXECUTION[normalized_value]

    @classmethod
    def get(cls) -> str:
        """
        Get the backend.

        Returns
        -------
        str
            Backend.
        """
        # We have to override get() because Backend may need to get its value
        # from the OS's environment variables for Backend or Engine.
        cls._warn_if_deprecated()

        # First, check if we've already set the Backend value.
        if cls._value is not _UNSET:
            return cls._value

        backend_config_value = Backend._get_value_from_config()

        # If Backend is in the OS's configuration, use the configured Backend
        # value. Otherwise, we need to figure out the Backend value based on
        # the Engine and StorageFormat values.
        cls._value = (
            backend_config_value
            if backend_config_value is not _UNSET
            else cls.get_backend_for_execution(
                Execution(storage_format=StorageFormat.get(), engine=Engine.get())
            )
        )
        return cls._value
class RayRedisPassword(EnvironmentVariable, type=ExactStr):
    """What password to use for connecting to Redis."""

    # Name of the environment variable this config is read from.
    varname = "MODIN_REDIS_PASSWORD"
    # Evaluated once, when this class body runs: a random token of 32 bytes
    # rendered as 64 hexadecimal characters. The default is therefore not a
    # fixed value; it differs each time this module is loaded afresh.
    default = secrets.token_hex(32)
class CpuCount(EnvironmentVariable, type=int):
    """How many CPU cores to use during initialization of the Modin engine."""

    varname = "MODIN_CPUS"

    @classmethod
    def _put(cls, value: int) -> None:
        """
        Store `value` unless a value other than the default is already in place.

        Parameters
        ----------
        value : int
            Config value to set.

        Notes
        -----
        Internal hook for setting CpuCount from cluster resources; it is not
        meant to be called by a user.
        """
        if cls.get_value_source() != ValueSource.DEFAULT:
            # the value did not come from the default: leave it alone
            return
        cls.put(value)

    @classmethod
    def _get_default(cls) -> int:
        """
        Return the CPU count reported by ``multiprocessing``.

        Returns
        -------
        int
        """
        from multiprocessing import cpu_count

        return cpu_count()

    @classmethod
    def get(cls) -> int:
        """
        Return ``CpuCount``, rejecting non-positive values.

        Returns
        -------
        int

        Raises
        ------
        ValueError
            If the resolved value is not greater than zero.
        """
        resolved = super().get()
        if resolved <= 0:
            message = f"`CpuCount` should be > 0; current value: {resolved}"
            raise ValueError(message)
        return resolved
class NPartitions(EnvironmentVariable, type=int):
    """How many partitions to use for a Modin DataFrame (along each axis)."""

    varname = "MODIN_NPARTITIONS"

    @classmethod
    def _put(cls, value: int) -> None:
        """
        Store `value` unless a value other than the default is already in place.

        Parameters
        ----------
        value : int
            Config value to set.

        Notes
        -----
        Internal hook for setting NPartitions from cluster resources; it is
        not meant to be called by a user.
        """
        if cls.get_value_source() != ValueSource.DEFAULT:
            # the value did not come from the default: leave it alone
            return
        cls.put(value)

    @classmethod
    def _get_default(cls) -> int:
        """
        Return the current ``CpuCount`` value as the default.

        Returns
        -------
        int
        """
        return CpuCount.get()

    @classmethod
    def get(cls) -> int:
        """
        Return ``NPartitions``, rejecting non-positive values.

        Returns
        -------
        int

        Raises
        ------
        ValueError
            If the resolved value is not greater than zero.
        """
        resolved = super().get()
        if resolved <= 0:
            message = f"`NPartitions` should be > 0; current value: {resolved}"
            raise ValueError(message)
        return resolved
class ProgressBar(EnvironmentVariable, type=bool):
    """Whether or not to show the progress bar."""

    varname = "MODIN_PROGRESS_BAR"
    default = False

    @classmethod
    def enable(cls) -> None:
        """Turn the ``ProgressBar`` feature on."""
        cls.put(True)

    @classmethod
    def disable(cls) -> None:
        """Turn the ``ProgressBar`` feature off."""
        cls.put(False)

    @classmethod
    def put(cls, value: bool) -> None:
        """
        Set ``ProgressBar``, refusing to enable it while ``BenchmarkMode`` is on.

        Parameters
        ----------
        value : bool
            Config value to set.

        Raises
        ------
        ValueError
            If `value` is truthy and ``BenchmarkMode`` is enabled.
        """
        if value:
            # BenchmarkMode is only consulted when the bar is being enabled
            if BenchmarkMode.get():
                raise ValueError("ProgressBar isn't compatible with BenchmarkMode")
        super().put(value)
class LogMemoryInterval(EnvironmentVariable, type=int):
    """Interval (in seconds) to profile memory utilization for logging."""

    varname = "MODIN_LOG_MEMORY_INTERVAL"
    default = 5

    @classmethod
    def put(cls, value: int) -> None:
        """
        Set ``LogMemoryInterval``, rejecting non-positive values.

        Parameters
        ----------
        value : int
            Config value to set.

        Raises
        ------
        ValueError
            If `value` is not greater than zero.
        """
        if value <= 0:
            message = f"Log memory Interval should be > 0, passed value {value}"
            raise ValueError(message)
        super().put(value)

    @classmethod
    def get(cls) -> int:
        """
        Return ``LogMemoryInterval``, rejecting non-positive values.

        Returns
        -------
        int

        Raises
        ------
        ValueError
            If the resolved value is not greater than zero.
        """
        interval = super().get()
        if interval <= 0:
            message = f"`LogMemoryInterval` should be > 0; current value: {interval}"
            raise ValueError(message)
        return interval
class MetricsMode(EnvironmentVariable, type=ExactStr):
    """
    Set ``MetricsMode`` value to disable/enable metrics collection.

    Metric handlers are registered through `add_metric_handler` and can
    be used to record graphite-style timings or values. It is the
    responsibility of the handler to define how those emitted metrics
    are handled.
    """

    choices = ("enable", "disable")
    default = "enable"
    varname = "MODIN_METRICS_MODE"

    @classmethod
    def disable(cls) -> None:
        """Switch metric collection off by storing ``"disable"``."""
        cls.put("disable")

    @classmethod
    def enable(cls) -> None:
        """Switch metric collection on by storing ``"enable"``."""
        cls.put("enable")
class MinRowPartitionSize(EnvironmentVariable, type=int):
    """
    Minimum number of rows in a single pandas partition split.

    Once a partition for a pandas dataframe has more than this many elements,
    Modin adds another partition.
    """

    default = 32
    varname = "MODIN_MIN_ROW_PARTITION_SIZE"

    @classmethod
    def get(cls) -> int:
        """
        Read ``MinRowPartitionSize`` and make sure the stored value is positive.

        Returns
        -------
        int

        Raises
        ------
        ValueError
            If the stored value is not greater than zero.
        """
        row_size = super().get()
        if row_size <= 0:
            message = f"`MinRowPartitionSize` should be > 0; current value: {row_size}"
            raise ValueError(message)
        return row_size

    @classmethod
    def put(cls, value: int) -> None:
        """
        Store a new ``MinRowPartitionSize`` after checking that it is positive.

        Parameters
        ----------
        value : int
            Config value to set.

        Raises
        ------
        ValueError
            If `value` is not greater than zero.
        """
        if value <= 0:
            message = f"Min row partition size should be > 0, passed value {value}"
            raise ValueError(message)
        super().put(value)
class CIAWSSecretAccessKey(EnvironmentVariable, type=str):
    """Set to AWS_SECRET_ACCESS_KEY when running mock S3 tests for Modin in GitHub CI."""

    # The variable name carries no ``MODIN_`` prefix: it is the standard AWS
    # variable name rather than a Modin-specific one.
    # NOTE(review): ``type=str`` selects the case-insensitive descriptor of
    # ``modin.config.pubsub``, which applies ``.strip().title()`` to values.
    # Secrets are case-sensitive, so a value going through that descriptor would
    # be re-cased - confirm whether ``ExactStr`` was intended here.
    varname = "AWS_SECRET_ACCESS_KEY"
    default = "foobar_secret"
class DocModule(EnvironmentVariable, type=ExactStr):
    """
    The module that will be used for docstrings.

    The value set here must be a valid, importable module. It should have
    a `DataFrame`, `Series`, and/or several APIs directly (e.g. `read_csv`).
    """

    # ``ExactStr`` keeps the configured module name verbatim (no case normalization).
    varname = "MODIN_DOC_MODULE"
    default = "pandas"
class NativePandasDeepCopy(EnvironmentVariable, type=bool):
    """
    Whether to perform deep copies when transferring data with the native pandas backend.

    Copies occur when constructing a Modin frame from a native pandas object with
    `pd.DataFrame(pandas.DataFrame([]))`, or when creating a native pandas frame from
    a Modin one via `df.modin.to_pandas()`.

    Leaving this flag disabled produces significant performance improvements by
    reducing the number of copy operations performed. However, it may create
    unexpected results if the user mutates the Modin frame or native pandas frame
    in-place.

    >>> import pandas  # doctest: +SKIP
    >>> import modin.pandas as pd  # doctest: +SKIP
    >>> from modin.config import Backend  # doctest: +SKIP
    >>> Backend.put("Pandas")  # doctest: +SKIP
    >>> pandas.set_option("mode.copy_on_write", False)  # doctest: +SKIP
    >>> native_df = pandas.DataFrame([0])  # doctest: +SKIP
    >>> modin_df = pd.DataFrame(native_df)  # doctest: +SKIP
    >>> native_df.loc[0, 0] = -1  # doctest: +SKIP
    >>> modin_df  # doctest: +SKIP
       0
    0 -1
    """

    varname = "MODIN_NATIVE_DEEP_COPY"
    default = False

    @classmethod
    def enable(cls) -> None:
        """Enable deep copy on frames with the native pandas backend."""
        cls.put(True)

    @classmethod
    def disable(cls) -> None:
        """Disable deep copy on frames with the native pandas backend."""
        cls.put(False)
class BackendJoinConsiderAllBackends(EnvironmentVariable, type=bool):
    """
    Whether to consider all active backends when performing a pre-operation switch for join operations.

    Only used when AutoSwitchBackend is active.
    By default, only backends already present in the arguments of a join operation are considered when
    switching backends. Enabling this flag will allow join operations that are registered
    as pre-op switches to consider backends other than those directly present in the arguments.
    """

    varname = "MODIN_BACKEND_JOIN_CONSIDER_ALL_BACKENDS"
    default = True

    @classmethod
    def enable(cls) -> None:
        """Enable considering all active backends for pre-operation switches of join operations."""
        cls.put(True)

    @classmethod
    def disable(cls) -> None:
        """Disable considering all active backends for pre-operation switches of join operations."""
        cls.put(False)
""" valid_names = { obj.varname for obj in globals().values() if isinstance(obj, type) and issubclass(obj, EnvironmentVariable) and not obj.is_abstract } found_names = {name for name in os.environ if name.startswith("MODIN_")} unknown = found_names - valid_names deprecated: dict[str, DeprecationDescriptor] = { obj.varname: obj._deprecation_descriptor for obj in globals().values() if isinstance(obj, type) and issubclass(obj, EnvironmentVariable) and not obj.is_abstract and obj.varname is not None and obj._deprecation_descriptor is not None } found_deprecated = found_names & deprecated.keys() if unknown: warnings.warn( f"Found unknown environment variable{'s' if len(unknown) > 1 else ''}," + f" please check {'their' if len(unknown) > 1 else 'its'} spelling: " + ", ".join(sorted(unknown)) ) for depr_var in found_deprecated: warnings.warn( deprecated[depr_var].deprecation_message(use_envvar_names=True), FutureWarning, ) _check_vars() ================================================ FILE: modin/config/pubsub.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses ``Parameter`` class - base class for all configs.""" import contextlib import warnings from collections import defaultdict from enum import IntEnum from typing import ( TYPE_CHECKING, Any, Callable, DefaultDict, Iterator, NamedTuple, Optional, Tuple, cast, ) if TYPE_CHECKING: from modin.config.envvars import EnvironmentVariable class DeprecationDescriptor: """ Describe deprecated parameter. Parameters ---------- parameter : type[Parameter] Deprecated parameter. new_parameter : type[Parameter], optional If there's a replacement parameter for the deprecated one, specify it here. when_removed : str, optional If known, the exact release when the deprecated parameter is planned to be removed. """ _parameter: type["Parameter"] _new_parameter: Optional[type["Parameter"]] _when_removed: str def __init__( self, parameter: type["Parameter"], new_parameter: Optional[type["Parameter"]] = None, when_removed: Optional[str] = None, ): self._parameter = parameter self._new_parameter = new_parameter self._when_removed = "a future" if when_removed is None else when_removed def deprecation_message(self, use_envvar_names: bool = False) -> str: """ Generate a message to be used in a warning raised when using the deprecated parameter. Parameters ---------- use_envvar_names : bool, default: False Whether to use environment variable names in the warning. If ``True``, both ``self._parameter`` and ``self._new_parameter`` have to be a type of ``EnvironmentVariable``. Returns ------- str """ name = ( cast("EnvironmentVariable", self._parameter).varname if use_envvar_names else self._parameter.__name__ ) msg = f"'{name}' is deprecated and will be removed in {self._when_removed} version." if self._new_parameter is not None: new_name = ( cast("EnvironmentVariable", self._new_parameter).varname if use_envvar_names else self._new_parameter.__name__ ) msg += f" Use '{new_name}' instead." 
class TypeDescriptor(NamedTuple):
    """
    Class for config data manipulating of exact type.

    Parameters
    ----------
    decode : callable
        Callable to decode config value from the raw data.
    normalize : callable
        Callable to bring different config value variations to the single form.
    verify : callable
        Callable to check that config value satisfies given config type requirements.
    help : str
        Class description string.
    """

    decode: Callable[[str], object]
    normalize: Callable[[object], object]
    verify: Callable[[object], bool]
    help: str


class ExactStr(str):
    """Class to be used in type params where no transformations are needed."""


# Spellings accepted for boolean configs; compared after ``strip().lower()``.
_BOOL_TRUE_STRINGS = frozenset({"true", "yes", "1"})
_BOOL_FALSE_STRINGS = frozenset({"false", "no", "0"})


def _str_to_bool(value: str) -> bool:
    """Return True if `value` is one of the positive boolean spellings."""
    return value.strip().lower() in _BOOL_TRUE_STRINGS


def _normalize_bool(value: object) -> bool:
    """
    Bring a boolean config value to a real ``bool``.

    Strings are interpreted by their spelling rather than by truthiness:
    ``bool("false")`` is ``True``, which would silently flip a value that
    the ``verify`` callable explicitly accepts as a negative one.
    """
    if isinstance(value, str):
        return _str_to_bool(value)
    return bool(value)


def _parse_key_value_pairs(value: str) -> dict:
    """Parse ``'KEY1=VALUE1,KEY2=VALUE2'`` into a dict, turning digit-only values into ints."""
    return {
        key: int(val) if val.isdigit() else val
        for key_value in value.split(",")
        for key, val in [[v.strip() for v in key_value.split("=", maxsplit=1)]]
    }


# Maps a config type to the callables used to decode, normalize and verify its values.
_TYPE_PARAMS = {
    str: TypeDescriptor(
        decode=lambda value: value.strip().title(),
        normalize=lambda value: str(value).strip().title(),
        verify=lambda value: True,
        help="a case-insensitive string",
    ),
    ExactStr: TypeDescriptor(
        decode=lambda value: value,
        normalize=lambda value: value,
        verify=lambda value: True,
        help="a string",
    ),
    bool: TypeDescriptor(
        decode=_str_to_bool,
        normalize=_normalize_bool,
        verify=lambda value: isinstance(value, bool)
        or (
            isinstance(value, str)
            and value.strip().lower() in (_BOOL_TRUE_STRINGS | _BOOL_FALSE_STRINGS)
        ),
        help="a boolean flag (any of 'true', 'yes' or '1' in case insensitive manner is considered positive)",
    ),
    int: TypeDescriptor(
        decode=lambda value: int(value.strip()),
        normalize=int,  # type: ignore
        verify=lambda value: isinstance(value, int)
        or (isinstance(value, str) and value.strip().isdigit()),
        help="an integer value",
    ),
    dict: TypeDescriptor(
        decode=_parse_key_value_pairs,
        normalize=lambda value: (
            value if isinstance(value, dict) else _parse_key_value_pairs(str(value))
        ),
        # every comma-separated item must contain '=' that is not its last character
        verify=lambda value: isinstance(value, dict)
        or (
            isinstance(value, str)
            and all(
                key_value.find("=") not in (-1, len(key_value) - 1)
                for key_value in value.split(",")
            )
        ),
        help="a sequence of KEY=VALUE values separated by comma (Example: 'KEY1=VALUE1,KEY2=VALUE2,KEY3=VALUE3')",
    ),
}
Returns ------- Any Config raw value if it's set, otherwise `_UNSET`. Notes ----- Config storage can be config file or environment variable or whatever. Method should be implemented in the child class. """ raise NotImplementedError() @classmethod def get_help(cls) -> str: """ Generate user-presentable help for the option. Returns ------- str Notes ----- Method should be implemented in the child class. """ raise NotImplementedError() def __init_subclass__(cls, type: Any, abstract: bool = False, **kw: dict): """ Initialize subclass. Parameters ---------- type : Any Type of the config. abstract : bool, default: False Whether config is abstract. **kw : dict Optional arguments for config initialization. """ assert type in _TYPE_PARAMS, f"Unsupported variable type: {type}" cls.type = type cls.is_abstract = abstract cls._value = _UNSET cls._subs = [] cls._once = defaultdict(list) super().__init_subclass__(**kw) @classmethod def subscribe(cls, callback: Callable) -> None: """ Add `callback` to the `_subs` list and then execute it. Parameters ---------- callback : callable Callable to execute. """ cls._subs.append(callback) callback(cls) @classmethod def _get_default(cls) -> Any: """ Get default value of the config. Returns ------- Any """ return cls.default @classmethod def get_value_source(cls) -> ValueSource: """ Get value source of the config. Returns ------- ValueSource """ if cls._value_source is None: # dummy call to .get() to initialize the value cls.get() assert ( cls._value_source is not None ), "_value_source must be initialized by now in get()" return cls._value_source @classmethod def get(cls) -> Any: """ Get config value. Returns ------- Any Decoded and verified config value. 
""" cls._warn_if_deprecated() if cls._value is _UNSET: # get the value from env config_value = cls._get_value_from_config() if config_value is _UNSET: cls._value = cls._get_default() cls._value_source = ValueSource.DEFAULT else: cls._value = config_value cls._value_source = ValueSource.GOT_FROM_CFG_SOURCE return cls._value @classmethod def put(cls, value: Any) -> None: """ Set config value. Parameters ---------- value : Any Config value to set. """ cls._warn_if_deprecated() cls._check_callbacks(cls._put_nocallback(value)) cls._value_source = ValueSource.SET_BY_USER @classmethod def normalize(cls, value: Any) -> Any: """ Normalize config value. Parameters ---------- value : Any Config value to normalize. Returns ------- Any Normalized config value. """ return _TYPE_PARAMS[cls.type].normalize(value) @classmethod def once(cls, onvalue: Any, callback: Callable) -> None: """ Execute `callback` if config value matches `onvalue` value. Otherwise accumulate callbacks associated with the given `onvalue` in the `_once` container. Parameters ---------- onvalue : Any Config value to set. callback : callable Callable that should be executed if config value matches `onvalue`. """ onvalue = cls.normalize(onvalue) if onvalue == cls.get(): callback(cls) else: cls._once[onvalue].append(callback) @classmethod def _put_nocallback(cls, value: Any) -> Any: """ Set config value without executing callbacks. Parameters ---------- value : Any Config value to set. Returns ------- Any Replaced (old) config value. """ if not _TYPE_PARAMS[cls.type].verify(value): raise ValueError(f"Unsupported value: {value}") value = cls.normalize(value) oldvalue, cls._value = cls.get(), value return oldvalue @classmethod def _check_callbacks(cls, oldvalue: Any) -> None: """ Execute all needed callbacks if config value was changed. Parameters ---------- oldvalue : Any Previous (old) config value. 
@contextlib.contextmanager
def context(**config: dict[str, Any]) -> Iterator[None]:
    """
    Set a value(s) for the specified config(s) from ``modin.config`` in the scope of the context.

    All configs touched by the context are restored to their previous values
    on exit, including the case when applying one of the requested values
    fails half-way through (e.g. an unknown config name or an invalid value).

    Parameters
    ----------
    **config : dict[str, Any]
        Keyword describing a name of a config variable from ``modin.config`` as a key
        and a new value as a value.

    Examples
    --------
    >>> RangePartitioning.get()
    False
    >>> with context(RangePartitioning=True):
    ...     print(RangePartitioning.get()) # True
    True
    >>> RangePartitioning.get()
    False
    >>> with context(RangePartitioning=True, AsyncReadMode=True):
    ...     print(RangePartitioning.get()) # True
    ...     print(AsyncReadMode.get()) # True
    True
    True
    >>> RangePartitioning.get()
    False
    >>> AsyncReadMode.get()
    False
    """
    import modin.config as cfg

    old_values = {}
    try:
        # The setup lives inside ``try`` so that a failure on the N-th config
        # does not leave the first N-1 configs permanently modified.
        for name, val in config.items():
            var = getattr(cfg, name)
            # remember the previous value before ``put``: a failing ``put`` may
            # have already changed the stored value by the time it raises
            old_values[var] = var.get()
            var.put(val)
        yield
    finally:
        # Undo in reverse order of application, so that configs that had to be
        # set in a particular order are rolled back consistently.
        for var, val in reversed(list(old_values.items())):
            var.put(val)
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # We turn off mypy type checks in this file because it's not imported anywhere # type: ignore import copy import logging import os import platform import shutil import subprocess import sys import time from collections import defaultdict from contextlib import contextmanager from typing import Iterable, Optional import boto3 import numpy as np import pandas import pytest import requests import s3fs from pandas.util._decorators import doc from modin.config import Backend, Execution assert ( "modin.utils" not in sys.modules ), "Do not import modin.utils before patching, or tests could fail" # every import under this assert has to be postfixed with 'noqa: E402' # as flake8 complains about that... 
@pytest.fixture(scope="session", autouse=True)
def enforce_config():
    """
    A fixture that ensures that all checks for MODIN_* variables
    are done using modin.config to prevent leakage.

    For the duration of the session ``os.environ`` is replaced by a proxy that
    forwards everything to the real environment, but asserts that ``MODIN_*``
    variables are only accessed by name from ``modin.config`` or
    ``modin.tests.config`` when the caller lives inside the ``modin`` package.
    """
    orig_env = os.environ
    modin_start = os.path.dirname(modin.__file__)
    modin_exclude = [
        os.path.dirname(modin.config.__file__),
        os.path.dirname(modin.tests.config.__file__),
    ]
    # marks "no default was supplied" for ``PatchedEnv.pop``
    no_default = object()

    class PatchedEnv:
        @staticmethod
        def __check_var(name):
            if name.upper().startswith("MODIN_"):
                frame = sys._getframe()
                try:
                    # get the path to module where caller of caller is defined;
                    # caller of this function is inside PatchedEnv, and we're
                    # interested in whomever called a method on PatchedEnv
                    caller_file = frame.f_back.f_back.f_code.co_filename
                finally:
                    del frame
                pkg_name = os.path.dirname(caller_file)
                if pkg_name.startswith(modin_start):
                    assert any(
                        pkg_name.startswith(excl) for excl in modin_exclude
                    ), "Do not access MODIN_ environment variable bypassing modin.config"

        def __getitem__(self, name):
            self.__check_var(name)
            return orig_env[name]

        def __setitem__(self, name, value):
            self.__check_var(name)
            orig_env[name] = value

        def __delitem__(self, name):
            self.__check_var(name)
            del orig_env[name]

        def pop(self, name, default=no_default):
            self.__check_var(name)
            if default is no_default:
                # keep the mapping contract: a missing key without a default
                # raises KeyError instead of returning the internal sentinel
                return orig_env.pop(name)
            return orig_env.pop(name, default)

        def get(self, name, default=None):
            self.__check_var(name)
            return orig_env.get(name, default)

        def __contains__(self, name):
            self.__check_var(name)
            return name in orig_env

        def __getattr__(self, name):
            # everything not overridden above is forwarded to the real environment
            return getattr(orig_env, name)

        def __iter__(self):
            return iter(orig_env)

    os.environ = PatchedEnv()
    try:
        yield
    finally:
        # always put the real environment back, even if the generator is closed early
        os.environ = orig_env
def set_base_execution(name=BASE_EXECUTION_NAME):
    """
    Install the test-only base execution and make it the current one.

    Exposes ``BaseOnPythonFactory`` on the ``factories`` module under
    ``f"{name}Factory"``, registers the "BaseOnPython" backend (Python engine,
    "Base" storage format) and switches Modin to the Python engine with the
    storage format taken from the part of `name` that precedes "On".

    Parameters
    ----------
    name : str, default: BASE_EXECUTION_NAME
        Name of the execution, in the ``<StorageFormat>On<Engine>`` form.
    """
    factory_attr = f"{name}Factory"
    setattr(factories, factory_attr, BaseOnPythonFactory)

    base_execution = Execution(engine="Python", storage_format="Base")
    Backend.register_backend("BaseOnPython", base_execution)

    storage_format = name.split("On")[0]
    modin.set_execution(engine="python", storage_format=storage_format)
def pytest_runtest_call(item):
    """
    Translate ``xfail_executions``/``skip_executions`` markers into regular pytest markers.

    For every ``<kind>_executions`` marker found on `item`, a ``<kind>`` marker is
    added whose condition is whether the current Modin execution is listed in
    the first positional argument of the custom marker.

    Parameters
    ----------
    item : pytest.Item
        Test item that is about to be called.
    """
    custom_markers = ("xfail", "skip")

    # dynamically adding custom markers to tests
    for custom_marker in custom_markers:
        # materialize the markers first: ``add_marker`` below extends the item's markers
        for marker in list(item.iter_markers(name=f"{custom_marker}_executions")):
            executions = marker.args[0]
            if not isinstance(executions, list):
                executions = [executions]

            current_execution = modin.utils.get_current_execution()
            # Work on a copy: mutating ``marker.kwargs`` in place would drop
            # ``reason`` from the marker itself, so any later item that sees the
            # same marker object would lose the custom reason.
            extra_kwargs = dict(marker.kwargs)
            reason = extra_kwargs.pop("reason", "")
            # NOTE(review): pytest documents ``condition`` for ``xfail`` (and ``skipif``);
            # confirm that the plain ``skip`` marker built here behaves as intended.
            item.add_marker(
                getattr(pytest.mark, custom_marker)(
                    condition=current_execution in executions,
                    reason=f"Execution {current_execution} does not pass this test. {reason}",
                    **extra_kwargs,
                )
            )
@pytest.fixture
def make_parquet_file():
    """Pytest fixture factory that makes a parquet file/dir for testing.

    Every path written through the yielded function is removed again when the
    fixture is torn down.

    Yields:
        Function that generates a parquet file/dir
    """
    filenames = []

    def _make_parquet_file(
        filename,
        nrows=NROWS,
        ncols=2,
        force=True,
        range_index_start=0,
        range_index_step=1,
        range_index_name=None,
        partitioned_columns=None,
        row_group_size: Optional[int] = None,
    ):
        """Helper function to generate parquet files/directories.

        Args:
            filename: The name of test file, that should be created.
            nrows: Number of rows for the dataframe.
            ncols: Number of cols for the dataframe.
            force: Create a new file/directory even if one already exists.
            range_index_start: Start of the RangeIndex assigned to the dataframe.
            range_index_step: Step of the RangeIndex assigned to the dataframe.
            range_index_name: Name of the RangeIndex assigned to the dataframe.
            partitioned_columns: Create a partitioned directory using pandas.
                ``None`` (the default) or an empty list writes a single file.
            row_group_size: Maximum size of each row group.
        """
        # ``None`` replaces the former mutable ``[]`` default.
        if partitioned_columns is None:
            partitioned_columns = []
        if force or not os.path.exists(filename):
            df = pandas.DataFrame(
                {f"col{x + 1}": np.arange(nrows) for x in range(ncols)}
            )
            index = pandas.RangeIndex(
                start=range_index_start,
                stop=range_index_start + (nrows * range_index_step),
                step=range_index_step,
                name=range_index_name,
            )
            if (
                range_index_start == 0
                and range_index_step == 1
                and range_index_name is None
            ):
                # The requested index is the default one; just sanity-check it.
                assert df.index.equals(index)
            else:
                df.index = index
            if len(partitioned_columns) > 0:
                df.to_parquet(
                    filename,
                    partition_cols=partitioned_columns,
                    row_group_size=row_group_size,
                )
            else:
                df.to_parquet(filename, row_group_size=row_group_size)
            # Remember what was written so teardown can clean it up.
            filenames.append(filename)

    # Return function that generates parquet files
    yield _make_parquet_file

    # Delete parquet file that was created
    for path in filenames:
        if os.path.exists(path):
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
@pytest.fixture
def s3_storage_options(worker_id):
    """
    Return storage options that point S3 clients at the mock S3 endpoint.

    Parameters
    ----------
    worker_id : str
        Value of the ``worker_id`` pytest fixture: ``"master"`` or ``"gw<N>"``.

    Returns
    -------
    dict
        ``{"client_kwargs": {"endpoint_url": <url>}}``.
    """
    # copied from pandas conftest.py:
    # https://github.com/pandas-dev/pandas/blob/32f789fbc5d5a72d9d1ac14935635289eeac9009/pandas/tests/io/conftest.py#L45
    if GithubCI.get():
        url = "http://localhost:5000/"
    else:
        # If we hit this else-case, this test is being run locally. In that case, we want
        # each worker to point to a different port for its mock S3 service. The port is
        # derived from `worker_id` with exactly the same formula the `s3_base` fixture
        # uses to launch moto, so the endpoint returned here is the one being served.
        endpoint_port = (
            5500 if worker_id == "master" else (5550 + int(worker_id.lstrip("gw")))
        )
        url = f"http://127.0.0.1:{endpoint_port}/"
    return {"client_kwargs": {"endpoint_url": url}}
""" # copied from pandas conftest.py # still need access keys for https://github.com/getmoto/moto/issues/1924 monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key") monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") monkeysession.setenv("AWS_REGION", "us-west-2") if GithubCI.get(): if sys.platform in ("darwin", "win32", "cygwin") or ( platform.machine() in ("arm64", "aarch64") or platform.machine().startswith("armv") ): # pandas comments say: # DO NOT RUN on Windows/macOS/ARM, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs pytest.skip( "S3 tests do not have a corresponding service in Windows, macOS " + "or ARM platforms" ) else: # assume CI has started moto in docker container: # https://docs.getmoto.org/en/latest/docs/server_mode.html#run-using-docker # It would be nice to start moto on another thread as in the # instructions here: # https://docs.getmoto.org/en/latest/docs/server_mode.html#start-within-python # but that gives 403 forbidden error when we try to create the bucket yield "http://localhost:5000" else: # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost # If we hit this else-case, this test is being run locally. In that case, we want # each worker to point to a different port for its mock S3 service. The easiest way # to do that is to use the `worker_id`, which is unique, to determine what port to point # to. endpoint_port = ( 5500 if worker_id == "master" else (5550 + int(worker_id.lstrip("gw"))) ) endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" # pipe to null to avoid logging in terminal # TODO any way to throw the error from here? e.g. i had an annoying problem # where I didn't have flask-cors and moto just failed .if there's an error # in the popen command and we throw an error within the body of the context # manager, the test just hangs forever. 
with subprocess.Popen( ["moto_server", "s3", "-p", str(endpoint_port)], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, ) as proc: for _ in range(50): try: # OK to go once server is accepting connections if requests.get(endpoint_uri).ok: break except Exception: # try again while we still have retries time.sleep(0.1) else: proc.terminate() _, errs = proc.communicate() raise RuntimeError( "Could not connect to moto server after 50 tries. " + f"See stderr for extra info: {errs}" ) yield endpoint_uri proc.terminate() @pytest.fixture def s3_resource(s3_base): """ Set up S3 bucket with contents. The primary bucket name is "modin-test". When running locally, this function should be safe even if there are multiple pytest workers running in parallel because each worker gets its own endpoint. When running in CI, we use a single endpoint for all workers, so we can't have multiple pytest workers running in parallel. """ bucket = "modin-test" conn = boto3.resource("s3", endpoint_url=s3_base) cli = boto3.client("s3", endpoint_url=s3_base) # https://github.com/getmoto/moto/issues/3292 # without location, I get # botocore.exceptions.ClientError: An error occurred # (IllegalLocationConstraintException) when calling the CreateBucket operation: # The unspecified location constraint is incompatible for the region specific # endpoint this request was sent to. # even if I delete os.environ['AWS_REGION'] but somehow pandas can get away with # this. try: cli.create_bucket( Bucket=bucket, CreateBucketConfiguration={"LocationConstraint": "us-west-2"} ) except Exception as e: # OK if bucket already exists, but want to raise other exceptions. # The exception raised by `create_bucket` is made using a factory, # so we need to check using this method of reading the response rather # than just checking the type of the exception. 
response = getattr(e, "response", {}) error_code = response.get("Error", {}).get("Code", "") if error_code not in ("BucketAlreadyOwnedByYou", "BucketAlreadyExists"): raise for _ in range(20): # We want to wait until bucket creation is finished. if cli.list_buckets()["Buckets"]: break time.sleep(0.1) if not cli.list_buckets()["Buckets"]: raise RuntimeError("Could not create bucket") s3fs.S3FileSystem.clear_instance_cache() s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base}) test_s3_files = [ ("modin-bugs/multiple_csv/", "modin/tests/pandas/data/multiple_csv/"), ( "modin-bugs/test_data_dir.parquet/", "modin/tests/pandas/data/test_data_dir.parquet/", ), ("modin-bugs/test_data.parquet", "modin/tests/pandas/data/test_data.parquet"), ("modin-bugs/test_data.json", "modin/tests/pandas/data/test_data.json"), ("modin-bugs/test_data.fwf", "modin/tests/pandas/data/test_data.fwf"), ("modin-bugs/test_data.feather", "modin/tests/pandas/data/test_data.feather"), ("modin-bugs/issue5159.parquet/", "modin/tests/pandas/data/issue5159.parquet/"), ] for s3_key, file_name in test_s3_files: s3.put(file_name, f"{bucket}/{s3_key}", recursive=s3_key.endswith("/")) yield conn s3.rm(bucket, recursive=True) for _ in range(20): # We want to wait until the deletion finishes. 
@contextmanager
def copy_and_restore(
    dicts: Iterable[defaultdict],
) -> None:
    """
    Make deep copies of defaultdicts and restore them upon exiting this context.

    Ideally this function would be a fixture, but we want to pass it parameters
    and use it in other fixtures, and it does not seem to be possible to pass
    parameters from one fixture to another.

    Parameters
    ----------
    dicts : Iterable[defaultdict]
        The dicts to copy and restore.

    Notes
    -----
    The dicts are restored in place (cleared and refilled from their copies),
    also when the body of the ``with`` statement raises.
    """
    # Use a tuple of tuples instead of a dict mapping each original dict
    # to its copy, because the original dict is not hashable.
    # The snapshots are taken *before* entering ``try``: if copying fails there
    # is nothing to restore yet, and the original error must propagate instead
    # of being masked by an unbound-variable error in ``finally``.
    original_dict_to_copy = tuple(
        (original_dict, copy.deepcopy(original_dict)) for original_dict in dicts
    )
    try:
        yield
    finally:
        for original_dict, dict_copy in original_dict_to_copy:
            original_dict.clear()
            original_dict.update(dict_copy)
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's core functionality.""" ================================================ FILE: modin/core/computation/__init__.py ================================================ ================================================ FILE: modin/core/computation/align.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Core eval alignment algorithms. 
Forked from pandas.core.computation.align """ from __future__ import annotations import warnings from collections.abc import Sequence from functools import ( partial, wraps, ) from typing import ( Callable, ) import numpy as np import pandas import pandas.core.common as com from pandas._typing import F from pandas.core.base import PandasObject from pandas.errors import PerformanceWarning from modin.core.computation.common import result_type_many from modin.pandas import DataFrame, Series from modin.pandas.base import BasePandasDataset def _align_core_single_unary_op( term, ) -> tuple[partial | type[BasePandasDataset], dict[str, pandas.Index] | None]: typ: partial | type[BasePandasDataset] axes: dict[str, pandas.Index] | None = None if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) if hasattr(term.value, "axes"): axes = _zip_axes_from_type(typ, term.value.axes) return typ, axes def _zip_axes_from_type( typ: type[BasePandasDataset], new_axes: Sequence[pandas.Index] ) -> dict[str, pandas.Index]: return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} def _any_pandas_objects(terms) -> bool: """ Check a sequence of terms for instances of PandasObject. 
""" return any(isinstance(term.value, PandasObject) for term in terms) def _filter_special_cases(f) -> Callable[[F], F]: @wraps(f) def wrapper(terms): # single unary operand if len(terms) == 1: return _align_core_single_unary_op(terms[0]) term_values = (term.value for term in terms) # we don't have any pandas objects if not _any_pandas_objects(terms): return result_type_many(*term_values), None return f(terms) return wrapper @_filter_special_cases def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] ndims = pandas.Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value typ = biggest._constructor axes = biggest.axes naxes = len(axes) gt_than_one_axis = naxes > 1 for value in (terms[i].value for i in term_index): is_series = isinstance(value, Series) is_series_and_gt_one_axis = is_series and gt_than_one_axis for axis, items in enumerate(value.axes): if is_series_and_gt_one_axis: ax, itm = naxes - 1, value.index else: ax, itm = axis, items if not axes[ax].is_(itm): axes[ax] = axes[ax].union(itm) for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): ti = terms[i].value if hasattr(ti, "reindex"): transpose = isinstance(ti, Series) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) reindexer_size = len(reindexer) ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: w = ( f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer." 
def align_terms(terms):
    """
    Align a set of terms.

    Parameters
    ----------
    terms : object
        Either a (possibly nested) iterable of terms or a single term.

    Returns
    -------
    tuple
        ``(type, axes)`` where ``axes`` is ``None`` when no axes apply.
    """
    try:
        # the parse tree is a nested list; work on its flattened form
        flat_terms = list(com.flatten(terms))
    except TypeError:
        # not iterable, so `terms` is a lone constant or variable
        value = terms.value
        if not isinstance(value, (Series, DataFrame)):
            return np.result_type(terms.type), None
        value_type = type(value)
        return value_type, _zip_axes_from_type(value_type, value.axes)

    # every resolved variable is a scalar: only the common type is needed
    if all(term.is_scalar for term in flat_terms):
        scalar_values = [term.value for term in flat_terms]
        return result_type_many(*scalar_values).type, None

    # general case: run the full alignment
    result_type, aligned_axes = _align_core(flat_terms)
    return result_type, aligned_axes
np.array(0) and np.array([0]) if ( len(obj.shape) == 1 and len(obj) == 1 and not isinstance(ret_value, np.ndarray) ): ret_value = np.array([ret_value]).astype(res_t) return ret_value ================================================ FILE: modin/core/computation/check.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Forked from pandas.core.computation.check """ from __future__ import annotations from pandas.compat._optional import import_optional_dependency ne = import_optional_dependency("numexpr", errors="warn") NUMEXPR_INSTALLED = ne is not None __all__ = ["NUMEXPR_INSTALLED"] ================================================ FILE: modin/core/computation/common.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
def result_type_many(*arrays_and_dtypes):
    """
    Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
    argument limit.

    Parameters
    ----------
    *arrays_and_dtypes : array-like or dtype
        Operands whose common result type is wanted. Extension dtypes are
        supported in addition to what ``numpy.result_type`` accepts.

    Returns
    -------
    dtype
        The NumPy result type, or the result of ``find_common_type`` when
        extension dtypes are involved.
    """
    try:
        return np.result_type(*arrays_and_dtypes)
    except ValueError:
        # we have > NPY_MAXARGS terms in our expression
        return reduce(np.result_type, arrays_and_dtypes)
    except TypeError:
        # NumPy rejected at least one operand; split the extension dtypes
        # from the operands NumPy can handle on its own.
        ea_dtypes, non_ea_dtypes = [], []
        for arr_or_dtype in arrays_and_dtypes:
            if is_extension_array_dtype(arr_or_dtype):
                ea_dtypes.append(arr_or_dtype)
            else:
                non_ea_dtypes.append(arr_or_dtype)

        if not non_ea_dtypes:
            return find_common_type(ea_dtypes)

        try:
            np_dtype = np.result_type(*non_ea_dtypes)
        except ValueError:
            # Too many operands for a single call: fold pairwise, but only over
            # the NumPy-compatible operands. Folding over all operands would hit
            # the extension dtypes again and re-raise the TypeError.
            np_dtype = reduce(np.result_type, non_ea_dtypes)
        return find_common_type(ea_dtypes + [np_dtype])
class AbstractEngine(metaclass=abc.ABCMeta):
    """Common base for every expression-evaluation engine."""

    has_neg_frac = False

    def __init__(self, expr) -> None:
        # Alignment results are filled in lazily by ``evaluate``.
        self.expr = expr
        self.aligned_axes = None
        self.result_type = None

    def convert(self) -> str:
        """
        Render the expression in the form the engine evaluates.

        The base implementation simply pretty-prints the expression.
        """
        rendered = printing.pprint_thing(self.expr)
        return rendered

    def evaluate(self) -> object:
        """
        Evaluate the expression held by this engine.

        Alignment is engine-independent, so it is carried out here in the
        base class before delegating to ``_evaluate``.

        Returns
        -------
        object
            The result of the passed expression.
        """
        if not self._is_aligned:
            aligned = align_terms(self.expr.terms)
            self.result_type, self.aligned_axes = aligned

        raw_result = self._evaluate()
        return_type = self.expr.terms.return_type
        return reconstruct_object(
            self.result_type, raw_result, self.aligned_axes, return_type
        )

    @property
    def _is_aligned(self) -> bool:
        # Aligned only once both the axes and the result type are known.
        return not (self.aligned_axes is None or self.result_type is None)

    @abc.abstractmethod
    def _evaluate(self):
        """
        Return an evaluated expression.

        Notes
        -----
        Must be implemented by subclasses.
        """
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Top level ``eval`` module. Forked from pandas.core.computation.eval """ from __future__ import annotations import tokenize import warnings from pandas.core.dtypes.common import is_extension_array_dtype from pandas.io.formats.printing import pprint_thing from pandas.util._validators import validate_bool_kwarg from modin.core.computation.check import NUMEXPR_INSTALLED from modin.core.computation.engines import ENGINES from modin.core.computation.expr import ( PARSERS, Expr, ) from modin.core.computation.ops import BinOp from modin.core.computation.parsing import tokenize_string from modin.core.computation.scope import ensure_scope from modin.pandas.base import BasePandasDataset def _check_engine(engine: str | None) -> str: """ Make sure a valid engine is passed. Parameters ---------- engine : str String to validate. Raises ------ KeyError * If an invalid engine is passed. ImportError * If numexpr was requested but doesn't exist. Returns ------- str Engine name. """ if engine is None: engine = "numexpr" if NUMEXPR_INSTALLED else "python" if engine not in ENGINES: valid_engines = list(ENGINES.keys()) raise KeyError( f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation if engine == "numexpr" and not NUMEXPR_INSTALLED: raise ImportError( "'numexpr' is not installed or an unsupported version. 
Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" ) return engine def _check_parser(parser: str): """ Make sure a valid parser is passed. Parameters ---------- parser : str Raises ------ KeyError * If an invalid parser is passed """ if parser not in PARSERS: raise KeyError( f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" ) def _check_resolvers(resolvers): if resolvers is not None: for resolver in resolvers: if not hasattr(resolver, "__getitem__"): name = type(resolver).__name__ raise TypeError( f"Resolver of type '{name}' does not " + "implement the __getitem__ method" ) def _check_expression(expr): """ Make sure an expression is not an empty string Parameters ---------- expr : object An object that can be converted to a string Raises ------ ValueError * If expr is an empty string """ if not expr: raise ValueError("expr cannot be an empty string") def _convert_expression(expr) -> str: """ Convert an object to an expression. This function converts an object to an expression (a unicode string) and checks to make sure it isn't empty after conversion. This is used to convert operators to their string representation for recursive calls to :func:`~pandas.eval`. Parameters ---------- expr : object The object to be converted to a string. Returns ------- str The string representation of an object. Raises ------ ValueError * If the expression is empty. """ s = pprint_thing(expr) _check_expression(s) return s def _check_for_locals(expr: str, stack_level: int, parser: str): at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" if not_pandas_parser: msg = "The '@' prefix is only supported by the pandas parser" elif at_top_of_stack: msg = ( "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." 
def eval(
    expr: str | BinOp,  # we leave BinOp out of the docstr bc it isn't for users
    parser: str = "pandas",
    engine: str | None = None,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    inplace: bool = False,
):
    """
    Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators. :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str
        The expression to evaluate. This string cannot contain any Python
        `statements
        <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
        Each non-blank line of the string is evaluated as its own expression.
    parser : {'pandas', 'python'}, default 'pandas'
        The parser to use to construct the syntax tree from the expression. The
        default of ``'pandas'`` parses code slightly different than standard
        Python. Alternatively, you can parse an expression using the
        ``'python'`` parser to retain strict Python semantics. See the
        :ref:`enhancing performance <enhancingperf.eval>` documentation for
        more details.
    engine : {'python', 'numexpr'} or None, default None
        The engine used to evaluate the expression. Supported engines are

        - None : tries to use ``numexpr``, falls back to ``python``
        - ``'numexpr'`` : This engine evaluates pandas objects using
          numexpr for large speed ups in complex expressions with large frames.
        - ``'python'`` : Performs operations as if you had ``eval``'d in top
          level python. This engine is generally not that useful.

        More backends may be available in the future.
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~DataFrame.query` method to inject the
        ``DataFrame.index`` and ``DataFrame.columns``
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : object, optional, default None
        This is the target object for assignment. It is used when there is
        variable assignment in the expression. If so, then `target` must
        support item assignment with string keys, and if a copy is being
        returned, it must also support `.copy()`.
    inplace : bool, default False
        If `target` is provided, and the expression mutates `target`, whether
        to modify `target` inplace. Otherwise, return a copy of `target` with
        the mutation.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series, or None
        The completion value of evaluating the given code or None if ``inplace=True``.

    Raises
    ------
    ValueError
        There are many instances where such an error can be raised:

        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not all them have item assignment.
          An example of such an arrangement is this:

          a = b + 1
          a + 2

          Here, there are expressions on different lines, making it multiline,
          but the last line has no variable assigned to the output of `a + 2`.
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the `target`
          does not support the `.copy()` method

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
    DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    Examples
    --------
    >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
    >>> df
      animal  age
    0    dog   10
    1    pig   20

    We can add a new column using ``pd.eval``:

    >>> pd.eval("double_age = df.age * 2", target=df)
      animal  age  double_age
    0    dog   10          20
    1    pig   20          40
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    exprs: list[str | BinOp]
    if isinstance(expr, str):
        _check_expression(expr)
        # one expression per non-blank line, surrounding whitespace removed
        exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
    else:
        # ops.BinOp; for internal compat, not intended to be passed by users
        exprs = [expr]
    multi_line = len(exprs) > 1

    if multi_line and target is None:
        raise ValueError(
            "multi-line expressions are only valid in the "
            + "context of data, use DataFrame.eval"
        )
    engine = _check_engine(engine)
    _check_parser(parser)
    _check_resolvers(resolvers)

    ret = None
    first_expr = True
    target_modified = False

    for expr in exprs:
        expr = _convert_expression(expr)
        _check_for_locals(expr, level, parser)

        # get our (possibly passed-in) scope
        # `resolvers` and `target` may have been rebound by a previous
        # iteration, so the scope is rebuilt for every expression
        env = ensure_scope(
            level + 1,
            global_dict=global_dict,
            local_dict=local_dict,
            resolvers=resolvers,
            target=target,
        )

        parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)

        # `and` binds tighter than `or`: the second alternative is
        # "operand_types exists AND any operand type is an extension dtype"
        if engine == "numexpr" and (
            is_extension_array_dtype(parsed_expr.terms.return_type)
            or getattr(parsed_expr.terms, "operand_types", None) is not None
            and any(
                is_extension_array_dtype(elem)
                for elem in parsed_expr.terms.operand_types
            )
        ):
            warnings.warn(
                "Engine has switched to 'python' because numexpr does not support "
                + "extension array dtypes. Please set your engine to python manually.",
                RuntimeWarning,
            )
            # the switch is sticky: later expressions in a multi-line
            # input are evaluated with the python engine as well
            engine = "python"

        # construct the engine and evaluate the parsed expression
        eng = ENGINES[engine]
        eng_inst = eng(parsed_expr)
        ret = eng_inst.evaluate()

        if parsed_expr.assigner is None:
            if multi_line:
                raise ValueError(
                    "Multi-line expressions are only valid "
                    + "if all expressions contain an assignment"
                )
            if inplace:
                raise ValueError("Cannot operate inplace if there is no assignment")

        # assign if needed
        assigner = parsed_expr.assigner
        if env.target is not None and assigner is not None:
            target_modified = True

            # if returning a copy, copy only on the first assignment
            if not inplace and first_expr:
                try:
                    target = env.target
                    if isinstance(target, BasePandasDataset):
                        target = target.copy(deep=True)
                    else:
                        target = target.copy()
                except AttributeError as err:
                    raise ValueError("Cannot return a copy of the target") from err
            else:
                target = env.target

            # TypeError is most commonly raised (e.g. int, list), but you
            # get IndexError if you try to do this assignment on np.ndarray.
            # NOTE(review): the inherited remark about ignoring numpy warnings
            # does not match this block - no warning filter is installed here.
            try:
                if inplace and isinstance(target, BasePandasDataset):
                    target.loc[:, assigner] = ret
                else:
                    target[assigner] = ret  # pyright: ignore[reportGeneralTypeIssues]
            except (TypeError, IndexError) as err:
                raise ValueError("Cannot assign expression output to target") from err

            # make the freshly assigned name visible to the following lines
            if not resolvers:
                resolvers = ({assigner: ret},)
            else:
                # existing resolver needs updated to handle
                # case of mutating existing column in copy
                for resolver in resolvers:
                    if assigner in resolver:
                        resolver[assigner] = ret
                        break
                else:
                    # no resolver knew the name: append a one-entry mapping
                    resolvers += ({assigner: ret},)

            ret = None
            first_expr = False

    # We want to exclude `inplace=None` as being False.
    return (target if target_modified else ret) if inplace is False else None
def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Swap the bitwise operator tokens ``&`` / ``|`` for ``and`` / ``or``.

    The substitution changes bitwise precedence into boolean precedence.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tuple of int, str
        ``(tokenize.NAME, "and")`` or ``(tokenize.NAME, "or")`` for the two
        operator tokens, otherwise the input pair unchanged.
    """
    kind, text = tok
    if kind == tokenize.OP:
        keyword = {"&": "and", "|": "or"}.get(text)
        if keyword is not None:
            return tokenize.NAME, keyword
    return kind, text
def _is_type(t):
    """
    Build a predicate that tests the ``value`` attribute of its argument.

    Parameters
    ----------
    t : type or tuple of types
        Passed straight to ``isinstance``.

    Returns
    -------
    callable
        ``f(x)`` returning ``isinstance(x.value, t)``.
    """

    def value_has_type(obj):
        return isinstance(obj.value, t)

    return value_has_type
""" node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) return frozenset(node_names) _all_node_names = frozenset(x.__name__ for x in _all_nodes) _mod_nodes = _filter_nodes(ast.mod) _stmt_nodes = _filter_nodes(ast.stmt) _expr_context_nodes = _filter_nodes(ast.expr_context) _boolop_nodes = _filter_nodes(ast.boolop) _handler_nodes = _filter_nodes(ast.excepthandler) _arguments_nodes = _filter_nodes(ast.arguments) _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) _unsupported_expr_nodes = frozenset( [ "Yield", "GeneratorExp", "IfExp", "DictComp", "SetComp", "Repr", "Lambda", "Set", "AST", "Is", "IsNot", ] ) # these nodes are low priority or won't ever be supported (e.g., AST) _unsupported_nodes = ( _stmt_nodes | _mod_nodes | _handler_nodes | _arguments_nodes | _keyword_nodes | _alias_nodes | _expr_context_nodes | _unsupported_expr_nodes ) - _hacked_nodes # we're adding a different assignment in some cases to be equality comparison # and we don't want `stmt` and friends in their so get only the class whose # names are capitalized _base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes intersection = _unsupported_nodes & _base_supported_nodes _msg = f"cannot both support and not support {intersection}" assert not intersection, _msg def _node_not_implemented(node_name: str) -> Callable[..., None]: """ Return a function that raises a NotImplementedError with a passed node name. """ def f(self, *args, **kwargs): raise NotImplementedError(f"'{node_name}' nodes are not implemented") return f _T = TypeVar("_T") def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: """ Decorator to disallow certain nodes from parsing. Raises a NotImplementedError instead. 
def _op_maker(op_class, op_symbol):
    """
    Create a visitor method that pre-binds ``op_symbol`` to ``op_class``.

    Parameters
    ----------
    op_class : callable
        Class (or other callable) that will eventually build the op.
    op_symbol : str
        Operator symbol passed as the first positional argument.

    Returns
    -------
    callable
        A function with the signature ``(self, node, *args, **kwargs)``.
    """

    def visit_op(self, node, *args, **kwargs):
        """
        Return ``op_class`` partially applied to the operator symbol.

        ``self`` and ``node`` are accepted but not used; any extra
        positional and keyword arguments are bound after the symbol.

        Returns
        -------
        functools.partial
        """
        bound_args = (op_symbol, *args)
        return partial(op_class, *bound_args, **kwargs)

    return visit_op
def visit(self, node, **kwargs):
    """
    Dispatch ``node`` to the matching ``visit_<NodeType>`` method.

    A string is first run through ``self.preparser`` and parsed with
    ``ast.parse``; the resulting tree is then dispatched like any other node.

    Parameters
    ----------
    node : str or ast.AST
        Source text or an already-built AST node.
    **kwargs
        Forwarded unchanged to the selected visitor method.

    Returns
    -------
    object
        Whatever the selected visitor method returns.

    Raises
    ------
    SyntaxError
        If the pre-parsed source cannot be parsed. When any
        whitespace-separated piece of that source is a Python keyword, the
        message is replaced before the error is re-raised.
    """
    if isinstance(node, str):
        source = self.preparser(node)
        try:
            tree = ast.parse(source)
            node = ast.fix_missing_locations(tree)
        except SyntaxError as e:
            for word in source.split():
                if iskeyword(word):
                    e.msg = "Python keyword not valid identifier in numexpr query"
                    break
            raise e

    handler = getattr(self, "visit_" + type(node).__name__)
    return handler(node, **kwargs)
def _maybe_downcast_constants(self, left, right):
    """
    Downcast a scalar operand to ``float32`` when paired with a float32 operand.

    For each side in turn (left first, then right): if that side is a scalar
    exposing ``value`` and the other side is a non-scalar whose
    ``return_type`` equals ``float32``, the scalar is converted with
    ``np.float32``, stored through ``self.env.add_tmp`` and replaced by a new
    ``self.term_type`` term.

    Returns
    -------
    tuple
        The (possibly replaced) ``left`` and ``right`` operands.
    """
    float32 = np.dtype(np.float32)

    def downcast(scalar_side, other_side):
        eligible = (
            scalar_side.is_scalar
            and hasattr(scalar_side, "value")
            and not other_side.is_scalar
            and other_side.return_type == float32
        )
        if not eligible:
            return scalar_side
        tmp_name = self.env.add_tmp(np.float32(scalar_side.value))
        return self.term_type(tmp_name, self.env)

    # the right-hand check deliberately sees the already updated left operand
    left = downcast(left, right)
    right = downcast(right, left)
    return left, right
def _maybe_evaluate_binop(
    self,
    op,
    op_class,
    lhs,
    rhs,
    eval_in_python=("in", "not in"),
    maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
    """
    Build ``op(lhs, rhs)`` and decide whether to pre-evaluate it in Python.

    Parameters
    ----------
    op : callable
        Called as ``op(lhs, rhs)`` to build the operation node.
    op_class : object
        Accepted for signature compatibility; not used in this method.
    lhs, rhs : object
        Operands of the operation.
    eval_in_python : tuple of str
        Operator symbols that are always handed to ``self._maybe_eval``.
    maybe_eval_in_python : tuple of str
        Additional operator symbols handed to ``self._maybe_eval`` when an
        operand has an ``object`` return type.

    Returns
    -------
    object
        The operation node itself, or the result of ``self._maybe_eval``.

    Raises
    ------
    TypeError
        If the built node reports ``has_invalid_return_type``.
    """
    res = op(lhs, rhs)

    if res.has_invalid_return_type:
        raise TypeError(
            f"unsupported operand type(s) for {res.op}: "
            + f"'{lhs.type}' and '{rhs.type}'"
        )

    # NOTE(review): `and` binds tighter than `or`, so the parenthesised test
    # reads as "(comparison op and lhs is datetime) or rhs is datetime".
    # A datetime rhs therefore triggers this branch for any operator, while
    # a datetime lhs only does so for comparison operators - confirm whether
    # this asymmetry is intended before regrouping the condition.
    if self.engine != "pytables" and (
        res.op in CMP_OPS_SYMS
        and getattr(lhs, "is_datetime", False)
        or getattr(rhs, "is_datetime", False)
    ):
        # all date ops must be done in python bc numexpr doesn't work
        # well with NaT
        return self._maybe_eval(res, self.binary_ops)

    if res.op in eval_in_python:
        # "in"/"not in" ops are always evaluated in python
        return self._maybe_eval(res, eval_in_python)
    elif self.engine != "pytables":
        if (
            getattr(lhs, "return_type", None) == object
            or getattr(rhs, "return_type", None) == object
        ):
            # either operand has an object return type: allow the membership
            # ops plus everything in `maybe_eval_in_python` (by default all
            # six comparison operators, not only "==" and "!=") to be
            # evaluated in python
            return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
    # nothing required python-space evaluation; hand the node back unchanged
    return res
def visit_Slice(self, node, **kwargs) -> slice:
    """
    Convert a slice node such as the one in ``df.index[4:6]`` to a ``slice``.

    Each of ``node.lower``, ``node.upper`` and ``node.step`` that is not
    ``None`` is visited and replaced by the ``value`` of the visit result.

    Returns
    -------
    slice
    """

    def bound(part):
        return None if part is None else self.visit(part).value

    return slice(bound(node.lower), bound(node.upper), bound(node.step))
def visit_Attribute(self, node, **kwargs):
    """
    Resolve an attribute access into a temporary term.

    The owner expression is visited, the attribute is read from its
    ``value``, stored through ``self.env.add_tmp`` and wrapped in
    ``self.term_type``.

    Raises
    ------
    ValueError
        If the node's context is not ``ast.Load``.
    AttributeError
        If the lookup fails, unless the owner is a plain name identical to the
        attribute name, in which case the resolved owner itself is returned.
    """
    attr_name = node.attr
    owner_node = node.value
    context = node.ctx

    if not isinstance(context, ast.Load):
        raise ValueError(f"Invalid Attribute context {type(context).__name__}")

    # resolve the object the attribute is read from
    owner = self.visit(owner_node).value
    try:
        tmp_name = self.env.add_tmp(getattr(owner, attr_name))
        return self.term_type(tmp_name, self.env)
    except AttributeError:
        # something like datetime.datetime where scope is overridden
        same_name = isinstance(owner_node, ast.Name) and owner_node.id == attr_name
        if same_name:
            return owner
        raise
class Expr:
    """
    Parsed expression together with the scope it is evaluated in.

    Parameters
    ----------
    expr : str
        Source of the expression.
    engine : str, optional, default 'numexpr'
    parser : str, optional, default 'pandas'
        Key into ``PARSERS`` selecting the visitor class.
    env : Scope, optional, default None
        Scope to use; when falsy a new ``Scope(level=level + 1)`` is created.
    level : int, optional, default 0
        Only used when a new scope has to be created.
    """

    env: Scope
    engine: str
    parser: str

    def __init__(
        self,
        expr,
        engine: str = "numexpr",
        parser: str = "pandas",
        env: Scope | None = None,
        level: int = 0,
    ) -> None:
        if not env:
            env = Scope(level=level + 1)

        self.expr = expr
        self.env = env
        self.engine = engine
        self.parser = parser

        visitor_cls = PARSERS[parser]
        self._visitor = visitor_cls(self.env, self.engine, self.parser)
        # parsing happens eagerly, at construction time
        self.terms = self.parse()

    @property
    def assigner(self):
        """
        Assignment target recorded by the visitor, or None if it has none.
        """
        visitor = self._visitor
        return getattr(visitor, "assigner", None)

    def __call__(self):
        """
        Call the parsed terms with this expression's scope.
        """
        return self.terms(self.env)

    def __repr__(self) -> str:
        return printing.pprint_thing(self.terms)

    def __len__(self) -> int:
        """
        Length of the stored ``expr``.
        """
        return len(self.expr)

    def parse(self):
        """
        Run the visitor over the stored expression and return its result.
        """
        visitor = self._visitor
        return visitor.visit(self.expr)

    @property
    def names(self):
        """
        Frozenset of the ``name`` of every term in the expression.
        """
        terms = self.terms
        if is_term(terms):
            return frozenset((terms.name,))
        return frozenset(item.name for item in com.flatten(terms))
See the License for the specific language # governing permissions and limitations under the License. """ Operator classes for eval. Forked from pandas.core.computation.ops """ from __future__ import annotations import operator from datetime import datetime from functools import partial from typing import ( TYPE_CHECKING, Callable, Literal, ) import numpy as np import pandas import pandas.core.common as com from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_list_like, is_scalar, ) from pandas.io.formats.printing import ( pprint_thing, pprint_thing_encoded, ) from modin.core.computation.common import ( ensure_decoded, result_type_many, ) from modin.core.computation.scope import DEFAULT_GLOBALS if TYPE_CHECKING: from collections.abc import ( Iterable, Iterator, ) REDUCTIONS = ("sum", "prod", "min", "max") _unary_math_ops = ( "sin", "cos", "exp", "log", "expm1", "log1p", "sqrt", "sinh", "cosh", "tanh", "arcsin", "arccos", "arctan", "arccosh", "arcsinh", "arctanh", "abs", "log10", "floor", "ceil", ) _binary_math_ops = ("arctan2",) MATHOPS = _unary_math_ops + _binary_math_ops LOCAL_TAG = "__pd_eval_local_" class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls # error: Argument 2 for "super" not an instance of argument 1 supr_new = super(Term, klass).__new__ # type: ignore[misc] return supr_new(klass) is_local: bool def __init__(self, name, env, side=None, encoding=None) -> None: # name is a str for Term, but may be something else for subclasses self._name = name self.env = env self.side = side tname = str(name) self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self) -> str: return self.name.replace(LOCAL_TAG, "") def __repr__(self) -> str: return pprint_thing(self.name) def __call__(self, *args, **kwargs): return self.value def evaluate(self, *args, 
class Op:
    """
    Hold an operator of arbitrary arity.

    Parameters
    ----------
    op : str
        Operator symbol; ``"not"``, ``"and"`` and ``"or"`` are translated
        through ``_bool_op_map`` to their bitwise spelling.
    operands : iterable of Term or Op
        Operands of the operator.
    encoding : object, optional
        Stored as-is on the instance.
    """

    op: str

    def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None:
        self.op = _bool_op_map.get(op, op)
        self.operands = operands
        self.encoding = encoding

    def __iter__(self) -> Iterator:
        return iter(self.operands)

    def __repr__(self) -> str:
        """
        Print a generic n-ary operator and its operands using infix notation.
        """
        # recurse over the operands
        parened = (f"({pprint_thing(opr)})" for opr in self.operands)
        return pprint_thing(f" {self.op} ".join(parened))

    @property
    def return_type(self):
        """
        Result type of the operation.

        Comparison and boolean operators always report ``np.bool_``; every
        other operator combines the types of all flattened terms.
        """
        # clobber types to bool if the op is a boolean operator
        if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
            return np.bool_
        return result_type_many(*(term.type for term in com.flatten(self)))

    @property
    def has_invalid_return_type(self) -> bool:
        """
        Whether the result type is ``object`` while some operand type is not.

        Returns
        -------
        bool
            Always a real ``bool``, matching the annotation. Without the
            explicit conversion the ``and`` expression would yield the
            frozenset of non-object operand types.
        """
        types = self.operand_types
        obj_dtype_set = frozenset([np.dtype("object")])
        return bool(self.return_type == object and types - obj_dtype_set)

    @property
    def operand_types(self):
        """
        Frozenset of the ``type`` of every flattened term.
        """
        return frozenset(term.type for term in com.flatten(self))

    @property
    def is_scalar(self) -> bool:
        """
        True when every operand reports ``is_scalar``.
        """
        return all(operand.is_scalar for operand in self.operands)

    @property
    def is_datetime(self) -> bool:
        """
        Whether the return type is a ``datetime`` / ``np.datetime64`` subclass.
        """
        try:
            # dtype-like objects expose the scalar class as ``.type``
            t = self.return_type.type
        except AttributeError:
            t = self.return_type

        return issubclass(t, (datetime, np.datetime64))
def is_term(obj) -> bool:
    """Return whether `obj` is an instance of ``Term`` (or a subclass)."""
    matches = isinstance(obj, Term)
    return matches
Parameters ---------- env : Scope engine : str parser : str term_type : type eval_in_python : list Returns ------- term_type The "pre-evaluated" expression as an instance of ``term_type`` """ if engine == "python": res = self(env) else: # recurse over the left/right nodes left = self.lhs.evaluate( env, engine=engine, parser=parser, term_type=term_type, eval_in_python=eval_in_python, ) right = self.rhs.evaluate( env, engine=engine, parser=parser, term_type=term_type, eval_in_python=eval_in_python, ) # base cases if self.op in eval_in_python: res = self.func(left.value, right.value) else: from modin.core.computation.eval import eval res = eval(self, local_dict=env, engine=engine, parser=parser) name = env.add_tmp(res) return term_type(name, env=env) def convert_values(self) -> None: """ Convert datetimes to a comparable value in an expression. """ def stringify(value): encoder: Callable if self.encoding is not None: encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) lhs, rhs = self.lhs, self.rhs if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: v = rhs.value if isinstance(v, (int, float)): v = stringify(v) v = pandas.Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.rhs.update(v) if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: v = lhs.value if isinstance(v, (int, float)): v = stringify(v) v = pandas.Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): rhs = self.rhs lhs = self.lhs # GH#24883 unwrap dtype if necessary to ensure we have a type object rhs_rt = rhs.return_type rhs_rt = getattr(rhs_rt, "type", rhs_rt) lhs_rt = lhs.return_type lhs_rt = getattr(lhs_rt, "type", lhs_rt) if ( (lhs.is_scalar or rhs.is_scalar) and self.op in _bool_ops_dict and ( not ( issubclass(rhs_rt, (bool, np.bool_)) and issubclass(lhs_rt, (bool, np.bool_)) ) ) ): raise 
def isnumeric(dtype) -> bool:
    """
    Return whether `dtype` describes a numeric NumPy type.

    `dtype` may be anything ``np.dtype`` accepts; the check is whether the
    resulting scalar class derives from ``np.number``.
    """
    scalar_class = np.dtype(dtype).type
    return issubclass(scalar_class, np.number)
def create_valid_python_identifier(name: str) -> str:
    """
    Create valid Python identifiers from any string.

    A name that already is an identifier (and not a keyword) is returned
    unchanged. Otherwise each special character is replaced by a readable
    placeholder and the result is prefixed with ``BACKTICK_QUOTED_STRING_``.

    Raises
    ------
    SyntaxError
        If the converted name still is not a valid Python identifier. This
        happens for characters that have no replacement, such as a hashtag
        (the tokenizer would terminate on it and never find the closing
        backtick) or characters outside the range (U+0001..U+007F).
    """
    needs_conversion = iskeyword(name) or not name.isidentifier()
    if not needs_conversion:
        return name

    # Operator characters known to the tokenizer map to their token names;
    # the hand-written entries take precedence over them.
    replacements = {
        **{
            symbol: f"_{token.tok_name[code]}_"
            for symbol, code in tokenize.EXACT_TOKEN_TYPES.items()
        },
        " ": "_",
        "?": "_QUESTIONMARK_",
        "!": "_EXCLAMATIONMARK_",
        "$": "_DOLLARSIGN_",
        "€": "_EUROSIGN_",
        "°": "_DEGREESIGN_",
        # Including quotes works, but there are exceptions.
        "'": "_SINGLEQUOTE_",
        '"': "_DOUBLEQUOTE_",
        # Currently not possible. Terminates parser and won't find backtick.
        # "#": "_HASH_",
    }

    converted = "".join(replacements.get(symbol, symbol) for symbol in name)
    name = f"BACKTICK_QUOTED_STRING_{converted}"

    if name.isidentifier():
        return name
    raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Creates a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the backtick (`)

    source : str
        The Python source code string.

    string_start : int
        This is the start of backtick quoted string inside the source string.

    Returns
    -------
    tok: Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).

    Raises
    ------
    SyntaxError
        If the generator is exhausted before a closing backtick is found.
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            # the column of the closing backtick marks the end of the name
            return BACKTICK_QUOTED_STRING, source[string_start : start[1]]

    # An explicit raise instead of ``assert``: assertions are stripped under
    # ``python -O``, which would silently return the rest of `source`.
    raise SyntaxError(f"Unterminated backtick quoted string in '{source}'.")
Parameters ---------- source : str The Python source code string. Returns ------- tok_generator : Iterator[Tuple[int, str]] An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). """ line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) # Loop over all tokens till a backtick (`) is found. # Then, take all tokens till the next backtick to form a backtick quoted string for toknum, tokval, start, _, _ in token_generator: if tokval == "`": try: yield tokenize_backtick_quoted_string( token_generator, source, string_start=start[1] + 1 ) except Exception as err: raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err else: yield toknum, tokval ================================================ FILE: modin/core/computation/scope.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module for scope operations. 
class DeepChainMap(ChainMap[_KT, _VT]):
    """
    Variant of ChainMap that allows direct updates to inner scopes.

    Writes and deletions act on the first mapping that already holds the
    key, not always on the front mapping.

    Only works when all passed mapping are mutable.
    """

    def __setitem__(self, key: _KT, value: _VT) -> None:
        """Assign in the first mapping holding `key`, else in the front mapping."""
        owner = next(
            (layer for layer in self.maps if key in layer),
            self.maps[0],
        )
        owner[key] = value

    def __delitem__(self, key: _KT) -> None:
        """
        Delete `key` from the first mapping that holds it.

        Raises
        ------
        KeyError
            If `key` doesn't exist.
        """
        owner = next((layer for layer in self.maps if key in layer), None)
        if owner is None:
            raise KeyError(key)
        del owner[key]
class Scope:
    """
    Object to hold scope, with a few bells to deal with some custom syntax
    and contexts added by pandas.

    Parameters
    ----------
    level : int
    global_dict : dict or None, optional, default None
    local_dict : dict or Scope or None, optional, default None
    resolvers : list-like or None, optional, default None
    target : object

    Attributes
    ----------
    level : int
    scope : DeepChainMap
    target : object
    temps : dict
    """

    __slots__ = ["level", "scope", "target", "resolvers", "temps"]
    level: int
    scope: DeepChainMap
    resolvers: DeepChainMap
    temps: dict

    def __init__(
        self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
    ) -> None:
        self.level = level + 1

        # shallow copy because we don't want to keep filling this up with what
        # was there before if there are multiple calls to Scope/_ensure_scope
        self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
        self.target = target

        if isinstance(local_dict, Scope):
            self.scope.update(local_dict.scope)
            if local_dict.target is not None:
                self.target = local_dict.target
            self._update(local_dict.level)

        # NOTE: the frame lookup depends on the call depth of this method, so
        # no extra function call may be inserted between here and the caller.
        frame = sys._getframe(self.level)
        try:
            # shallow copy here because we don't want to replace what's in
            # scope when we align terms (alignment accesses the underlying
            # numpy array of pandas objects)
            scope_global = self.scope.new_child(
                (global_dict if global_dict is not None else frame.f_globals).copy()
            )
            self.scope = DeepChainMap(scope_global)
            if not isinstance(local_dict, Scope):
                scope_local = self.scope.new_child(
                    (local_dict if local_dict is not None else frame.f_locals).copy()
                )
                self.scope = DeepChainMap(scope_local)
        finally:
            del frame

        # assumes that resolvers are going from outermost scope to inner
        if isinstance(local_dict, Scope):
            resolvers += tuple(local_dict.resolvers.maps)
        self.resolvers = DeepChainMap(*resolvers)
        self.temps = {}

    def __repr__(self) -> str:
        scope_keys = _get_pretty_string(list(self.scope.keys()))
        res_keys = _get_pretty_string(list(self.resolvers.keys()))
        return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"

    @property
    def has_resolvers(self) -> bool:
        """
        Return whether we have any extra scope.

        For example, DataFrames pass Their columns as resolvers during calls to
        ``DataFrame.eval()`` and ``DataFrame.query()``.

        Returns
        -------
        hr : bool
        """
        return bool(len(self.resolvers))

    def resolve(self, key: str, is_local: bool):
        """
        Resolve a variable name in a possibly local context.

        Parameters
        ----------
        key : str
            A variable name
        is_local : bool
            Flag indicating whether the variable is local or not (prefixed with
            the '@' symbol)

        Returns
        -------
        value : object
            The value of a particular variable

        Raises
        ------
        UndefinedVariableError
            If `key` is found neither in the consulted mapping nor in the
            temporaries.
        """
        try:
            # only look for locals in outer scope
            if is_local:
                return self.scope[key]

            # not a local variable so check in resolvers if we have them
            if self.has_resolvers:
                return self.resolvers[key]

            # if we're here that means that we have no locals and we also have
            # no resolvers
            assert not is_local and not self.has_resolvers
            return self.scope[key]
        except KeyError:
            try:
                # last ditch effort we look in temporaries
                # these are created when parsing indexing expressions
                # e.g., df[df > 0]
                return self.temps[key]
            except KeyError as err:
                raise UndefinedVariableError(key, is_local) from err

    def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
        """
        Replace a variable name, with a potentially new value.

        The first mapping that contains `old_key` receives
        ``mapping[new_key] = new_value``. Mappings are searched in the order
        resolvers (if any), scope, temporaries. Nothing happens when no
        mapping holds `old_key`.

        Parameters
        ----------
        old_key : str
            Current variable name to replace
        new_key : str
            New variable name to replace `old_key` with
        new_value : object
            Value to be replaced along with the possible renaming
        """
        # Copy the scope's list of maps: appending to ``self.scope.maps``
        # itself would permanently add ``self.temps`` to the scope chain on
        # every call made without resolvers.
        search_order = list(self.scope.maps)
        if self.has_resolvers:
            search_order = self.resolvers.maps + search_order
        search_order.append(self.temps)

        for mapping in search_order:
            if old_key in mapping:
                mapping[new_key] = new_value
                return

    def _get_vars(self, stack, scopes: list[str]) -> None:
        """
        Get specifically scoped variables from a list of stack frames.

        Parameters
        ----------
        stack : list
            A list of stack frames as returned by ``inspect.stack()``
        scopes : sequence of strings
            A sequence containing valid stack frame attribute names that
            evaluate to a dictionary. For example, ('locals', 'globals')
        """
        variables = itertools.product(scopes, stack)
        for scope, (frame, _, _, _, _, _) in variables:
            try:
                d = getattr(frame, f"f_{scope}")
                self.scope = DeepChainMap(self.scope.new_child(d))
            finally:
                # won't remove it, but DECREF it
                # in Py3 this probably isn't necessary since frame won't be
                # scope after the loop
                del frame

    def _update(self, level: int) -> None:
        """
        Update the current scope by going back `level` levels.

        Parameters
        ----------
        level : int
        """
        sl = level + 1

        # add sl frames to the scope starting with the
        # most distant and overwriting with more current
        # makes sure that we can capture variable scope
        stack = inspect.stack()

        try:
            self._get_vars(stack[:sl], scopes=["locals"])
        finally:
            # explicitly delete the stack according to the advice here:
            # https://docs.python.org/3/library/inspect.html#inspect.Traceback
            del stack[:], stack

    def add_tmp(self, value) -> str:
        """
        Add a temporary variable to the scope.

        Parameters
        ----------
        value : object
            An arbitrary object to be assigned to a temporary variable.

        Returns
        -------
        str
            The name of the temporary variable created.
        """
        name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}"

        # add to inner most scope
        assert name not in self.temps
        self.temps[name] = value
        assert name in self.temps

        # only increment if the variable gets put in the scope
        return name

    @property
    def ntemps(self) -> int:
        """The number of temporary variables in this scope"""
        return len(self.temps)

    @property
    def full_scope(self) -> DeepChainMap:
        """
        Return the full scope for use with passing to engines transparently
        as a mapping.

        Returns
        -------
        vars : DeepChainMap
            All variables in this scope.
        """
        maps = [self.temps] + self.resolvers.maps + self.scope.maps
        return DeepChainMap(*maps)
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe functionality.""" ================================================ FILE: modin/core/dataframe/algebra/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Modin Dataframe algebra (core operators).""" from .binary import Binary from .fold import Fold from .groupby import GroupByReduce from .map import Map from .operator import Operator from .reduce import Reduce from .tree_reduce import TreeReduce __all__ = [ "Operator", "Map", "TreeReduce", "Reduce", "Fold", "Binary", "GroupByReduce", ] ================================================ FILE: modin/core/dataframe/algebra/binary.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
def maybe_compute_dtypes_common_cast(
    first: PandasQueryCompiler,
    second: Union[PandasQueryCompiler, dict, list, tuple, np.ndarray, str, DtypeObj],
    trigger_computations: bool = False,
    axis: int = 0,
    func: Optional[
        Callable[[pandas.DataFrame, pandas.DataFrame], pandas.DataFrame]
    ] = None,
) -> Optional[pandas.Series]:
    """
    Precompute data types for binary operations by finding common type between operands.

    Parameters
    ----------
    first : PandasQueryCompiler
        First operand for which the binary operation would be performed later.
    second : PandasQueryCompiler, dict, list, tuple, np.ndarray, str or DtypeObj
        Second operand for which the binary operation would be performed later.
    trigger_computations : bool, default: False
        Whether to trigger computation of the lazy metadata for `first` and `second`.
        If False is specified this method will return None if any of the operands doesn't
        have materialized dtypes.
    axis : int, default: 0
        Axis to perform the binary operation along.
    func : callable(pandas.DataFrame, pandas.DataFrame) -> pandas.DataFrame, optional
        If specified, will use this function to perform the "try_sample" method
        (see ``Binary.register()`` docs for more details).

    Returns
    -------
    pandas.Series or None
        The pandas series with precomputed dtypes or None if there's not enough metadata to compute it.

    Raises
    ------
    NotImplementedError
        If `second` is none of: query compiler, dict, list, tuple, scalar or ``np.ndarray``.

    Notes
    -----
    The dtypes of the operands are supposed to be known.
    """
    if not trigger_computations:
        if not first.frame_has_materialized_dtypes:
            return None

        if isinstance(second, type(first)) and not second.frame_has_materialized_dtypes:
            return None

    dtypes_first = first.dtypes.to_dict()
    if isinstance(second, type(first)):
        dtypes_second = second.dtypes.to_dict()
        columns_first = set(first.columns)
        columns_second = set(second.columns)
        common_columns = columns_first.intersection(columns_second)
        # Here we want to XOR the sets in order to find the columns that do not
        # belong to the intersection, these will be NaN columns in the result
        mismatch_columns = columns_first ^ columns_second
    elif isinstance(second, dict):
        # each dict value contributes the dtype of its Python type
        dtypes_second = {
            key: pandas.api.types.pandas_dtype(type(value))
            for key, value in second.items()
        }
        columns_first = set(first.columns)
        columns_second = set(second.keys())
        common_columns = columns_first.intersection(columns_second)
        # Here we want to find the difference between the sets in order to find columns
        # that are missing in the dictionary, this will be NaN columns in the result
        mismatch_columns = columns_first.difference(columns_second)
    else:
        if isinstance(second, (list, tuple)):
            second_dtypes_list = (
                [pandas.api.types.pandas_dtype(type(value)) for value in second]
                if axis == 1
                # Here we've been given a column so it has only one dtype,
                # Inferring the dtype using `np.array`, TODO: maybe there's more efficient way?
                else [np.array(second).dtype] * len(dtypes_first)
            )
        elif is_scalar(second) or isinstance(second, np.ndarray):
            try:
                dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype(
                    type(second)
                )
            except TypeError:
                # ``pandas_dtype`` could not interpret the Python type of `second`
                # ("dtype ... not understood"), so let ``pandas.Series`` infer it
                dtype = pandas.Series(second).dtype
            second_dtypes_list = [dtype] * len(dtypes_first)
        else:
            raise NotImplementedError(
                f"Can't compute common type for {type(first)} and {type(second)}."
            )
        # We verify operands shapes at the front-end, invalid operands shouldn't be
        # propagated to the query compiler level
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=len(second_dtypes_list) != len(dtypes_first),
            extra_log="Shapes of the operands of a binary operation don't match",
        )
        # pair the dtypes of `second` positionally with the columns of `first`
        dtypes_second = {
            key: second_dtypes_list[idx] for idx, key in enumerate(dtypes_first.keys())
        }
        common_columns = first.columns
        mismatch_columns = []

    # If at least one column doesn't match, the result of the non matching column would be nan.
    nan_dtype = pandas.api.types.pandas_dtype(type(np.nan))
    dtypes = None
    if func is not None:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                # one-row sample frames carrying the operands' dtypes for the common columns
                df1 = pandas.DataFrame([[1] * len(common_columns)]).astype(
                    {i: dtypes_first[col] for i, col in enumerate(common_columns)}
                )
                df2 = pandas.DataFrame([[1] * len(common_columns)]).astype(
                    {i: dtypes_second[col] for i, col in enumerate(common_columns)}
                )
                dtypes = func(df1, df2).dtypes.set_axis(common_columns)
        # it sometimes doesn't work correctly with strings, so falling back to
        # the "common_cast" method in this case
        except TypeError:
            pass
    if dtypes is None:
        # "common_cast": per-column common type of both operands
        dtypes = pandas.Series(
            [
                pandas.core.dtypes.cast.find_common_type(
                    [
                        dtypes_first[x],
                        dtypes_second[x],
                    ]
                )
                for x in common_columns
            ],
            index=common_columns,
        )
    # columns present in only one operand get the NaN dtype
    dtypes: pandas.Series = pandas.concat(
        [
            dtypes,
            pandas.Series(
                [nan_dtype] * (len(mismatch_columns)),
                index=mismatch_columns,
            ),
        ]
    )
    return dtypes
dtype : DtypeObj Dtype of the result. trigger_computations : bool, default: False Whether to trigger computation of the lazy metadata for `first` and `second`. If False is specified this method will return None if any of the operands doesn't have materialized columns. Returns ------- pandas.Series or None The pandas series with precomputed dtypes or None if there's not enough metadata to compute it. Notes ----- Finds a union of columns and finds dtypes for all these columns. """ if not trigger_computations: if not first.frame_has_columns_cache: return None if isinstance(second, type(first)) and not second.frame_has_columns_cache: return None columns_first = set(first.columns) if isinstance(second, type(first)): columns_second = set(second.columns) columns_union = columns_first.union(columns_second) else: columns_union = columns_first dtypes = pandas.Series([dtype] * len(columns_union), index=columns_union) return dtypes def try_compute_new_dtypes( first: PandasQueryCompiler, second: Union[PandasQueryCompiler, Any], infer_dtypes: Optional[str] = None, result_dtype: Optional[Union[DtypeObj, str]] = None, axis: int = 0, func: Optional[ Callable[[pandas.DataFrame, pandas.DataFrame], pandas.DataFrame] ] = None, ) -> Optional[pandas.Series]: """ Precompute resulting dtypes of the binary operation if possible. The dtypes won't be precomputed if any of the operands doesn't have their dtypes materialized or if the second operand type is not supported. Supported types: PandasQueryCompiler, list, dict, tuple, np.ndarray. Parameters ---------- first : PandasQueryCompiler First operand of the binary operation. second : PandasQueryCompiler, list-like or scalar Second operand of the binary operation. infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None How dtypes should be infered (see ``Binary.register`` doc for more info). result_dtype : np.dtype, optional NumPy dtype of the result. If not specified it will be inferred from the `infer_dtypes` parameter. 
class Binary(Operator):
    """Builder class for Binary operator."""

    @classmethod
    def register(
        cls,
        func: Callable[..., pandas.DataFrame],
        join_type: str = "outer",
        sort: bool = None,
        labels: str = "replace",
        infer_dtypes: Optional[str] = None,
    ) -> Callable[..., PandasQueryCompiler]:
        """
        Build template binary operator.

        Parameters
        ----------
        func : callable(pandas.DataFrame, [pandas.DataFrame, list-like, scalar]) -> pandas.DataFrame
            Binary function to execute. Have to be able to accept at least two arguments.
        join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer'
            Type of join that will be used if indices of operands are not aligned.
        sort : bool, default: None
            Whether to sort index and columns or not.
        labels : {"keep", "replace", "drop"}, default: "replace"
            Whether keep labels from left Modin DataFrame, replace them with labels
            from joined DataFrame or drop altogether to make them be computed lazily later.
        infer_dtypes : {"common_cast", "try_sample", "bool", None}, default: None
            How dtypes should be inferred.
                * If "common_cast", casts to common dtype of operand columns.
                * If "try_sample", creates small pandas DataFrames with dtypes of operands and
                  runs the `func` on them to determine output dtypes. If a ``TypeError`` is raised
                  during this process, fallback to "common_cast" method.
                * If "bool", dtypes would be a boolean series with same size as that of operands.
                * If ``None``, do not infer new dtypes (they will be computed manually once accessed).

        Returns
        -------
        callable
            Function that takes query compiler and executes binary operation.
        """

        def caller(
            query_compiler: PandasQueryCompiler,
            other: Union[PandasQueryCompiler, Any],
            broadcast: bool = False,
            *args: tuple,
            dtypes: Optional[Union[DtypeObj, str]] = None,
            **kwargs: dict,
        ) -> PandasQueryCompiler:
            """
            Apply binary `func` to passed operands.

            Parameters
            ----------
            query_compiler : PandasQueryCompiler
                Left operand of `func`.
            other : PandasQueryCompiler, list-like object or scalar
                Right operand of `func`.
            broadcast : bool, default: False
                If `other` is a one-column query compiler, indicates whether it is a Series or not.
                Frames and Series have to be processed differently, however we can't distinguish them
                at the query compiler level, so this parameter is a hint that passed from a high level API.
            *args : tuple,
                Arguments that will be passed to `func`.
            dtypes : "copy", scalar dtype or None, default: None
                Dtypes of the result. "copy" to keep old dtypes and None to compute them on demand.
            **kwargs : dict,
                Arguments that will be passed to `func`.

            Returns
            -------
            PandasQueryCompiler
                Result of binary function.
            """
            axis: int = kwargs.get("axis", 0)
            left_frame = query_compiler._modin_frame

            if isinstance(other, type(query_compiler)) and broadcast:
                assert (
                    len(other.columns) == 1
                ), "Invalid broadcast argument for `broadcast_apply`, too many columns: {}".format(
                    len(other.columns)
                )
                # Transpose on `axis=1` because we always represent an individual
                # column or row as a single-column Modin DataFrame
                if axis == 1:
                    other = other.transpose()

            if dtypes != "copy":
                dtypes = try_compute_new_dtypes(
                    query_compiler, other, infer_dtypes, dtypes, axis, func
                )

            shape_hint = None

            # Right operand is not a query compiler: list-likes need the full axis,
            # everything else is applied partition by partition.
            if not isinstance(other, type(query_compiler)):
                # TODO: it's possible to chunk the `other` and broadcast them to partitions
                # accordingly, in that way we will be able to use more efficient `._modin_frame.map()`
                if isinstance(other, (dict, list, np.ndarray, pandas.Series)):
                    result_frame = left_frame.apply_full_axis(
                        axis,
                        lambda df: func(df, other, *args, **kwargs),
                        new_index=query_compiler.index,
                        new_columns=query_compiler.columns,
                        dtypes=dtypes,
                    )
                else:
                    if (
                        query_compiler.frame_has_materialized_columns
                        and len(query_compiler._modin_frame.columns) == 1
                        and is_scalar(other)
                    ):
                        shape_hint = "column"
                    result_frame = left_frame.map(
                        func,
                        func_args=(other, *args),
                        func_kwargs=kwargs,
                        dtypes=dtypes,
                        lazy=True,
                    )
                return query_compiler.__constructor__(
                    result_frame, shape_hint=shape_hint
                )

            # Both operands are query compilers. The result is known to be a single
            # column only when both already expose the very same one-label columns.
            if (
                query_compiler.frame_has_materialized_columns
                and other.frame_has_materialized_columns
                and len(query_compiler.columns) == 1
                and len(other.columns) == 1
                and query_compiler.columns.equals(other.columns)
            ):
                shape_hint = "column"

            if broadcast:
                result_frame = left_frame.broadcast_apply(
                    axis,
                    lambda left, right: func(left, right.squeeze(), *args, **kwargs),
                    other._modin_frame,
                    join_type=join_type,
                    labels=labels,
                    dtypes=dtypes,
                )
            else:
                result_frame = left_frame.n_ary_op(
                    lambda x, y: func(x, y, *args, **kwargs),
                    [other._modin_frame],
                    join_type=join_type,
                    sort=sort,
                    labels=labels,
                    dtypes=dtypes,
                )
            return query_compiler.__constructor__(result_frame, shape_hint=shape_hint)

        return caller
"""Module default2pandas provides templates for a query compiler default-to-pandas methods.""" from .binary import BinaryDefault from .cat import CatDefault from .dataframe import DataFrameDefault from .datetime import DateTimeDefault from .default import DefaultMethod from .groupby import GroupByDefault, SeriesGroupByDefault from .list import ListDefault from .resample import ResampleDefault from .rolling import ExpandingDefault, RollingDefault from .series import SeriesDefault from .str import StrDefault from .struct import StructDefault __all__ = [ "DataFrameDefault", "DateTimeDefault", "SeriesDefault", "StrDefault", "BinaryDefault", "ResampleDefault", "RollingDefault", "ExpandingDefault", "DefaultMethod", "CatDefault", "GroupByDefault", "SeriesGroupByDefault", "ListDefault", "StructDefault", ] ================================================ FILE: modin/core/dataframe/algebra/default2pandas/binary.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class BinaryDefault(DefaultMethod):
    """Build default-to-pandas methods which executes binary functions."""

    @classmethod
    def build_default_to_pandas(cls, fn, fn_name):
        """
        Build function that do fallback to pandas for passed binary `fn`.

        Parameters
        ----------
        fn : callable
            Binary function to apply to the casted to pandas frame and other operand.
        fn_name : str
            Function name which will be shown in default-to-pandas warning message.

        Returns
        -------
        callable
            Function that takes query compiler, does fallback to pandas and applies binary `fn`
            to the casted to pandas frame.
        """

        def bin_ops_wrapper(df, other, *args, **kwargs):
            """
            Apply specified binary function to the passed operands.

            The ``broadcast``, ``squeeze_other`` and ``squeeze_self`` keyword
            arguments are consumed here and never forwarded to `fn`.
            """
            # Pop every control flag unconditionally: chaining the pops with `or`
            # short-circuits and would leave `squeeze_other` in `kwargs` (and thus
            # forward it to `fn`) whenever `broadcast` is truthy.
            broadcast = kwargs.pop("broadcast", False)
            squeeze_other = kwargs.pop("squeeze_other", False)
            squeeze_self = kwargs.pop("squeeze_self", False)

            if broadcast or squeeze_other:
                other = other.squeeze(axis=1)

            if squeeze_self:
                df = df.squeeze(axis=1)

            result = fn(df, other, *args, **kwargs)
            # Wrap list-like results that are not pandas objects into a frame;
            # scalars and Series/DataFrame results are returned as they are.
            if not isinstance(
                result, (pandas.Series, pandas.DataFrame)
            ) and is_list_like(result):
                result = pandas.DataFrame(result)
            return result

        return super().build_default_to_pandas(bin_ops_wrapper, fn_name)
class CatDefault(SeriesDefault):
    """Builder for default-to-pandas methods which is executed under category accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Squeeze the passed frame along the columns and return its ``.cat`` accessor.

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        pandas.core.arrays.categorical.CategoricalAccessor
        """
        squeezed = df.squeeze(axis=1)
        return squeezed.cat
class DateTimeDefault(SeriesDefault):
    """Builder for default-to-pandas methods which is executed under datetime accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Squeeze the passed frame along the columns and return its ``.dt`` accessor.

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        pandas.core.indexes.accessors.DatetimeProperties
        """
        squeezed = df.squeeze(axis=1)
        return squeezed.dt
class DefaultMethod(Operator):
    """
    Builder for default-to-pandas methods.

    Attributes
    ----------
    OBJECT_TYPE : str
        Object type name that will be shown in default-to-pandas warning message.
    DEFAULT_OBJECT_TYPE : object
        Default place to search for a function.
    """

    OBJECT_TYPE = "DataFrame"
    DEFAULT_OBJECT_TYPE = ObjTypeDeterminer

    @classmethod
    def register(cls, func, obj_type=None, inplace=None, fn_name=None):
        """
        Build function that do fallback to default pandas implementation for passed `func`.

        Parameters
        ----------
        func : callable or str,
            Function to apply to the casted to pandas frame or its property accessed
            by ``cls.frame_wrapper``.
        obj_type : object, optional
            If `func` is a string with a function name then `obj_type` provides an
            object to search function in.
        inplace : bool, optional
            If True return an object to which `func` was applied, otherwise return
            the result of `func`.
        fn_name : str, optional
            Function name which will be shown in default-to-pandas warning message.
            If not specified, name will be deducted from `func`.

        Returns
        -------
        callable
            Function that takes query compiler, does fallback to pandas and applies `func`
            to the casted to pandas frame or its property accessed by ``cls.frame_wrapper``.
        """
        if isinstance(func, str):
            if obj_type is None:
                # NOTE(review): the base-class default is the ``ObjTypeDeterminer`` class
                # itself, whose ``__getattr__`` only serves instances - confirm that the
                # base default is never combined with a string `func`.
                obj_type = cls.DEFAULT_OBJECT_TYPE
            fn = getattr(obj_type, func)
        else:
            fn = func

        if type(fn) is property:
            if fn_name is None and hasattr(fn, "fget"):
                # When `fn` is a property, `str(fn)` is just the generic repr of a
                # property object (including its memory address), which tells nothing
                # about the property. We instead check its `fget` method to get
                # the name of the property.
                # Note that this method is still imperfect because we cannot get the class name
                # of the property. For example, we can only get "hour" from `Series.dt.hour`.
                fn_name = f"<property fget:{getattr(fn.fget, '__name__', 'noname')}>"
            fn = cls.build_property_wrapper(fn)
        else:
            fn_name = getattr(fn, "__name__", str(fn)) if fn_name is None else fn_name

        # Functions whose non-pandas results have to be returned as they are,
        # without being wrapped into a DataFrame.
        raw_result_funcs = (
            "to_numpy",
            pandas.DataFrame.to_numpy,
            "align",
            pandas.DataFrame.align,
            "divmod",
            pandas.Series.divmod,
            "rdivmod",
            pandas.Series.rdivmod,
            "to_list",
            pandas.Series.to_list,
            "corr",
            pandas.Series.corr,
            "to_dict",
            pandas.Series.to_dict,
            "mean",
            pandas.DataFrame.mean,
            "median",
            pandas.DataFrame.median,
            "skew",
            pandas.DataFrame.skew,
            "kurt",
            pandas.DataFrame.kurt,
        )
        keep_raw_result = func in raw_result_funcs

        def applyier(df, *args, **kwargs):
            """
            Apply target function to the casted to pandas frame.

            This function is directly applied to the casted to pandas frame, executes target
            function under it and processes result so it is possible to create a valid
            query compiler from it.
            """
            # pandas default implementation doesn't know how to handle `dtypes` keyword argument
            kwargs.pop("dtypes", None)
            df = cls.frame_wrapper(df)
            result = fn(df, *args, **kwargs)

            if (
                not isinstance(result, (pandas.Series, pandas.DataFrame))
                and not keep_raw_result
            ):
                # When applying a DatetimeProperties or TimedeltaProperties function,
                # if we don't specify the dtype for the DataFrame, the frame might
                # get the wrong dtype, e.g. for to_pydatetime in
                # https://github.com/modin-project/modin/issues/4436
                astype_kwargs = {}
                dtype = getattr(result, "dtype", None)
                if dtype and isinstance(
                    df,
                    (
                        pandas.core.indexes.accessors.DatetimeProperties,
                        pandas.core.indexes.accessors.TimedeltaProperties,
                    ),
                ):
                    astype_kwargs["dtype"] = dtype
                result = (
                    pandas.DataFrame(result, **astype_kwargs)
                    if is_list_like(result)
                    else pandas.DataFrame([result], **astype_kwargs)
                )
            if isinstance(result, pandas.Series):
                # A frame column needs a label, so unnamed Series get the reserved one
                if result.name is None:
                    result.name = MODIN_UNNAMED_SERIES_LABEL
                result = result.to_frame()

            # An explicit `inplace` passed to `register` overrides the keyword argument
            inplace_method = kwargs.get("inplace", False)
            if inplace is not None:
                inplace_method = inplace
            return result if not inplace_method else df

        return cls.build_wrapper(applyier, fn_name)

    @classmethod
    # FIXME: this method is almost a duplicate of `cls.build_default_to_pandas`.
    # Those two methods should be merged into a single one.
    def build_wrapper(cls, fn, fn_name):
        """
        Build function that do fallback to pandas for passed `fn`.

        In comparison with ``cls.build_default_to_pandas`` this method also
        casts function arguments to pandas before doing fallback.

        Parameters
        ----------
        fn : callable
            Function to apply to the defaulted frame.
        fn_name : str
            Function name which will be shown in default-to-pandas warning message.

        Returns
        -------
        callable
            Method that does fallback to pandas and applies `fn` to the pandas frame.
        """
        wrapper = cls.build_default_to_pandas(fn, fn_name)

        def args_cast(self, *args, **kwargs):
            """
            Preprocess `default_to_pandas` function arguments and apply default function.

            Cast all Modin objects that function arguments contain to its pandas representation.
            """
            args = try_cast_to_pandas(args)
            kwargs = try_cast_to_pandas(kwargs)
            return wrapper(self, *args, **kwargs)

        return args_cast

    @classmethod
    def build_property_wrapper(cls, prop):
        """
        Build function that accesses specified property of the frame.

        Parameters
        ----------
        prop : property
            Property object whose getter (``fget``) will be called on the frame.

        Returns
        -------
        callable
            Function that takes DataFrame and returns its value of `prop` property.
        """

        def property_wrapper(df):
            """Get specified property of the passed object."""
            return prop.fget(df)

        return property_wrapper

    @classmethod
    def build_default_to_pandas(cls, fn, fn_name):
        """
        Build function that do fallback to pandas for passed `fn`.

        Parameters
        ----------
        fn : callable
            Function to apply to the defaulted frame. Its ``__name__`` is overwritten
            with a label built from ``cls.OBJECT_TYPE`` and `fn_name`.
        fn_name : str
            Function name which will be shown in default-to-pandas warning message.

        Returns
        -------
        callable
            Method that does fallback to pandas and applies `fn` to the pandas frame.
        """
        # Give the function a descriptive name made of the object type and `fn_name`
        fn.__name__ = f"<function {cls.OBJECT_TYPE}.{fn_name}>"

        def wrapper(self, *args, **kwargs):
            """Do fallback to pandas for the specified function."""
            return self.default_to_pandas(fn, *args, **kwargs)

        return wrapper

    @classmethod
    def frame_wrapper(cls, df):
        """
        Extract frame property to apply function on.

        This method is executed under casted to pandas frame right before applying
        a function passed to `register`, which gives an ability to transform frame somehow
        or access its properties, by overriding this method in a child class.

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame

        Notes
        -----
        Being a base implementation, this particular method does nothing with passed frame.
        """
        return df
class GroupBy: """Builder for GroupBy aggregation functions.""" agg_aliases = [ "agg", "dict_agg", pandas.core.groupby.DataFrameGroupBy.agg, pandas.core.groupby.DataFrameGroupBy.aggregate, ] @staticmethod def is_transformation_kernel(agg_func: Any) -> bool: """ Check whether a passed aggregation function is a transformation. Transformation means that the result of the function will be broadcasted to the frame's original shape. Parameters ---------- agg_func : Any Returns ------- bool """ return hashable(agg_func) and agg_func in transformation_kernels.union( # these methods are also producing transpose-like result in a sense we understand it # (they're non-aggregative functions), however are missing in the pandas dictionary {"nth", "head", "tail"} ) @classmethod def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01 """Call .groupby() on passed `df`.""" with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) return df.groupby(*args, **kwargs) @classmethod def validate_by(cls, by): """ Build valid `by` parameter for `pandas.DataFrame.groupby`. Cast all DataFrames in `by` parameter to Series or list of Series in case of multi-column frame. Parameters ---------- by : DateFrame, Series, index label or list of such Object which indicates groups for GroupBy. Returns ------- Series, index label or list of such By parameter with all DataFrames casted to Series. 
""" def try_cast_series(df): """Cast one-column frame to Series.""" if isinstance(df, pandas.DataFrame): df = df.squeeze(axis=1) if not isinstance(df, pandas.Series): return df if df.name == MODIN_UNNAMED_SERIES_LABEL: df.name = None return df if isinstance(by, pandas.DataFrame): by = [try_cast_series(column) for _, column in by.items()] elif isinstance(by, pandas.Series): by = [try_cast_series(by)] elif isinstance(by, list): by = [try_cast_series(o) for o in by] return by @classmethod def inplace_applyier_builder(cls, key, func=None): """ Bind actual aggregation function to the GroupBy aggregation method. Parameters ---------- key : callable Function that takes GroupBy object and evaluates passed aggregation function. func : callable or str, optional Function that takes DataFrame and aggregate its data. Will be applied to each group at the grouped frame. Returns ------- callable, Function that executes aggregation under GroupBy object. """ inplace_args = [] if func is None else [func] def inplace_applyier(grp, *func_args, **func_kwargs): return key(grp, *inplace_args, *func_args, **func_kwargs) return inplace_applyier @classmethod def get_func(cls, key, **kwargs): """ Extract aggregation function from groupby arguments. Parameters ---------- key : callable or str Default aggregation function. If aggregation function is not specified via groupby arguments, then `key` function is used. **kwargs : dict GroupBy arguments that may contain aggregation function. Returns ------- callable Aggregation function. Notes ----- There are two ways of how groupby aggregation can be invoked: 1. Explicitly with query compiler method: `qc.groupby_sum()`. 2. By passing aggregation function as an argument: `qc.groupby_agg("sum")`. Both are going to produce the same result, however in the first case actual aggregation function can be extracted from the method name, while for the second only from the method arguments. 
""" if "agg_func" in kwargs: return cls.inplace_applyier_builder(key, kwargs["agg_func"]) elif "func_dict" in kwargs: return cls.inplace_applyier_builder(key, kwargs["func_dict"]) else: return cls.inplace_applyier_builder(key) @classmethod def build_aggregate_method(cls, key): """ Build function for `QueryCompiler.groupby_agg` that can be executed as default-to-pandas. Parameters ---------- key : callable or str Default aggregation function. If aggregation function is not specified via groupby arguments, then `key` function is used. Returns ------- callable Function that executes groupby aggregation. """ def fn( df, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, **kwargs, ): """Group DataFrame and apply aggregation function to each group.""" by = cls.validate_by(by) grp = cls._call_groupby(df, by, axis=axis, **groupby_kwargs) agg_func = cls.get_func(key, **kwargs) result = agg_func(grp, *agg_args, **agg_kwargs) return result return fn @classmethod def build_groupby_reduce_method(cls, agg_func): """ Build function for `QueryCompiler.groupby_*` that can be executed as default-to-pandas. Parameters ---------- agg_func : callable or str Default aggregation function. If aggregation function is not specified via groupby arguments, then `agg_func` function is used. Returns ------- callable Function that executes groupby aggregation. 
""" def fn( df, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, **kwargs ): """Group DataFrame and apply aggregation function to each group.""" if not isinstance(by, (pandas.Series, pandas.DataFrame)): by = cls.validate_by(by) grp = cls._call_groupby(df, by, axis=axis, **groupby_kwargs) grp_agg_func = cls.get_func(agg_func, **kwargs) return grp_agg_func( grp, *agg_args, **agg_kwargs, ) if isinstance(by, pandas.DataFrame): by = by.squeeze(axis=1) if ( drop and isinstance(by, pandas.Series) and by.name in df and df[by.name].equals(by) ): by = [by.name] if isinstance(by, pandas.DataFrame): df = pandas.concat([df] + [by[[o for o in by if o not in df]]], axis=1) by = list(by.columns) groupby_kwargs = groupby_kwargs.copy() as_index = groupby_kwargs.pop("as_index", True) groupby_kwargs["as_index"] = True grp = cls._call_groupby(df, by, axis=axis, **groupby_kwargs) func = cls.get_func(agg_func, **kwargs) result = func(grp, *agg_args, **agg_kwargs) method = kwargs.get("method") if isinstance(result, pandas.Series): result = result.to_frame( MODIN_UNNAMED_SERIES_LABEL if result.name is None else result.name ) if not as_index: if isinstance(by, pandas.Series): # 1. If `drop` is True then 'by' Series represents a column from the # source frame and so the 'by' is internal. # 2. If method is 'size' then any 'by' is considered to be internal. 
# This is a hacky legacy from the ``groupby_size`` implementation: # https://github.com/modin-project/modin/issues/3739 internal_by = (by.name,) if drop or method == "size" else tuple() else: internal_by = by cls.handle_as_index_for_dataframe( result, internal_by, by_cols_dtypes=( df.index.dtypes.values if isinstance(df.index, pandas.MultiIndex) else (df.index.dtype,) ), by_length=len(by), drop=drop, method=method, inplace=True, ) if result.index.name == MODIN_UNNAMED_SERIES_LABEL: result.index.name = None return result return fn @classmethod def is_aggregate(cls, key): # noqa: PR01 """Check whether `key` is an alias for pandas.GroupBy.aggregation method.""" return key in cls.agg_aliases @classmethod def build_groupby(cls, func): """ Build function that groups DataFrame and applies aggregation function to the every group. Parameters ---------- func : callable or str Default aggregation function. If aggregation function is not specified via groupby arguments, then `func` function is used. Returns ------- callable Function that takes pandas DataFrame and does GroupBy aggregation. """ if cls.is_aggregate(func): return cls.build_aggregate_method(func) return cls.build_groupby_reduce_method(func) @classmethod def handle_as_index_for_dataframe( cls, result, internal_by_cols, by_cols_dtypes=None, by_length=None, selection=None, partition_idx=0, drop=True, method=None, inplace=False, ): """ Handle `as_index=False` parameter for the passed GroupBy aggregation result. Parameters ---------- result : DataFrame Frame containing GroupBy aggregation result computed with `as_index=True` parameter (group names are located at the frame's index). internal_by_cols : list-like Internal 'by' columns. by_cols_dtypes : list-like, optional Data types of the internal 'by' columns. Required to do special casing in case of categorical 'by'. If not specified, assume that there is no categorical data in 'by'. 
by_length : int, optional Amount of keys to group on (including frame columns and external objects like list, Series, etc.) If not specified, consider `by_length` to be equal ``len(internal_by_cols)``. selection : label or list of labels, optional Set of columns that were explicitly selected for aggregation (for example via dict-aggregation). If not specified assuming that aggregation was applied to all of the available columns. partition_idx : int, default: 0 Positional index of the current partition. drop : bool, default: True Indicates whether or not any of the `by` data came from the same frame. method : str, optional Name of the groupby function. This is a hint to be able to do special casing. Note: this parameter is a legacy from the ``groupby_size`` implementation, it's a hacky one and probably will be removed in the future: https://github.com/modin-project/modin/issues/3739. inplace : bool, default: False Modify the DataFrame in place (do not create a new object). Returns ------- DataFrame GroupBy aggregation result with the considered `as_index=False` parameter. """ if not inplace: result = result.copy() reset_index, drop, lvls_to_drop, cols_to_drop = cls.handle_as_index( result_cols=result.columns, result_index_names=result.index.names, internal_by_cols=internal_by_cols, by_cols_dtypes=by_cols_dtypes, by_length=by_length, selection=selection, partition_idx=partition_idx, drop=drop, method=method, ) if len(lvls_to_drop) > 0: result.index = result.index.droplevel(lvls_to_drop) if len(cols_to_drop) > 0: result.drop(columns=cols_to_drop, inplace=True) if reset_index: result.reset_index(drop=drop, inplace=True) return result @staticmethod def handle_as_index( result_cols, result_index_names, internal_by_cols, by_cols_dtypes=None, by_length=None, selection=None, partition_idx=0, drop=True, method=None, ): """ Compute hints to process ``as_index=False`` parameter for the GroupBy result. 
This function resolves naming conflicts of the index levels to insert and the column labels for the GroupBy result. The logic of this function assumes that the initial GroupBy result was computed as ``as_index=True``. Parameters ---------- result_cols : pandas.Index Columns of the GroupBy result. result_index_names : list-like Index names of the GroupBy result. internal_by_cols : list-like Internal 'by' columns. by_cols_dtypes : list-like, optional Data types of the internal 'by' columns. Required to do special casing in case of categorical 'by'. If not specified, assume that there is no categorical data in 'by'. by_length : int, optional Amount of keys to group on (including frame columns and external objects like list, Series, etc.) If not specified, consider `by_length` to be equal ``len(internal_by_cols)``. selection : label or list of labels, optional Set of columns that were explicitly selected for aggregation (for example via dict-aggregation). If not specified assuming that aggregation was applied to all of the available columns. partition_idx : int, default: 0 Positional index of the current partition. drop : bool, default: True Indicates whether or not any of the `by` data came from the same frame. method : str, optional Name of the groupby function. This is a hint to be able to do special casing. Note: this parameter is a legacy from the ``groupby_size`` implementation, it's a hacky one and probably will be removed in the future: https://github.com/modin-project/modin/issues/3739. Returns ------- reset_index : bool Indicates whether to reset index to the default one (0, 1, 2 ... n) at this partition. drop_index : bool If `reset_index` is True, indicates whether to drop all index levels (True) or insert them into the resulting columns (False). lvls_to_drop : list of ints Contains numeric indices of the levels of the result index to drop as intersected. cols_to_drop : list of labels Contains labels of the columns to drop from the result as intersected. 
Examples -------- >>> groupby_result = compute_groupby_without_processing_as_index_parameter() >>> if not as_index: >>> reset_index, drop, lvls_to_drop, cols_to_drop = handle_as_index(**extract_required_params(groupby_result)) >>> if len(lvls_to_drop) > 0: >>> groupby_result.index = groupby_result.index.droplevel(lvls_to_drop) >>> if len(cols_to_drop) > 0: >>> groupby_result = groupby_result.drop(columns=cols_to_drop) >>> if reset_index: >>> groupby_result_with_processed_as_index_parameter = groupby_result.reset_index(drop=drop) >>> else: >>> groupby_result_with_processed_as_index_parameter = groupby_result """ if by_length is None: by_length = len(internal_by_cols) reset_index = method != "transform" and (by_length > 0 or selection is not None) # If the method is "size" then the result contains only one unique named column # and we don't have to worry about any naming conflicts, so inserting all of # the "by" into the result (just a fast-path) if method == "size": return reset_index, False, [], [] # Pandas logic of resolving naming conflicts is the following: # 1. If any categorical is in 'by' and 'by' is multi-column, then the categorical # index is prioritized: drop intersected columns and insert all of the 'by' index # levels to the frame as columns. # 2. Otherwise, aggregation result is prioritized: drop intersected index levels and # insert the filtered ones to the frame as columns. if by_cols_dtypes is not None: keep_index_levels = ( by_length > 1 and selection is None and any(isinstance(x, pandas.CategoricalDtype) for x in by_cols_dtypes) ) else: keep_index_levels = False # 1. We insert 'by'-columns to the result at the beginning of the frame and so only to the # first partition, if partition_idx != 0 we just drop the index. If there are no columns # that are required to drop (keep_index_levels is True) then we can exit here. # 2. 
We don't insert 'by'-columns to the result if 'by'-data came from a different # frame (drop is False), there's only one exception for this rule: if the `method` is "size", # so if (drop is False) and method is not "size" we just drop the index and so can exit here. if (not keep_index_levels and partition_idx != 0) or ( not drop and method != "size" ): return reset_index, True, [], [] if not isinstance(internal_by_cols, pandas.Index): if not is_list_like(internal_by_cols): internal_by_cols = [internal_by_cols] internal_by_cols = pandas.Index(internal_by_cols) internal_by_cols = ( internal_by_cols[ ~internal_by_cols.str.startswith(MODIN_UNNAMED_SERIES_LABEL, na=False) ] if hasattr(internal_by_cols, "str") else internal_by_cols ) if selection is not None and not isinstance(selection, pandas.Index): selection = pandas.Index(selection) lvls_to_drop = [] cols_to_drop = [] if not keep_index_levels: # We want to insert only these internal-by-cols that are not presented # in the result in order to not create naming conflicts if selection is None: cols_to_insert = frozenset(internal_by_cols) - frozenset(result_cols) else: cols_to_insert = frozenset( # We have to use explicit 'not in' check and not just difference # of sets because of specific '__contains__' operator in case of # scalar 'col' and MultiIndex 'selection'. 
col for col in internal_by_cols if col not in selection ) else: cols_to_insert = internal_by_cols # We want to drop such internal-by-cols that are presented # in the result in order to not create naming conflicts cols_to_drop = frozenset(internal_by_cols) & frozenset(result_cols) if partition_idx == 0: lvls_to_drop = [ i for i, name in enumerate(result_index_names) if name not in cols_to_insert ] else: lvls_to_drop = result_index_names drop = False if len(lvls_to_drop) == len(result_index_names): drop = True lvls_to_drop = [] return reset_index, drop, lvls_to_drop, cols_to_drop class SeriesGroupBy(GroupBy): """Builder for GroupBy aggregation functions for Series.""" @classmethod def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01 """Call .groupby() on passed `df` squeezed to Series.""" # We can end up here by two means - either by "true" call # like Series().groupby() or by df.groupby()[item]. if len(df.columns) == 1: # Series().groupby() case return df.squeeze(axis=1).groupby(*args, **kwargs) # In second case surrounding logic will supplement grouping columns, # so we need to drop them after grouping is over; our originally # selected column is always the first, so use it with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) return df.groupby(*args, **kwargs)[df.columns[0]] class GroupByDefault(DefaultMethod): """Builder for default-to-pandas GroupBy aggregation functions.""" _groupby_cls = GroupBy OBJECT_TYPE = "GroupBy" @classmethod def register(cls, func, **kwargs): """ Build default-to-pandas GroupBy aggregation function. Parameters ---------- func : callable or str Default aggregation function. If aggregation function is not specified via groupby arguments, then `func` function is used. **kwargs : kwargs Additional arguments that will be passed to function builder. Returns ------- callable Functiom that takes query compiler and defaults to pandas to do GroupBy aggregation. 
@classmethod
def get_aggregation_method(cls, how):
    """
    Look up the groupby method that implements the `how` UDF applying strategy.

    Parameters
    ----------
    how : {"axis_wise", "group_wise", "transform", "direct"}
        `how` parameter of the ``BaseQueryCompiler.groupby_agg``. Must be one
        of the keys of ``cls._aggregation_methods_dict``.

    Returns
    -------
    callable(pandas.DataFrameGroupBy, callable, *args, **kwargs) -> [pandas.DataFrame | pandas.Series]

    Notes
    -----
    Visit ``BaseQueryCompiler.groupby_agg`` doc-string for more information about `how` parameter.
    """
    methods_by_strategy = cls._aggregation_methods_dict
    return methods_by_strategy[how]
class Resampler:
    """Factory of functions that resample a frame and aggregate the result."""

    @classmethod
    def build_resample(cls, func, squeeze_self):
        """
        Build function that resamples time-series data and does aggregation.

        Parameters
        ----------
        func : callable or property
            Aggregation to execute under the resampled frame. A ``property``
            object is evaluated through its getter.
        squeeze_self : bool
            Whether or not to squeeze frame before resampling.

        Returns
        -------
        callable
            Function that takes pandas DataFrame and applies aggregation
            to resampled time-series data.
        """
        # `func` never changes after the builder returns, so decide once
        # whether it has to be read through the property getter.
        is_property = type(func) is property

        def fn(df, resample_kwargs, *args, **kwargs):
            """Resample the passed frame and apply the aggregation to the resampler."""
            target = df.squeeze(axis=1) if squeeze_self else df
            resampled = target.resample(**resample_kwargs)
            if is_property:
                return func.fget(resampled)
            return func(resampled, *args, **kwargs)

        return fn
class ExpandingDefault(DefaultMethod):
    """Builder for default-to-pandas aggregation on an expanding window functions."""

    OBJECT_TYPE = "Expanding"

    @classmethod
    def _build_expanding(cls, func, squeeze_self):
        """
        Build function that creates an expanding window and executes `func` on it.

        Parameters
        ----------
        func : callable or property
            Function to execute on an expanding window. A ``property`` object
            is evaluated through its getter.
        squeeze_self : bool
            Whether or not to squeeze frame before executing the window function.

        Returns
        -------
        callable
            Function that takes pandas DataFrame and applies `func` on an expanding window.
        """

        def fn(df, rolling_args, *args, **kwargs):
            """Create expanding window for the passed frame and execute specified `func` on it."""
            # `rolling_args` holds the positional arguments of ``.expanding()``.
            target = df.squeeze(axis=1) if squeeze_self else df
            window = target.expanding(*rolling_args)
            if type(func) is property:
                return func.fget(window)
            return func(window, *args, **kwargs)

        return fn

    @classmethod
    def register(cls, func, squeeze_self=False, **kwargs):
        """
        Build function that do fallback to pandas to apply `func` on an expanding window.

        Parameters
        ----------
        func : callable
            Function to execute on an expanding window.
        squeeze_self : bool, default: False
            Whether or not to squeeze frame before executing the window function.
        **kwargs : kwargs
            Additional arguments that will be passed to function builder.

        Returns
        -------
        callable
            Function that takes query compiler and defaults to pandas to apply aggregation
            `func` on an expanding window.
        """
        expanding_fn = cls._build_expanding(func, squeeze_self=squeeze_self)
        return super().register(expanding_fn, fn_name=func.__name__, **kwargs)
class SeriesDefault(DefaultMethod):
    """Builder for default-to-pandas methods which is executed under Series."""

    OBJECT_TYPE = "Series"

    @classmethod
    def frame_wrapper(cls, df):
        """
        Squeeze the passed DataFrame along the column axis.

        Parameters
        ----------
        df : pandas.DataFrame
            One-column DataFrame to squeeze.

        Returns
        -------
        pandas.Series
        """
        squeezed = df.squeeze(axis=1)
        return squeezed
class StructDefault(SeriesDefault):
    """Builder for default-to-pandas methods which is executed under struct accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get struct accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame that is squeezed along the column axis before the accessor is taken.

        Returns
        -------
        pandas.core.arrays.arrow.StructAccessor
        """
        series = df.squeeze(axis=1)
        return series.struct
class Fold(Operator):
    """Builder class for Fold functions."""

    @classmethod
    def register(
        cls, fold_function: Callable[..., pandas.DataFrame], shape_preserved=False
    ) -> Callable[..., PandasQueryCompiler]:
        """
        Build Fold operator that will be performed across rows/columns.

        Parameters
        ----------
        fold_function : callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
            Function to apply across rows/columns.
        shape_preserved : bool, default: False
            Whether the shape of the dataframe is preserved or not
            after applying a function.

        Returns
        -------
        callable
            Function that takes query compiler and executes Fold function.
        """

        def caller(
            query_compiler: PandasQueryCompiler,
            fold_axis: Optional[int] = None,
            *args: tuple,
            new_index=None,
            new_columns=None,
            **kwargs: dict,
        ) -> PandasQueryCompiler:
            """
            Execute Fold function against passed query compiler.

            Parameters
            ----------
            query_compiler : PandasQueryCompiler
                The query compiler to execute the function on.
            fold_axis : int, optional
                0 or None means apply across full column partitions. 1 means
                apply across full row partitions.
            *args : tuple
                Additional arguments passed to `fold_function`.
            new_index : list-like, optional
                The index of the result.
            new_columns : list-like, optional
                The columns of the result.
            **kwargs : dict
                Additional keyword arguments passed to `fold_function`.

            Returns
            -------
            PandasQueryCompiler
                A new query compiler representing the result of executing the function.
            """
            axis = cls.validate_axis(fold_axis)
            # Bind the extra call arguments so the frame only has to supply
            # the partition data.
            folded_frame = query_compiler._modin_frame.fold(
                axis,
                lambda part: fold_function(part, *args, **kwargs),
                new_index=new_index,
                new_columns=new_columns,
                shape_preserved=shape_preserved,
            )
            return query_compiler.__constructor__(folded_frame)

        return caller
"""Module houses builder class for GroupByReduce operator.""" from __future__ import annotations from typing import TYPE_CHECKING, Callable, Optional, Union import pandas from modin.core.dataframe.pandas.metadata import ModinIndex from modin.error_message import ErrorMessage from modin.utils import MODIN_UNNAMED_SERIES_LABEL, hashable from .default2pandas.groupby import GroupBy, GroupByDefault from .tree_reduce import TreeReduce if TYPE_CHECKING: from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler class GroupByReduce(TreeReduce): """ Builder class for GroupBy aggregation functions. Attributes ---------- ID_LEVEL_NAME : str It's supposed that implementations may produce multiple temporary columns per one source column in an intermediate phase. In order for these columns to be processed accordingly at the Reduce phase, an implementation must store unique names for such temporary columns in the ``ID_LEVEL_NAME`` level. Duplicated names are not allowed. _GROUPBY_REDUCE_IMPL_FLAG : str Attribute indicating that a callable should be treated as an implementation for one of the TreeReduce phases rather than an arbitrary aggregation. Note: this attribute should be considered private. """ ID_LEVEL_NAME: str = "__ID_LEVEL_NAME__" _GROUPBY_REDUCE_IMPL_FLAG: str = "__groupby_reduce_impl_func__" @classmethod def register( cls, map_func: Union[str, dict, Callable[..., pandas.DataFrame]], reduce_func: Optional[Union[str, dict, Callable[..., pandas.DataFrame]]] = None, **call_kwds: dict, ) -> Callable[..., PandasQueryCompiler]: """ Build template GroupBy aggregation function. Resulted function is applied in parallel via TreeReduce algorithm. Parameters ---------- map_func : str, dict or callable(pandas.core.groupby.DataFrameGroupBy) -> pandas.DataFrame Function to apply to the `GroupByObject` at the map phase. If ``str`` was passed it will be treated as a DataFrameGroupBy's method name. 
@classmethod
def map(
    cls,
    df: pandas.DataFrame,
    map_func: Callable[..., pandas.DataFrame],
    axis: int,
    groupby_kwargs: dict,
    agg_args: list,
    agg_kwargs: dict,
    other: Optional[pandas.DataFrame] = None,
    by=None,
    drop: bool = False,
) -> pandas.DataFrame:
    """
    Execute Map phase of GroupByReduce.

    Groups DataFrame and applies map function. Groups will be
    preserved in the results index for the following reduce phase.

    Parameters
    ----------
    df : pandas.DataFrame
        Serialized frame to group.
    map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Function to apply to the `GroupByObject`.
    axis : {0, 1}
        Axis to group and apply aggregation function along. 0 means index axis
        when 1 means column axis.
    groupby_kwargs : dict
        Dictionary which carries arguments for `pandas.DataFrame.groupby`.
        The passed dictionary is not modified; ``as_index`` and ``observed``
        are overridden on an internal copy.
    agg_args : list-like
        Positional arguments to pass to the aggregation functions.
    agg_kwargs : dict
        Keyword arguments to pass to the aggregation functions.
    other : pandas.DataFrame, optional
        Serialized frame, whose columns are used to determine the groups.
        If not specified, `by` parameter is used.
    by : level index name or list of such labels, optional
        Index levels, that is used to determine groups.
        If not specified, `other` parameter is used.
    drop : bool, default: False
        Indicates whether or not by-data came from the `self` frame.

    Returns
    -------
    pandas.DataFrame
        GroupBy aggregation result for one particular partition.
    """
    # Copy first so the caller's dictionary keeps its own `as_index` value.
    groupby_kwargs = groupby_kwargs.copy()
    # Set `as_index` to True to track the metadata of the grouping object
    # It is used to make sure that between phases we are constructing the
    # right index and placing columns in the correct order.
    groupby_kwargs["as_index"] = True
    groupby_kwargs["observed"] = True
    # We have to filter func-dict BEFORE inserting broadcasted 'by' columns
    # to avoid multiple aggregation results for 'by' cols in case they're
    # present in the func-dict:
    apply_func = cls.get_callable(
        map_func,
        df,
        # We won't be able to preserve the order as the Map phase would likely
        # produce some temporary columns that won't fit into the original
        # aggregation order. It doesn't matter much as we restore the original
        # order at the Reduce phase.
        preserve_aggregation_order=False,
    )
    if other is not None:
        # Other is a broadcasted partition that represents 'by' data to group on.
        # If 'drop' then the 'by' data came from the 'self' frame, thus
        # inserting missed columns to the partition to group on them.
        # Note: `other` is squeezed only when `drop` is False (short-circuit).
        if drop or isinstance(
            other := other.squeeze(axis=axis ^ 1), pandas.DataFrame
        ):
            df = pandas.concat(
                [df] + [other[[o for o in other if o not in df]]],
                axis=1,
            )
            other = list(other.columns)
        by_part = other
    else:
        by_part = by

    result = apply_func(
        df.groupby(by=by_part, axis=axis, **groupby_kwargs), *agg_args, **agg_kwargs
    )
    # Result could not always be a frame, so wrapping it into DataFrame
    return pandas.DataFrame(result)
""" # Wrapping names into an Index should be unnecessary, however # there is a bug in pandas with intersection that forces us to do so: # https://github.com/pandas-dev/pandas/issues/39699 by_part = pandas.Index(df.index.names) groupby_kwargs = groupby_kwargs.copy() as_index = groupby_kwargs.get("as_index", True) # Set `as_index` to True to track the metadata of the grouping object groupby_kwargs["as_index"] = True # since now index levels contain out 'by', in the reduce phace # we want to group on these levels groupby_kwargs["level"] = list(range(len(df.index.names))) apply_func = cls.get_callable(reduce_func, df) result = apply_func( df.groupby(axis=axis, **groupby_kwargs), *agg_args, **agg_kwargs ) if not as_index: idx = df.index GroupBy.handle_as_index_for_dataframe( result, by_part, by_cols_dtypes=( idx.dtypes.values if isinstance(idx, pandas.MultiIndex) and hasattr(idx, "dtypes") else (idx.dtype,) ), by_length=len(by_part), selection=reduce_func.keys() if isinstance(reduce_func, dict) else None, partition_idx=partition_idx, drop=drop, method=method, inplace=True, ) # Result could not always be a frame, so wrapping it into DataFrame result = pandas.DataFrame(result) if result.index.name == MODIN_UNNAMED_SERIES_LABEL: result.index.name = None return result if finalizer_fn is None else finalizer_fn(result) @classmethod def caller( cls, query_compiler: PandasQueryCompiler, by, map_func: Union[dict, Callable[..., pandas.DataFrame]], reduce_func: Union[dict, Callable[..., pandas.DataFrame]], axis: int, groupby_kwargs: dict, agg_args: list, agg_kwargs: dict, drop: bool = False, method: Optional[str] = None, default_to_pandas_func: Optional[Callable[..., pandas.DataFrame]] = None, finalizer_fn: Optional[Callable[[pandas.DataFrame], pandas.DataFrame]] = None, ) -> PandasQueryCompiler: """ Execute GroupBy aggregation with TreeReduce approach. Parameters ---------- query_compiler : PandasQueryCompiler Frame to group. 
by : PandasQueryCompiler, column or index label, Grouper or list of such Object that determine groups. map_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame Function to apply to the `GroupByObject` at the Map phase. reduce_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame Function to apply to the `GroupByObject` at the Reduce phase. axis : {0, 1} Axis to group and apply aggregation function along. 0 means index axis when 1 means column axis. groupby_kwargs : dict Dictionary which carries arguments for pandas.DataFrame.groupby. agg_args : list-like Positional arguments to pass to the aggregation functions. agg_kwargs : dict Keyword arguments to pass to the aggregation functions. drop : bool, default: False Indicates whether or not by-data came from the `self` frame. method : str, optional Name of the GroupBy aggregation function. This is a hint to be able to do special casing. default_to_pandas_func : callable(pandas.DataFrameGroupBy) -> pandas.DataFrame, optional The pandas aggregation function equivalent to the `map_func + reduce_func`. Used in case of defaulting to pandas. If not specified `map_func` is used. finalizer_fn : callable(pandas.DataFrame) -> pandas.DataFrame, optional A callable to execute at the end a groupby kernel against groupby result. Returns ------- PandasQueryCompiler QueryCompiler which carries the result of GroupBy aggregation. """ is_unsupported_axis = axis != 0 # Defaulting to pandas in case of an empty frame as we can't process it properly. # Higher API level won't pass empty data here unless the frame has delayed # computations. So we apparently lose some laziness here (due to index access) # because of the inability to process empty groupby natively. 
is_empty_data = ( len(query_compiler.columns) == 0 or len(query_compiler.index) == 0 ) is_grouping_using_by_arg = ( groupby_kwargs.get("level", None) is None and by is not None ) is_unsupported_by_arg = isinstance(by, pandas.Grouper) or ( not hashable(by) and not isinstance(by, type(query_compiler)) ) if ( is_unsupported_axis or is_empty_data or (is_grouping_using_by_arg and is_unsupported_by_arg) ): if default_to_pandas_func is None: default_to_pandas_func = ( (lambda grp: grp.agg(map_func)) if isinstance(map_func, dict) else map_func ) default_to_pandas_func = GroupByDefault.register(default_to_pandas_func) return default_to_pandas_func( query_compiler, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) # The bug only occurs in the case of Categorical 'by', so we might want to check whether any of # the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes' # computation if they're not computed may take time, so we don't do it if not groupby_kwargs.get("sort", True) and isinstance( by, type(query_compiler) ): ErrorMessage.mismatch_with_pandas( operation="df.groupby(categorical_by, sort=False)", message=( "the groupby keys will be sorted anyway, although the 'sort=False' was passed. " + "See the following issue for more details: " + "https://github.com/modin-project/modin/issues/3571" ), ) groupby_kwargs = groupby_kwargs.copy() groupby_kwargs["sort"] = True map_fn, reduce_fn = cls.build_map_reduce_functions( by=by, axis=axis, groupby_kwargs=groupby_kwargs, map_func=map_func, reduce_func=reduce_func, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, method=method, finalizer_fn=finalizer_fn, ) # If `by` is a ModinFrame, then its partitions will be broadcasted to every # `self` partition in a way determined by engine (modin_frame.groupby_reduce) # Otherwise `by` was already bound to the Map function in `build_map_reduce_functions`. 
@classmethod
def get_callable(
    cls,
    agg_func: Union[dict, Callable[..., pandas.DataFrame]],
    df: pandas.DataFrame,
    preserve_aggregation_order: bool = True,
) -> Callable[..., pandas.DataFrame]:
    """
    Build aggregation function to apply to each group at this particular partition.

    A non-dictionary `agg_func` is returned as is. A dictionary is first
    narrowed down to the keys that are present among the columns of `df`,
    and then turned into a callable.

    Parameters
    ----------
    agg_func : dict or callable(pandas.DataFrameGroupBy) -> pandas.DataFrame
        Aggregation function.
    df : pandas.DataFrame
        Serialized partition which contains available columns.
    preserve_aggregation_order : bool, default: True
        Whether to manually restore the order of columns for the result specified
        by the `agg_func` keys (only makes sense when `agg_func` is a dictionary).

    Returns
    -------
    callable(pandas.core.groupby.DataFrameGroupBy) -> pandas.DataFrame
        Aggregation function that can be safely applied to this particular partition.
    """
    if not isinstance(agg_func, dict):
        return agg_func

    has_id_level = df.columns.names[0] == cls.ID_LEVEL_NAME
    # The 'id' level prevents us from a lookup for the original
    # partition's columns. So dropping the level.
    if has_id_level:
        lookup_columns = df.columns.droplevel(0)
    else:
        lookup_columns = df.columns
    available = frozenset(lookup_columns)

    # Keep only the aggregations whose target column lives in this partition
    partition_dict = {}
    for key, value in agg_func.items():
        if key in available:
            partition_dict[key] = value

    return cls._build_callable_for_dict(
        partition_dict, preserve_aggregation_order, has_id_level
    )
Got: {old_value} {type(old_value)}", ) old_value.append(new_value) else: # Pandas knows that it has to modify the resulting columns if it meets # a function wrapped into a list. Renaming is required if either a new # column name was explicitly specified, or multiple functions were # specified per one column, or if any other column in the aggregation # is going to be renamed. dict_to_add[col] = [new_value] if col_renaming_required else new_value # Construct resulting columns if col_renaming_required: func_name = str(func) if func_name is None else func_name result_columns.append( (*(col if isinstance(col, tuple) else (col,)), func_name) ) else: result_columns.append(col) result_columns = pandas.Index(result_columns) def aggregate_on_dict(grp_obj, *args, **kwargs): """Aggregate the passed groupby object.""" if len(native_aggs) == 0: native_agg_res = None elif grp_has_id_level: # Adding the 'id' level to the aggregation keys so they match `grp_obj` columns native_aggs_modified = { ( cls.ID_LEVEL_NAME, *(key if isinstance(key, tuple) else (key,)), ): value for key, value in native_aggs.items() } native_agg_res = grp_obj.agg(native_aggs_modified) # Dropping the 'id' level from the resulted frame native_agg_res.columns = native_agg_res.columns.droplevel(0) else: native_agg_res = grp_obj.agg(native_aggs) custom_results = [] insert_id_levels = False for col, func, func_name, col_renaming_required in walk_aggregation_dict( custom_aggs ): if grp_has_id_level: cols_without_ids = grp_obj.obj.columns.droplevel(0) if isinstance(cols_without_ids, pandas.MultiIndex): # We may have multiple columns matching the `col` in # a MultiIndex case, that's why use `.get_locs` here col_pos = cols_without_ids.get_locs(col) else: # `pandas.Index` doesn't have `.get_locs` method col_pos = cols_without_ids.get_loc(col) agg_key = grp_obj.obj.columns[col_pos] else: agg_key = [col] result = func(grp_obj[agg_key]) # The `func` may have discarded an ID-level if there were any. 
@classmethod
def is_registered_implementation(cls, func: Callable) -> bool:
    """
    Check whether the passed `func` was registered as a TreeReduce implementation.

    Parameters
    ----------
    func : callable
        Object to check.

    Returns
    -------
    bool
        True if `func` is callable and carries the attribute named by
        ``cls._GROUPBY_REDUCE_IMPL_FLAG``, False otherwise.
    """
    # Non-callables (e.g. string aggregation names) can never be registered
    if not callable(func):
        return False
    return hasattr(func, cls._GROUPBY_REDUCE_IMPL_FLAG)
class Map(Operator):
    """Builder class for Map operator."""

    @classmethod
    def register(
        cls,
        function: Callable[..., pandas.DataFrame],
        *call_args: tuple,
        **call_kwds: dict,
    ) -> Callable[..., PandasQueryCompiler]:
        """
        Build Map operator that will be performed across each partition.

        Parameters
        ----------
        function : callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
            Function that will be applied to the each partition.
            Function takes `pandas.DataFrame` and returns `pandas.DataFrame`
            of the same shape.
        *call_args : tuple
            Positional arguments forwarded to the ``map`` method of the
            Modin frame on every call of the returned function.
        **call_kwds : dict
            Keyword arguments forwarded to the ``map`` method of the
            Modin frame on every call of the returned function.
            A ``shape_hint`` entry, if present, is not forwarded: it is used
            as the shape hint of the resulting query compiler instead.

        Returns
        -------
        callable
            Function that takes query compiler and executes map function.
        """
        # Extract `shape_hint` once, at registration time. Popping it inside
        # `caller` would mutate the kwargs shared by all calls of the returned
        # function, so only the very first call would see the hint and every
        # subsequent call would silently lose it.
        registered_shape_hint = call_kwds.pop("shape_hint", None)

        def caller(
            query_compiler: PandasQueryCompiler, *args: tuple, **kwargs: dict
        ) -> PandasQueryCompiler:
            """Execute Map function against passed query compiler."""
            # Fall back to the hint of the source query compiler
            # if no hint was bound at registration
            shape_hint = registered_shape_hint or query_compiler._shape_hint
            return query_compiler.__constructor__(
                query_compiler._modin_frame.map(
                    lambda x: function(x, *args, **kwargs), *call_args, **call_kwds
                ),
                shape_hint=shape_hint,
            )

        return caller
class Reduce(Operator):
    """Builder class for Reduce operator."""

    @classmethod
    def register(
        cls,
        reduce_function: Callable[..., pandas.Series],
        axis: Optional[int] = None,
        shape_hint: Optional[str] = None,
    ) -> Callable[..., PandasQueryCompiler]:
        """
        Build Reduce operator that will be performed across rows/columns.

        It's used if `func` reduces the dimension of partitions in contrast to `Fold`.

        Parameters
        ----------
        reduce_function : callable(pandas.DataFrame, *args, **kwargs) -> pandas.Series
            Source function.
        axis : int, optional
            Axis to apply function along. If not specified, the ``axis``
            keyword of each call is used instead.
        shape_hint : {"row", "column", None}, default: None
            Shape hint for the results known to be a column or a row, otherwise None.

        Returns
        -------
        callable
            Function that takes query compiler and executes Reduce function.
        """

        def caller(
            query_compiler: PandasQueryCompiler, *args: tuple, **kwargs: dict
        ) -> PandasQueryCompiler:
            """Execute Reduce function against passed query compiler."""
            # An axis bound at registration wins over the per-call `axis` keyword
            effective_axis = axis if axis is not None else kwargs.get("axis")

            def apply_reduce(partition):
                # Call-time arguments are forwarded to the source function as is
                return reduce_function(partition, *args, **kwargs)

            build_compiler = query_compiler.__constructor__
            reduced_frame = query_compiler._modin_frame.reduce(
                cls.validate_axis(effective_axis),
                apply_reduce,
            )
            return build_compiler(reduced_frame, shape_hint=shape_hint)

        return caller
"""Module houses builder class for TreeReduce operator.""" from __future__ import annotations from typing import TYPE_CHECKING, Callable, Optional from .operator import Operator if TYPE_CHECKING: import pandas from pandas._typing import DtypeObj from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler class TreeReduce(Operator): """Builder class for TreeReduce operator.""" @classmethod def register( cls, map_function: Optional[Callable[..., pandas.DataFrame]], reduce_function: Optional[Callable[..., pandas.Series]] = None, axis: Optional[int] = None, compute_dtypes: Optional[Callable[..., DtypeObj]] = None, ) -> Callable[..., PandasQueryCompiler]: """ Build TreeReduce operator. Parameters ---------- map_function : callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame Source map function. reduce_function : callable(pandas.DataFrame, *args, **kwargs) -> pandas.Series, optional Source reduce function. axis : int, optional Specifies axis to apply function along. compute_dtypes : callable(pandas.Series, *func_args, **func_kwargs) -> DtypeObj, optional Callable for computing dtypes. Returns ------- callable Function that takes query compiler and executes passed functions with TreeReduce algorithm. 
""" if reduce_function is None: reduce_function = map_function def caller( query_compiler: PandasQueryCompiler, *args: tuple, **kwargs: dict ) -> PandasQueryCompiler: """Execute TreeReduce function against passed query compiler.""" _axis = kwargs.get("axis") if axis is None else axis new_dtypes = None if compute_dtypes and query_compiler.frame_has_materialized_dtypes: new_dtypes = str(compute_dtypes(query_compiler.dtypes, *args, **kwargs)) return query_compiler.__constructor__( query_compiler._modin_frame.tree_reduce( cls.validate_axis(_axis), lambda x: map_function(x, *args, **kwargs), lambda y: reduce_function(y, *args, **kwargs), dtypes=new_dtypes, ) ) return caller ================================================ FILE: modin/core/dataframe/base/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe classes.""" ================================================ FILE: modin/core/dataframe/base/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe class and Axis and JoinType Enums.""" ================================================ FILE: modin/core/dataframe/base/dataframe/dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains class ModinDataframe. ModinDataframe is a parent abstract class for any dataframe class. """ from abc import ABC, abstractmethod from typing import Callable, Dict, Hashable, List, Optional, Union from modin.core.dataframe.base.dataframe.utils import Axis, JoinType class ModinDataframe(ABC): """ An abstract class that represents the Parent class for any Dataframe class. 
This class is intended to specify the behaviors that a Dataframe must implement. For more details about how these methods were chosen, please refer to this (https://people.eecs.berkeley.edu/~totemtang/paper/Modin.pdf) paper, which specifies a Dataframe algebra that this class exposes. """ @abstractmethod def take_2d_labels_or_positional( self, row_labels: Optional[List[Hashable]] = None, row_positions: Optional[List[int]] = None, col_labels: Optional[List[Hashable]] = None, col_positions: Optional[List[int]] = None, ) -> "ModinDataframe": """ Mask rows and columns in the dataframe. Allow users to perform selection and projection on the row and column labels (named notation), in addition to the row and column number (positional notation). Parameters ---------- row_labels : list of hashable, optional The row labels to extract. row_positions : list of int, optional The row positions to extract. col_labels : list of hashable, optional The column labels to extract. col_positions : list of int, optional The column positions to extract. Returns ------- ModinDataframe A new ModinDataframe from the mask provided. Notes ----- If both `row_labels` and `row_positions` are provided, a ValueError is raised. The same rule applies for `col_labels` and `col_positions`. """ pass @abstractmethod def filter_by_types(self, types: List[Hashable]) -> "ModinDataframe": """ Allow the user to specify a type or set of types by which to filter the columns. Parameters ---------- types : list of hashables The types to filter columns by. Returns ------- ModinDataframe A new ModinDataframe with only the columns whose dtypes appear in `types`. """ pass @abstractmethod def map( self, function: Callable, axis: Optional[Union[int, Axis]] = None, dtypes: Optional[str] = None, new_columns: Optional[List[Hashable]] = None, ) -> "ModinDataframe": """ Apply a user-defined function row-wise if `axis`=0, column-wise if `axis`=1, and cell-wise if `axis` is None. 
Parameters ---------- function : callable(row|col|cell) -> row|col|cell The function to map across the dataframe. axis : int or modin.core.dataframe.base.utils.Axis, optional The axis to map over. dtypes : str, optional The data types for the result. This is an optimization because there are functions that always result in a particular data type, and this allows us to avoid (re)computing it. new_columns : List[Hashable], optional New column labels of the result, its length has to be identical to the older columns. If not specified, old column labels are preserved. Returns ------- ModinDataframe A new ModinDataframe with the map applied. Notes ----- This does not change the shape of the dataframe. """ pass @abstractmethod def filter(self, axis: Union[int, Axis], condition: Callable) -> "ModinDataframe": """ Filter data based on the function provided along the specified axis. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to filter over. condition : callable(row|col) -> bool The function to use for the filter. This function should filter the data itself. It accepts either a row or column (depending on the axis argument) and returns True to keep the row/col, and False to drop it. Returns ------- ModinDataframe A new ModinDataframe filtered by content according to the filter provided by condition. """ pass @abstractmethod def explode( self, axis: Union[int, Axis], function: Callable, result_schema: Optional[Dict[Hashable, type]] = None, ) -> "ModinDataframe": """ Explode data based on the function provided along the specified axis. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to expand over. function : callable The function to use to expand the data. This function should accept one row/column, and return multiple. result_schema : dictionary, optional Mapping from column labels to data types that represents the types of the output dataframe. 
@abstractmethod
def window(
    self,
    axis: Union[int, Axis],
    reduce_fn: Callable,
    window_size: int,
    result_schema: Optional[Dict[Hashable, type]] = None,
) -> "ModinDataframe":
    """
    Apply a sliding window operator that acts as a GROUPBY on each window, reducing each window to a single row (column).

    Parameters
    ----------
    axis : int or modin.core.dataframe.base.utils.Axis
        The axis to slide over.
    reduce_fn : callable(rowgroup|colgroup) -> row|col
        The reduce function to apply over the data.
    window_size : int
        The number of rows/columns to pass to the function
        (the size of the sliding window).
    result_schema : dictionary, optional
        Mapping from column labels to data types that represents the types of the output dataframe.

    Returns
    -------
    ModinDataframe
        A new ModinDataframe with the reduce function applied over windows of the specified axis.

    Notes
    -----
    The user-defined reduce function must reduce each window's column
    (row if axis=1) down to a single value.
    This is an abstract method; concrete dataframes must override it.
    """
    pass
@abstractmethod
def reduce(
    self,
    axis: Union[int, Axis],
    function: Callable,
    dtypes: Optional[str] = None,
) -> "ModinDataframe":
    """
    Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton.

    Parameters
    ----------
    axis : int or modin.core.dataframe.base.utils.Axis
        The axis to perform the reduce over.
    function : callable(row|col) -> single value
        The reduce function to apply to each column.
    dtypes : str, optional
        The data types for the result. This is an optimization
        because there are functions that always result in a particular data
        type, and this allows us to avoid (re)computing it.

    Returns
    -------
    ModinDataframe
        A new ModinDataframe with the same columns as the previous, with only a single row.

    Notes
    -----
    The user-defined function must reduce to a single value.
    This is an abstract method; concrete dataframes must override it.
    """
    pass
@abstractmethod
def infer_types(self, columns_list: List[str]) -> "ModinDataframe":
    """
    Determine the compatible type shared by all values in the specified columns, and coerce them to that type.

    Parameters
    ----------
    columns_list : list of strings
        List of column labels to infer and induce types over.

    Returns
    -------
    ModinDataframe
        A new ModinDataframe with the inferred schema.

    Notes
    -----
    This is an abstract method; concrete dataframes must override it.
    """
    pass
@abstractmethod
def concat(
    self,
    axis: Union[int, Axis],
    others: Union["ModinDataframe", List["ModinDataframe"]],
) -> "ModinDataframe":
    """
    Append rows/columns along the specified axis from multiple dataframes.

    Parameters
    ----------
    axis : int or modin.core.dataframe.base.utils.Axis
        The axis on which to perform the concatenation.
    others : ModinDataframe or list of ModinDataframes
        The other ModinDataframe(s) to concatenate.

    Returns
    -------
    ModinDataframe
        A new ModinDataframe that is the result of concatenating the dataframes over the specified axis.

    Notes
    -----
    The concat operator incurs fixed overheads, and so this algebra places no
    limit to the number of dataframes that may be concatenated in this way.
    This is an abstract method; concrete dataframes must override it.
    """
    pass
@abstractmethod
def from_labels(self) -> "ModinDataframe":
    """
    Move the row labels into the data at position 0, and set the row labels to the positional notation.

    Returns
    -------
    ModinDataframe
        A new ModinDataframe with the row labels moved into the data.

    Notes
    -----
    In the case that the dataframe has hierarchical labels, all label "levels" are inserted into the dataframe
    in the order they occur in the labels, with the outermost being in position 0.
    This is an abstract method; concrete dataframes must override it.
    """
    pass
ascending : boolean, default: True Whether to sort in ascending or descending order. Returns ------- ModinDataframe A new ModinDataframe sorted into lexicographical order by the specified column(s). """ pass ================================================ FILE: modin/core/dataframe/base/dataframe/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains useful enums for Modin. Axis is an enum that represents the `axis` argument for dataframe operations. JoinType is an enum that represents the `join_type` or `how` argument for the join algebra operator. """ from enum import Enum from typing import Dict, List, Sequence, Tuple, cast import pandas from pandas._typing import IndexLabel from pandas.api.types import is_scalar from pandas.core.dtypes.common import is_integer_dtype class Axis(Enum): # noqa: PR01 """ An enum that represents the `axis` argument provided to the algebra operators. The enum has 3 values - ROW_WISE to represent the row axis, COL_WISE to represent the column axis, and CELL_WISE to represent no axis. 
def join_columns(
    left: pandas.Index,
    right: pandas.Index,
    left_on: IndexLabel,
    right_on: IndexLabel,
    suffixes: Tuple[str, str],
) -> Tuple[pandas.Index, Dict[IndexLabel, IndexLabel], Dict[IndexLabel, IndexLabel]]:
    """
    Work out the column labels produced by merging two frames.

    Parameters
    ----------
    left : pandas.Index
        Columns of the left frame to join.
    right : pandas.Index
        Columns of the right frame to join.
    left_on : list-like or scalar
        Column names on which the frames are joined in the left DataFrame.
    right_on : list-like or scalar
        Column names on which the frames are joined in the right DataFrame.
    suffixes : tuple[str, str]
        A 2-length sequence containing suffixes to append to the intersected columns.

    Returns
    -------
    pandas.Index, dict[IndexLabel -> IndexLabel], dict[IndexLabel -> IndexLabel]
        Columns of the resulting frame, followed by the old-to-new column name
        mappings for `left` and for `right` respectively.

    Raises
    ------
    NotImplementedError
        Raised when one of the keys to join is an index level. The pandas behaviour
        is complicated in this case, so it is not supported for now.
    """
    # `cast` only informs `mypy` that both key containers are `Sequence[IndexLabel]` from here on
    left_keys = cast(
        Sequence[IndexLabel], left_on if not is_scalar(left_on) else [left_on]
    )
    right_keys = cast(
        Sequence[IndexLabel], right_on if not is_scalar(right_on) else [right_on]
    )

    # simple case: merging on a single, identically named key that may live in an index
    if len(left_keys) == 1 and len(right_keys) == 1 and left_keys[0] == right_keys[0]:
        absent_from_left = left_keys[0] not in left
        absent_from_right = right_keys[0] not in right
        if absent_from_left and absent_from_right:
            # the key stays in the index on both sides, so there is nothing
            # to join on among the columns
            left_keys = []
            right_keys = []
        # otherwise the index name is prepended to the columns of the side lacking it;
        # on python 3.9 with pandas-stubs 2.2 `insert` is reported as an untyped call,
        # which newer versions no longer complain about
        elif absent_from_left:
            left = left.insert(loc=0, item=left_keys[0])  # type: ignore[no-untyped-call, unused-ignore]
        elif absent_from_right:
            right = right.insert(loc=0, item=right_keys[0])  # type: ignore[no-untyped-call, unused-ignore]

    every_key_is_a_column = all(key in left for key in left_keys) and all(
        key in right for key in right_keys
    )
    if not every_key_is_a_column:
        raise NotImplementedError(
            "Cases, where one of the keys to join is an index level, are not yet supported."
        )

    # labels present on both sides (other than the opposite side's join keys) need a suffix
    left_labels = set(left)
    right_labels = set(right)
    clashing = (left_labels & (right_labels - set(right_keys))) | (
        right_labels & (left_labels - set(left_keys))
    )

    def _suffixed(label: IndexLabel, suffix: str) -> IndexLabel:
        if label not in clashing:
            return label
        if isinstance(label, tuple):
            # for multi-level labels only the outermost level receives the suffix
            return (f"{label[0]}{suffix}", *label[1:])
        return f"{label}{suffix}"

    new_left: List = [_suffixed(label, suffixes[0]) for label in left]
    left_renamer: Dict[IndexLabel, IndexLabel] = dict(zip(left, new_left))

    # A key shared by both frames was already taken from 'left',
    # so it must not be taken a second time from 'right'.
    kept_right: List = [
        label
        for label in right
        if label not in left_keys or label not in right_keys
    ]
    new_right: List = [_suffixed(label, suffixes[1]) for label in kept_right]
    right_renamer: Dict[IndexLabel, IndexLabel] = dict(zip(kept_right, new_right))

    return pandas.Index(new_left + new_right), left_renamer, right_renamer
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe functionality related to data exchange protocols.""" ================================================ FILE: modin/core/dataframe/base/interchange/dataframe_protocol/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Base Modin Dataframe functionality related to the dataframe exchange protocol. See more in https://data-apis.org/dataframe-protocol/latest/index.html. 
class ColumnBuffers(TypedDict):  # noqa: GL08
    """Dictionary of the buffers underlying a column, as returned by ``ProtocolColumn.get_buffers``."""

    # Two-element tuple: the buffer holding the column data,
    # followed by the dtype associated with that data buffer.
    data: Tuple["ProtocolBuffer", Any]

    # Two-element tuple: the buffer holding mask values that indicate missing data,
    # followed by the dtype associated with that mask buffer.
    # None if the null representation is not a bit or byte mask.
    validity: Optional[Tuple["ProtocolBuffer", Any]]

    # Two-element tuple: the buffer holding the offset values for
    # variable-size binary data (e.g., variable-length strings),
    # followed by the dtype associated with that offsets buffer.
    # None if the data buffer does not have an associated offsets buffer.
    offsets: Optional[Tuple["ProtocolBuffer", Any]]
class ProtocolColumn(ABC):
    """
    A column object, with only the methods and properties required by the interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length strings).

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.
         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.
         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant for example for lazy evaluation
         of data which doesn't fit in memory, while multiple buffers per column
         could also come from doing a selection operation on a single
         contiguous buffer.
         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an issue
         in pandas if one column is backed by a single NumPy array, but in
         Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Notes
    -----
    This ProtocolColumn object can only be produced by ``__dataframe__``,
    so doesn't need its own version or ``__column__`` protocol.
    """

    @abstractmethod
    def size(self) -> int:
        """
        Size of the column, in elements.

        Corresponds to `DataFrame.num_rows()` if column is a single chunk;
        equal to size of this current chunk otherwise.

        Is a method rather than a property because it may cause a (potentially
        expensive) computation for some dataframe implementations.

        Returns
        -------
        int
            Size of the column, in elements.
        """
        pass

    @property
    @abstractmethod
    def offset(self) -> int:
        """
        Get the offset of first element.

        May be > 0 if using chunks; for example for a column
        with N chunks of equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.

        Returns
        -------
        int
            The offset of first element.
        """
        pass

    @property
    @abstractmethod
    def dtype(self) -> Tuple[DTypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        * Kind : DTypeKind
        * Bit-width : the number of bits as an integer
        * Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        * Endianness : currently only native endianness (``=``) is supported

        Returns
        -------
        tuple
            ``(kind, bit-width, format string, endianness)``.

        Notes
        -----
        - Kind specifiers are aligned with DLPack where possible
          (hence the jump to 20, leave enough room for future extension).
        - Masks must be specified as boolean with either bit width 1 (for bit masks)
          or 8 (for byte masks).
        - Dtype width in bits was preferred over bytes
        - Endianness isn't too useful, but included now in case in the future
          we need to support non-native endianness
        - Went with Apache Arrow format strings over NumPy format strings
          because they're more complete from a dataframe perspective
        - Format strings are mostly useful for datetime specification, and for categoricals.
        - For categoricals, the format string describes the type of the categorical
          in the data buffer. In case of a separate encoding of the categorical
          (e.g. an integer to string mapping), this can be derived from ``self.describe_categorical``.
        - Data types not included: complex, Arrow-style null, binary, decimal,
          and nested (list, struct, map, union) dtypes.
        """
        pass

    @property
    @abstractmethod
    def describe_categorical(self) -> CategoricalDescription:
        """
        If the dtype is categorical, there are two options.

        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding categorical values.

        TBD: are there any other in-memory representations that are needed?

        Returns
        -------
        dict
            Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.

        Raises
        ------
        ``TypeError`` if the dtype is not categorical.
        """
        pass

    @property
    @abstractmethod
    def describe_null(self) -> Tuple[ColumnNullType, Any]:
        """
        Return the missing value (or "null") representation the column dtype uses.

        Return as a tuple ``(kind, value)``.

        * Kind: ColumnNullType
        * Value : if kind is "sentinel value", the actual value. If kind is a bit
          mask or a byte mask, the value (0 or 1) indicating a missing value. None
          otherwise.

        Returns
        -------
        tuple
            ``(kind, value)``.
        """
        pass

    @property
    @abstractmethod
    def null_count(self) -> Optional[int]:
        """
        Get number of null elements, if known.

        Returns
        -------
        int or None
            Number of null elements, or ``None`` if it is not known.

        Notes
        -----
        Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        pass

    @property
    @abstractmethod
    def metadata(self) -> Dict[str, Any]:
        """
        Get the metadata for the column.

        See `DataFrame.metadata` for more details.

        Returns
        -------
        dict
        """
        pass

    @abstractmethod
    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.

        Returns
        -------
        int
           The number of chunks the column consists of.
        """
        pass

    @abstractmethod
    def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["ProtocolColumn"]:
        """
        Return an iterator yielding the chunks.

        By default ``n_chunks=None``, yields the chunks that the data is stored as by the producer.
        If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``,
        meaning the producer must subdivide each chunk before yielding it.

        Parameters
        ----------
        n_chunks : int, optional
            Number of chunks to yield.

        Yields
        ------
        ProtocolColumn
            A ``ProtocolColumn`` object for each chunk.

        Raises
        ------
        ``RuntimeError`` if ``n_chunks`` is not a multiple of ``self.num_chunks()``.
        """
        pass

    @abstractmethod
    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        Returns
        -------
        dict
            - "data": a two-element tuple whose first element is a buffer
              containing the data and whose second element is the data buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
              containing mask values indicating missing data and
              whose second element is the mask value buffer's
              associated dtype. None if the null representation is not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
              containing the offset values for variable-size binary data
              (e.g., variable-length strings) and whose second element is the offsets
              buffer's associated dtype. None if the data buffer does not have
              an associated offsets buffer.
        """
        pass
@abstractmethod
def num_rows(self) -> Optional[int]:
    """
    Return the number of rows in the ProtocolDataframe, if available.

    Returns
    -------
    int or None
        The number of rows in the ProtocolDataframe, or ``None`` if it is not available.
    """
    pass
""" pass @abstractmethod def select_columns(self, indices: Sequence[int]) -> "ProtocolDataframe": """ Create a new ProtocolDataframe by selecting a subset of columns by index. Parameters ---------- indices : Sequence[int] Column indices to be selected out of the ProtocolDataframe. Returns ------- ProtocolDataframe A new ProtocolDataframe with selected a subset of columns by index. """ pass @abstractmethod def select_columns_by_name(self, names: Sequence[str]) -> "ProtocolDataframe": """ Create a new ProtocolDataframe by selecting a subset of columns by name. Parameters ---------- names : Sequence[str] Column names to be selected out of the ProtocolDataframe. Returns ------- ProtocolDataframe A new ProtocolDataframe with selected a subset of columns by name. """ pass @abstractmethod def get_chunks( self, n_chunks: Optional[int] = None ) -> Iterable["ProtocolDataframe"]: """ Return an iterator yielding the chunks. By default `n_chunks=None`, yields the chunks that the data is stored as by the producer. If given, `n_chunks` must be a multiple of `self.num_chunks()`, meaning the producer must subdivide each chunk before yielding it. Parameters ---------- n_chunks : int, optional Number of chunks to yield. Yields ------ ProtocolDataframe A ``ProtocolDataframe`` object(s). Raises ------ ``RuntimeError`` if ``n_chunks`` is not a multiple of ``self.num_chunks()``. """ pass ================================================ FILE: modin/core/dataframe/base/interchange/dataframe_protocol/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
class DTypeKind(enum.IntEnum):  # noqa PR01
    """
    Integer codes identifying the kind of a column's data type.

    Attributes
    ----------
    INT : int
        Signed integer data type.
    UINT : int
        Unsigned integer data type.
    FLOAT : int
        Floating point data type.
    BOOL : int
        Boolean data type.
    STRING : int
        String data type (UTF-8 encoded).
    DATETIME : int
        Datetime data type.
    CATEGORICAL : int
        Categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23


class ColumnNullType(enum.IntEnum):  # noqa PR01
    """
    Integer codes describing how a column represents null values.

    Attributes
    ----------
    NON_NULLABLE : int
        The column is non-nullable.
    USE_NAN : int
        Nulls are an explicit float NaN value.
    USE_SENTINEL : int
        Nulls are a sentinel value other than NaN.
    USE_BITMASK : int
        A bit is set/unset to mark a null at a certain position.
    USE_BYTEMASK : int
        A byte is set/unset to mark a null at a certain position.
    """

    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4


class DlpackDeviceType(enum.IntEnum):  # noqa PR01
    """Integer codes of device types, matching the codes used by DLPack."""

    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10


class ArrowCTypes:
    """
    Format strings of Apache Arrow C types.

    They follow the Arrow C data interface:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL = "n"
    BOOL = "b"
    INT8 = "c"
    UINT8 = "C"
    INT16 = "s"
    UINT16 = "S"
    INT32 = "i"
    UINT32 = "I"
    INT64 = "l"
    UINT64 = "L"
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"
    STRING = "u"  # utf-8
    DATE32 = "tdD"
    DATE64 = "tdm"
    # Templates to be filled in with ``str.format``. Resolution codes:
    #   - seconds -> 's'
    #   - milliseconds -> 'm'
    #   - microseconds -> 'u'
    #   - nanoseconds -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"


class Endianness:
    """Characters denoting the byte order of a data type."""

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe classes related to its partitioning.""" ================================================ FILE: modin/core/dataframe/base/partitioning/axis_partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class BaseDataframeAxisPartition(
    ABC, ClassLogger, modin_layer="VIRTUAL-PARTITION", log_level=LogLevel.DEBUG
):  # pragma: no cover
    """
    Abstract parent of every axis partition class.

    The class exists to simplify the way operations are performed on
    partitions along a full axis.

    Attributes
    ----------
    _PARTITIONS_METADATA_LEN : int
        The number of metadata values that the object of `partition_type` consumes.
    """

    @property
    @abstractmethod
    def list_of_blocks(self) -> list:
        """Get the list of physical partition objects that compose this partition."""

    def apply(
        self,
        func: Callable,
        *args: Iterable,
        num_splits: Optional[int] = None,
        other_axis_partition: Optional["BaseDataframeAxisPartition"] = None,
        maintain_partitioning: bool = True,
        lengths: Optional[Iterable] = None,
        manual_partition: bool = False,
        **kwargs: dict,
    ) -> Any:
        """
        Apply a function to this axis partition along full axis.

        Parameters
        ----------
        func : callable
            The function to apply. It will be preprocessed according to the
            corresponding `BaseDataframePartition` objects.
        *args : iterable
            Positional arguments to pass to `func`.
        num_splits : int, default: None
            The number of times to split the result object.
        other_axis_partition : BaseDataframeAxisPartition, default: None
            Another `BaseDataframeAxisPartition` object to be applied to func.
            Used for operations between two data sets.
        maintain_partitioning : bool, default: True
            Whether to keep the partitioning in the same orientation as it
            was previously. This matters because the operation may touch a
            single axis partition only, in which case the partitioning has to
            be returned to its previous orientation (the lengths will remain
            the same). Ignored between two axis partitions.
        lengths : iterable, default: None
            The list of lengths to shuffle the partition into.
        manual_partition : bool, default: False
            If True, partition the result with `lengths`.
        **kwargs : dict
            Additional keywords arguments to be passed in `func`.

        Returns
        -------
        list
            A list of `BaseDataframePartition` objects.

        Notes
        -----
        The procedures that invoke this method assume full axis knowledge.
        Implement this method accordingly.

        You must return a list of `BaseDataframePartition` objects from this method.
        """

    # Child classes must have these in order to correctly subclass.
    partition_type: Type
    _PARTITIONS_METADATA_LEN = 0

    def _wrap_partitions(
        self, partitions: list, extract_metadata: Optional[bool] = None
    ) -> list:
        """
        Wrap remote partition objects with `BaseDataframePartition` class.

        Parameters
        ----------
        partitions : list
            List of remote partition objects to be wrapped with `BaseDataframePartition` class.
        extract_metadata : bool, optional
            Whether `partitions` also carries the partitions' metadata. When
            ``None``, the answer is derived from `_PARTITIONS_METADATA_LEN`.

        Returns
        -------
        list
            List of wrapped remote partition objects.
        """
        assert self.partition_type is not None

        if extract_metadata is None:
            # `_PARTITIONS_METADATA_LEN == 0` means the execution doesn't support
            # metadata, so extraction must never be attempted; otherwise assume the
            # common approach of always passing the metadata is used.
            extract_metadata = bool(self._PARTITIONS_METADATA_LEN)

        if not extract_metadata:
            return [self.partition_type(object_id) for object_id in partitions]

        # `partitions` is a flat 1D sequence of futures describing partitions
        # followed by their metadata:
        # [
        #     object_id1, metadata1_1, ..., metadata1_PARTITIONS_METADATA_LEN,
        #     object_id2, metadata2_1, ..., metadata2_PARTITIONS_METADATA_LEN,
        #     ...
        #     object_idN, metadataN_1, ..., metadataN_PARTITIONS_METADATA_LEN,
        # ]
        # `partition_type` consumes `(object_id, *metadata)`, hence the `1 +`.
        group_size = 1 + self._PARTITIONS_METADATA_LEN
        # Zipping the very same iterator with itself yields consecutive
        # groups of `group_size` items.
        shared_iter = iter(partitions)
        return [
            self.partition_type(*init_args)
            for init_args in zip(*[shared_iter] * group_size)
        ]

    def force_materialization(
        self, get_ip: bool = False
    ) -> "BaseDataframeAxisPartition":
        """
        Materialize axis partitions into a single partition.

        Parameters
        ----------
        get_ip : bool, default: False
            Whether to get node ip address to a single partition or not.

        Returns
        -------
        BaseDataframeAxisPartition
            An axis partition containing only a single materialized partition.
        """
        # The identity function with a single split collapses all blocks into one.
        single_block = self.apply(
            lambda x: x, num_splits=1, maintain_partitioning=False
        )
        return type(self)(single_block, get_ip=get_ip)  # type: ignore[call-arg]

    def unwrap(
        self, squeeze: bool = False, get_ip: bool = False
    ) -> Union[list, Tuple[list, list]]:
        """
        Unwrap partitions from this axis partition.

        Parameters
        ----------
        squeeze : bool, default: False
            Flag used to unwrap only one partition.
        get_ip : bool, default: False
            Whether to get node ip address to each partition or not.

        Returns
        -------
        list
            List of partitions from this axis partition.

        Notes
        -----
        If `get_ip=True`, a tuple of lists of Ray.ObjectRef/Dask.Future to node ip addresses and
        unwrapped partitions, respectively, is returned if Ray/Dask is used as an engine
        (i.e. [(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]).
        """
        only_one_block = squeeze and len(self.list_of_blocks) == 1
        if get_ip:
            if only_one_block:
                # TODO(https://github.com/modin-project/modin/issues/5176): Stop ignoring the list_of_ips
                # check once we know that we're not calling list_of_ips on python axis partitions
                return self.list_of_ips[0], self.list_of_blocks[0]  # type: ignore[attr-defined]
            return list(zip(self.list_of_ips, self.list_of_blocks))  # type: ignore[attr-defined]
        if only_one_block:
            return self.list_of_blocks[0]
        return self.list_of_blocks
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe class optimized for pandas storage format.""" ================================================ FILE: modin/core/dataframe/pandas/dataframe/dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains class PandasDataframe. PandasDataframe is a parent abstract class for any dataframe class for pandas storage format. 
""" from __future__ import annotations import datetime import re from abc import ABC, abstractmethod from functools import cached_property from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union import numpy as np import pandas from pandas._libs.lib import no_default from pandas.api.types import is_object_dtype from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype from pandas.core.indexes.api import Index, RangeIndex from modin.config import ( IsRayCluster, MinColumnPartitionSize, MinRowPartitionSize, NPartitions, ) from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe from modin.core.dataframe.base.dataframe.utils import Axis, JoinType, is_trivial_index from modin.core.dataframe.pandas.dataframe.utils import ( ShuffleSortFunctions, add_missing_categories_to_groupby, lazy_metadata_decorator, ) from modin.core.dataframe.pandas.metadata import ( DtypesDescriptor, LazyProxyCategoricalDtype, ModinDtypes, ModinIndex, ) from modin.core.storage_formats.pandas.parsers import ( find_common_type_cat as find_common_type, ) from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler from modin.core.storage_formats.pandas.utils import get_length_list from modin.error_message import ErrorMessage from modin.logging import ClassLogger from modin.logging.config import LogLevel from modin.pandas.indexing import is_range_like from modin.pandas.utils import ( check_both_not_none, get_pandas_backend, is_full_grab_slice, ) from modin.utils import MODIN_UNNAMED_SERIES_LABEL if TYPE_CHECKING: from pandas._typing import npt from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.core.dataframe.pandas.partitioning.partition_manager import ( PandasDataframePartitionManager, ) class PandasDataframe( ABC, ClassLogger, modin_layer="CORE-DATAFRAME", log_level=LogLevel.DEBUG ): """ An abstract class that represents the parent class for any 
@property
def storage_format(self) -> str:
    """
    Name of the storage format holding this frame's data.

    Returns
    -------
    str
        Always ``"Pandas"``.
    """
    return "Pandas"

@property
@abstractmethod
def engine(self) -> str:
    """
    Name of the engine of this frame.

    Returns
    -------
    str
        The engine.
    """

@cached_property
def __constructor__(self) -> type[PandasDataframe]:
    """
    Get the class used to create new instances of this object.

    Returns
    -------
    callable
        The concrete type of `self`.
    """
    frame_cls = type(self)
    return frame_cls
def __init__(
    self,
    partitions,
    index=None,
    columns=None,
    row_lengths=None,
    column_widths=None,
    dtypes: Optional[Union[pandas.Series, ModinDtypes, Callable]] = None,
    pandas_backend: Optional[str] = None,
):
    self._partitions = partitions
    self.set_index_cache(index)
    self.set_columns_cache(columns)
    self._row_lengths_cache = row_lengths
    self._column_widths_cache = column_widths
    self._pandas_backend = pandas_backend
    # The passed dtypes are trusted unless the backend is pyarrow and there are
    # partitions to compute them from: for pyarrow the type precomputation may be
    # incorrect, because that would require knowing the type algebra precisely.
    # Considering the number of operations and different combinations of
    # backends, such optimizations are better introduced gradually, with a
    # large number of tests.
    # With no partitions at all the computed dtype would otherwise become NaN
    # (i.e. the dtype would be lost), so the passed value is kept in that case.
    trust_passed_dtypes = pandas_backend != "pyarrow" or len(partitions) == 0
    self.set_dtypes_cache(dtypes if trust_passed_dtypes else None)

    self._validate_axes_lengths()
    self._filter_empties(compute_metadata=False)

def _validate_axes_lengths(self):
    """Validate that labels are split correctly if split is known."""
    row_lengths = self._row_lengths_cache
    if row_lengths is not None and self.has_materialized_index:
        n_row_labels = len(self.index)
        if n_row_labels > 0:
            # An empty frame can have 0 rows but a nonempty index. If the frame
            # does have rows, the number of rows must equal the size of the
            # index.
            total_rows = sum(row_lengths)
            if total_rows > 0:
                ErrorMessage.catch_bugs_and_request_email(
                    total_rows != n_row_labels,
                    f"Row lengths: {total_rows} != {n_row_labels}",
                )
            ErrorMessage.catch_bugs_and_request_email(
                any(length < 0 for length in row_lengths),
                f"Row lengths cannot be negative: {row_lengths}",
            )

    column_widths = self._column_widths_cache
    if column_widths is not None and self.has_materialized_columns:
        n_column_labels = len(self.columns)
        if n_column_labels > 0:
            # An empty frame can have 0 column but a nonempty column index. If
            # the frame does have columns, the number of columns must equal the
            # size of the columns.
            total_columns = sum(column_widths)
            if total_columns > 0:
                ErrorMessage.catch_bugs_and_request_email(
                    total_columns != n_column_labels,
                    f"Column widths: {total_columns} != {n_column_labels}",
                )
            ErrorMessage.catch_bugs_and_request_email(
                any(width < 0 for width in column_widths),
                f"Column widths cannot be negative: {column_widths}",
            )
@property
def num_parts(self) -> int:
    """
    Get the total number of partitions for this frame.

    Returns
    -------
    int
        Product of the dimensions of the partitions array.
    """
    grid_shape = self._partitions.shape
    return np.prod(grid_shape)

@property
def row_lengths(self):
    """
    Compute the row partitions lengths if they are not cached.

    Returns
    -------
    list
        A list of row partitions lengths.
    """
    cached = self._row_lengths_cache
    if cached is not None:
        return cached
    transposed = self._partitions.T
    if len(transposed) > 0:
        # The first column of partitions is enough to learn every row length.
        self._row_lengths_cache = self._get_lengths(transposed[0], Axis.ROW_WISE)
    else:
        self._row_lengths_cache = []
    return self._row_lengths_cache

@classmethod
def _get_lengths(cls, parts, axis):
    """
    Get list of dimensions for all the provided parts.

    Parameters
    ----------
    parts : list
        List of partitions.
    axis : {0, 1}
        The axis along which to get the lengths (0 - length across rows or, 1 - width across columns).

    Returns
    -------
    list
    """
    size_method = "length" if axis == Axis.ROW_WISE else "width"
    return [getattr(part, size_method)() for part in parts]

def __len__(self) -> int:
    """
    Return length of index axis.

    Returns
    -------
    int
    """
    # Prefer the already materialized index; otherwise sum the partition heights.
    return len(self.index) if self.has_materialized_index else sum(self.row_lengths)

@property
def column_widths(self):
    """
    Compute the column partitions widths if they are not cached.

    Returns
    -------
    list
        A list of column partitions widths.
    """
    cached = self._column_widths_cache
    if cached is not None:
        return cached
    partitions = self._partitions
    if len(partitions) > 0:
        # The first row of partitions is enough to learn every column width.
        self._column_widths_cache = self._get_lengths(partitions[0], Axis.COL_WISE)
    else:
        self._column_widths_cache = []
    return self._column_widths_cache
def _set_axis_lengths_cache(self, value, axis=0):
    """
    Set the row/column lengths cache for the specified axis.

    Parameters
    ----------
    value : list of ints
    axis : int, default: 0
        0 for row lengths and 1 for column widths.
    """
    if axis == 0:
        self._row_lengths_cache = value
    else:
        self._column_widths_cache = value

def _get_axis_lengths_cache(self, axis=0):
    """
    Get partition's shape caches along the specified axis if available.

    Parameters
    ----------
    axis : int, default: 0
        0 - get row lengths cache, 1 - get column widths cache.

    Returns
    -------
    list of ints or None
        If the cache is computed return a list of ints, ``None`` otherwise.
    """
    return self._row_lengths_cache if axis == 0 else self._column_widths_cache

def _get_axis_lengths(self, axis: int = 0) -> List[int]:
    """
    Get row lengths/column widths.

    Parameters
    ----------
    axis : int, default: 0
        0 - get row lengths, 1 - get column widths.

    Returns
    -------
    list of ints
    """
    return self.row_lengths if axis == 0 else self.column_widths

@property
def has_dtypes_cache(self) -> bool:
    """
    Check if the dtypes cache exists.

    Returns
    -------
    bool
    """
    return self._dtypes is not None

@property
def has_materialized_dtypes(self) -> bool:
    """
    Check if dataframe has materialized dtypes cache.

    Returns
    -------
    bool
    """
    return self.has_dtypes_cache and self._dtypes.is_materialized

def copy_dtypes_cache(self):
    """
    Copy the dtypes cache.

    Returns
    -------
    pandas.Series, callable or None
        A copy of the dtypes cache, or ``None`` if there is no cache.
    """
    dtypes_cache = None
    if self.has_dtypes_cache:
        dtypes_cache = self._dtypes.copy()
    return dtypes_cache

def _maybe_update_proxies(self, dtypes, new_parent=None):
    """
    Update lazy proxies stored inside of `dtypes` with a new parent inplace.

    Parameters
    ----------
    dtypes : pandas.Series, ModinDtypes or callable
    new_parent : object, optional
        A new parent to link the proxies to. If not specified will consider the `self`
        to be a new parent.

    Returns
    -------
    pandas.Series, ModinDtypes or callable
    """
    # Compare against ``None`` explicitly instead of relying on truthiness:
    # this class defines ``__len__``, so an empty frame passed as the parent
    # is falsy and ``new_parent or self`` would silently link the proxies to
    # `self` instead. The truth test would also call ``__len__`` on the
    # parent, which may force row lengths to be computed.
    if new_parent is None:
        new_parent = self
    if isinstance(dtypes, ModinDtypes):
        dtypes = dtypes.maybe_specify_new_frame_ref(new_parent)
    if isinstance(dtypes, pandas.Series):
        LazyProxyCategoricalDtype.update_dtypes(dtypes, new_parent)
    return dtypes
def set_dtypes_cache(self, dtypes):
    """
    Set dtypes cache.

    Parameters
    ----------
    dtypes : pandas.Series, ModinDtypes, callable or None
    """
    dtypes = self._maybe_update_proxies(dtypes)
    if dtypes is None:
        if not self.has_materialized_columns:
            self._dtypes = None
            return
        # try to set a descriptor instead of 'None' to be more flexible in
        # dtypes computing
        try:
            self._dtypes = ModinDtypes(
                DtypesDescriptor(
                    cols_with_unknown_dtypes=self.columns.tolist(), parent_df=self
                )
            )
        except NotImplementedError:
            self._dtypes = None
        return
    # Anything that is not already a `ModinDtypes` gets wrapped into one.
    self._dtypes = dtypes if isinstance(dtypes, ModinDtypes) else ModinDtypes(dtypes)

@property
def dtypes(self):
    """
    Compute the data types if they are not cached.

    Returns
    -------
    pandas.Series
        A pandas Series containing the data types for this dataframe.
    """
    if not self.has_dtypes_cache:
        result = self._compute_dtypes()
        self.set_dtypes_cache(result)
    else:
        result = self._dtypes.get()
    # During materialization, we can find out the backend and, if it
    # is suitable, use the ability to pre-calculate types.
    self._pandas_backend = get_pandas_backend(result)
    return result

def get_dtypes_set(self):
    """
    Get a set of dtypes that are in this dataframe.

    Returns
    -------
    set
    """
    cache = self._dtypes
    if isinstance(cache, ModinDtypes):
        return cache.get_dtypes_set()
    return set(self.dtypes.values)

def _compute_dtypes(self, columns=None) -> pandas.Series:
    """
    Compute the data types via TreeReduce pattern for the specified columns.

    Parameters
    ----------
    columns : list-like, optional
        Columns to compute dtypes for. If not specified compute dtypes for
        all the columns in the dataframe.

    Returns
    -------
    pandas.Series
        A pandas Series containing the data types for this dataframe.

    Raises
    ------
    KeyError
        If some of `columns` are not present in the frame's columns.
    """

    def combine_dtypes(df):
        # Reduce phase: find the common type of each column of collected dtypes.
        return df.apply(lambda col: find_common_type(col.values), axis=0)

    frame = self
    if columns is not None:
        # Sorting positions to request columns in the order they're stored (it's more efficient)
        positions = sorted(self.columns.get_indexer_for(columns))
        if any(pos < 0 for pos in positions):
            raise KeyError(
                f"Some of the columns are not in index: subset={columns}; columns={self.columns}"
            )
        frame = self.take_2d_labels_or_positional(
            col_labels=self.columns[positions].tolist()
        )

    # For now we will use a pandas Series for the dtypes.
    if len(frame.columns) > 0:
        reduced = frame.tree_reduce(0, lambda df: df.dtypes, combine_dtypes)
        result = reduced.to_pandas().iloc[0]
    else:
        result = pandas.Series([])
    # reset name to None because we use MODIN_UNNAMED_SERIES_LABEL internally
    result.name = None
    return result
def set_index_cache(self, index):
    """
    Set index cache.

    Parameters
    ----------
    index : sequence, callable or None
    """
    if index is None:
        # No labels were given: the cache is built from the frame itself.
        cache = ModinIndex(self, axis=0)
    elif isinstance(index, ModinIndex):
        # update reference with the new frame to not pollute memory
        cache = index.maybe_specify_new_frame_ref(self, axis=0)
    else:
        cache = ModinIndex(index)
    self._index_cache = cache

def set_columns_cache(self, columns):
    """
    Set columns cache.

    Parameters
    ----------
    columns : sequence, callable or None
    """
    if columns is None:
        # No labels were given: the cache is built from the frame itself.
        cache = ModinIndex(self, axis=1)
    elif isinstance(columns, ModinIndex):
        # update reference with the new frame to not pollute memory
        cache = columns.maybe_specify_new_frame_ref(self, axis=1)
    else:
        cache = ModinIndex(columns)
    self._columns_cache = cache

def set_axis_cache(self, value, axis=0):
    """
    Set cache for the specified axis (index or columns).

    Parameters
    ----------
    value : sequence, callable or None
    axis : int, default: 0
        0 - set the index cache, anything else - set the columns cache.
    """
    setter = self.set_index_cache if axis == 0 else self.set_columns_cache
    setter(value)
def has_axis_cache(self, axis=0) -> bool:
    """
    Check if the cache for the specified axis exists.

    Parameters
    ----------
    axis : int, default: 0
        0 - check the index cache, anything else - check the columns cache.

    Returns
    -------
    bool
    """
    if axis == 0:
        return self.has_index_cache
    return self.has_columns_cache

@property
def has_index_cache(self):
    """
    Check if the index cache exists.

    Returns
    -------
    bool
    """
    return self._index_cache is not None

def copy_index_cache(self, copy_lengths=False):
    """
    Copy the index cache.

    Parameters
    ----------
    copy_lengths : bool, default: False
        Whether to copy the stored partition lengths to the new index object.

    Returns
    -------
    pandas.Index, callable or ModinIndex
        A copy of the index cache if it exists, the stored value otherwise.
    """
    original = self._index_cache
    if not self.has_index_cache:
        return original
    return self._index_cache.copy(copy_lengths)

def _get_axis_cache(self, axis=0) -> ModinIndex:
    """
    Get axis cache for the specified axis if available.

    Parameters
    ----------
    axis : int, default: 0
        0 - get the index cache, anything else - get the columns cache.

    Returns
    -------
    ModinIndex
    """
    if axis == 0:
        return self._index_cache
    return self._columns_cache

@property
def has_columns_cache(self):
    """
    Check if the columns cache exists.

    Returns
    -------
    bool
    """
    return self._columns_cache is not None

def copy_columns_cache(self, copy_lengths=False):
    """
    Copy the columns cache.

    Parameters
    ----------
    copy_lengths : bool, default: False
        Whether to copy the stored partition lengths to the new index object.

    Returns
    -------
    pandas.Index or None
        A copy of the columns cache, or ``None`` if there is no cache.
    """
    original = self._columns_cache
    if original is None:
        return original
    return original.copy(copy_lengths)

def copy_axis_cache(self, axis=0, copy_lengths=False):
    """
    Copy the axis cache (index or columns).

    Parameters
    ----------
    axis : int, default: 0
        0 - copy the index cache, anything else - copy the columns cache.
    copy_lengths : bool, default: False
        Whether to copy the stored partition lengths to the new index object.

    Returns
    -------
    pandas.Index, callable or None
        The result of copying the cache of the requested axis.
    """
    copier = self.copy_index_cache if axis == 0 else self.copy_columns_cache
    return copier(copy_lengths)
@property
def has_materialized_index(self):
    """
    Check if dataframe has materialized index cache.

    Returns
    -------
    bool
    """
    cache_exists = self.has_index_cache
    return cache_exists and self._index_cache.is_materialized

@property
def has_materialized_columns(self):
    """
    Check if dataframe has materialized columns cache.

    Returns
    -------
    bool
    """
    cache_exists = self.has_columns_cache
    return cache_exists and self._columns_cache.is_materialized

def _validate_set_axis(self, new_labels, old_labels):
    """
    Validate the possibility of replacement of old labels with the new labels.

    Parameters
    ----------
    new_labels : list-like
        The labels to replace with.
    old_labels : list-like
        The labels to replace.

    Returns
    -------
    list-like
        The validated labels, wrapped into ``ModinIndex`` if they were not already.

    Raises
    ------
    ValueError
        If the number of new labels differs from the number of old labels.
    """
    if not isinstance(new_labels, ModinIndex):
        new_labels = ModinIndex(new_labels)
    old_len, new_len = len(old_labels), len(new_labels)
    if old_len != new_len:
        raise ValueError(
            f"Length mismatch: Expected axis has {old_len} elements, new values have {new_len} elements"
        )
    return new_labels

def _get_index(self):
    """
    Get the index from the cache object.

    Returns
    -------
    pandas.Index
        An index object containing the row labels.
    """
    if not self.has_index_cache:
        labels, lengths = self._compute_axis_labels_and_lengths(0)
        self.set_index_cache(labels)
    else:
        labels, lengths = self._index_cache.get(return_lengths=True)
    # Record the row lengths obtained along the way, unless already known.
    if self._row_lengths_cache is None:
        self._row_lengths_cache = lengths
    return labels

def _get_columns(self):
    """
    Get the columns from the cache object.

    Returns
    -------
    pandas.Index
        An index object containing the column labels.
    """
    if not self.has_columns_cache:
        labels, widths = self._compute_axis_labels_and_lengths(1)
        self.set_columns_cache(labels)
    else:
        labels, widths = self._columns_cache.get(return_lengths=True)
    # Record the column widths obtained along the way, unless already known.
    if self._column_widths_cache is None:
        self._column_widths_cache = widths
    return labels
""" if self.has_columns_cache: columns, column_widths = self._columns_cache.get(return_lengths=True) else: columns, column_widths = self._compute_axis_labels_and_lengths(1) self.set_columns_cache(columns) if self._column_widths_cache is None: self._column_widths_cache = column_widths return columns def _set_index(self, new_index): """ Replace the current row labels with new labels. Parameters ---------- new_index : list-like The new row labels. """ if self.has_materialized_index: new_index = self._validate_set_axis(new_index, self._index_cache) self.set_index_cache(new_index) self.synchronize_labels(axis=0) def _set_columns(self, new_columns): """ Replace the current column labels with new labels. Parameters ---------- new_columns : list-like The new column labels. """ if self.has_materialized_columns: # do not set new columns if they're identical to the previous ones if ( isinstance(new_columns, pandas.Index) and self.columns.identical(new_columns) ) or ( not isinstance(new_columns, pandas.Index) and np.array_equal(self.columns.values, new_columns) ): return new_columns = self._validate_set_axis(new_columns, self._columns_cache) if isinstance(self._dtypes, ModinDtypes): try: new_dtypes = self._dtypes.set_index(new_columns) except NotImplementedError: # can raise on duplicated labels new_dtypes = None elif isinstance(self._dtypes, pandas.Series): new_dtypes = self.dtypes.set_axis(new_columns) else: new_dtypes = None self.set_columns_cache(new_columns) # we have to set new dtypes cache after columns, # so the 'self.columns' and 'new_dtypes.index' indices would match self.set_dtypes_cache(new_dtypes) self.synchronize_labels(axis=1) columns = property(_get_columns, _set_columns) index = property(_get_index, _set_index) @property def axes(self): """ Get index and columns that can be accessed with an `axis` integer. Returns ------- list List with two values: index and columns. 
""" return [self.index, self.columns] def get_axis(self, axis: int = 0) -> pandas.Index: """ Get index object for the requested axis. Parameters ---------- axis : {0, 1}, default: 0 Returns ------- pandas.Index """ return self.index if axis == 0 else self.columns def _compute_axis_labels_and_lengths(self, axis: int, partitions=None): """ Compute the labels for specific `axis`. Parameters ---------- axis : int Axis to compute labels along. partitions : np.ndarray, optional A 2D NumPy array of partitions from which labels will be grabbed. If not specified, partitions will be taken from `self._partitions`. Returns ------- pandas.Index Labels for the specified `axis`. List of int Size of partitions alongside specified `axis`. """ if partitions is None: partitions = self._partitions new_index, internal_idx = self._partition_mgr_cls.get_indices(axis, partitions) return new_index, list(map(len, internal_idx)) def _filter_empties(self, compute_metadata=True): """ Remove empty partitions from `self._partitions` to avoid triggering excess computation. Parameters ---------- compute_metadata : bool, default: True Trigger the computations for partition sizes and labels if they're not done already. """ if not compute_metadata and ( self._row_lengths_cache is None or self._column_widths_cache is None ): # do not trigger the computations return if ( self.has_materialized_index and len(self.index) == 0 or self.has_materialized_columns and len(self.columns) == 0 or sum(self.row_lengths) == 0 or sum(self.column_widths) == 0 ): # This is the case for an empty frame. We don't want to completely remove # all metadata and partitions so for the moment, we won't prune if the frame # is empty. 
# TODO: Handle empty dataframes better return self._partitions = np.array( [ [ self._partitions[i][j] for j in range(len(self._partitions[i])) if j < len(self.column_widths) and self.column_widths[j] != 0 ] for i in range(len(self._partitions)) if i < len(self.row_lengths) and self.row_lengths[i] != 0 ] ) new_col_widths = [w for w in self.column_widths if w != 0] new_row_lengths = [r for r in self.row_lengths if r != 0] # check whether an axis partitioning was modified and if we should reset the lengths id for 'ModinIndex' if new_col_widths != self.column_widths: self.set_columns_cache(self.copy_columns_cache(copy_lengths=False)) if new_row_lengths != self.row_lengths: self.set_index_cache(self.copy_index_cache(copy_lengths=False)) self._column_widths_cache = new_col_widths self._row_lengths_cache = new_row_lengths def synchronize_labels(self, axis=None): """ Set the deferred axes variables for the ``PandasDataframe``. Parameters ---------- axis : int, optional The deferred axis. 0 for the index, 1 for the columns. """ if axis is None: self._deferred_index = True self._deferred_column = True elif axis == 0: self._deferred_index = True else: self._deferred_column = True def _propagate_index_objs(self, axis=None) -> None: """ Synchronize labels by applying the index object for specific `axis` to the `self._partitions` lazily. Adds `set_axis` function to call-queue of each partition from `self._partitions` to apply new axis. Parameters ---------- axis : int, optional The axis to apply to. If it's None applies to both axes. """ self._filter_empties(compute_metadata=False) if axis is None or axis == 0: cum_row_lengths = np.cumsum([0] + self.row_lengths) if axis is None or axis == 1: cum_col_widths = np.cumsum([0] + self.column_widths) if axis is None: def apply_idx_objs(df, idx, cols): # We should make at least one copy to avoid the data modification problem # that may arise when sharing buffers from distributed storage # (zero-copy pickling). 
return df.set_axis(idx, axis="index").set_axis( cols, axis="columns", copy=False ) self._partitions = np.array( [ [ self._partitions[i][j].add_to_apply_calls( apply_idx_objs, idx=self.index[ slice(cum_row_lengths[i], cum_row_lengths[i + 1]) ], cols=self.columns[ slice(cum_col_widths[j], cum_col_widths[j + 1]) ], length=self.row_lengths[i], width=self.column_widths[j], ) for j in range(len(self._partitions[i])) ] for i in range(len(self._partitions)) ] ) self._deferred_index = False self._deferred_column = False elif axis == 0: def apply_idx_objs(df, idx): return df.set_axis(idx, axis="index") self._partitions = np.array( [ [ self._partitions[i][j].add_to_apply_calls( apply_idx_objs, idx=self.index[ slice(cum_row_lengths[i], cum_row_lengths[i + 1]) ], length=self.row_lengths[i], width=( self.column_widths[j] if self._column_widths_cache is not None else None ), ) for j in range(len(self._partitions[i])) ] for i in range(len(self._partitions)) ] ) self._deferred_index = False elif axis == 1: def apply_idx_objs(df, cols): return df.set_axis(cols, axis="columns") self._partitions = np.array( [ [ self._partitions[i][j].add_to_apply_calls( apply_idx_objs, cols=self.columns[ slice(cum_col_widths[j], cum_col_widths[j + 1]) ], length=( self.row_lengths[i] if self._row_lengths_cache is not None else None ), width=self.column_widths[j], ) for j in range(len(self._partitions[i])) ] for i in range(len(self._partitions)) ] ) self._deferred_column = False else: ErrorMessage.catch_bugs_and_request_email( axis is not None and axis not in [0, 1] ) @lazy_metadata_decorator(apply_axis=None) def take_2d_labels_or_positional( self, row_labels: Optional[List[Hashable]] = None, row_positions: Optional[List[int]] = None, col_labels: Optional[List[Hashable]] = None, col_positions: Optional[List[int]] = None, ) -> PandasDataframe: """ Lazily select columns or rows from given indices. Parameters ---------- row_labels : list of hashable, optional The row labels to extract. 
row_positions : list-like of ints, optional The row positions to extract. col_labels : list of hashable, optional The column labels to extract. col_positions : list-like of ints, optional The column positions to extract. Returns ------- PandasDataframe A new PandasDataframe from the mask provided. Notes ----- If both `row_labels` and `row_positions` are provided, a ValueError is raised. The same rule applies for `col_labels` and `col_positions`. """ if check_both_not_none(row_labels, row_positions): raise ValueError( "Both row_labels and row_positions were provided - " + "please provide only one of row_labels and row_positions." ) if check_both_not_none(col_labels, col_positions): raise ValueError( "Both col_labels and col_positions were provided - " + "please provide only one of col_labels and col_positions." ) if row_labels is not None: # Get numpy array of positions of values from `row_labels` if isinstance(self.index, pandas.MultiIndex): row_positions = np.zeros(len(row_labels), dtype="int64") # we can't use .get_locs(row_labels) because the function # requires a different format for row_labels for idx, label in enumerate(row_labels): if isinstance(label, str): label = [label] # get_loc can return slice that _take_2d_positional can't handle row_positions[idx] = self.index.get_locs(label)[0] else: row_positions = self.index.get_indexer_for(row_labels) if col_labels is not None: # Get numpy array of positions of values from `col_labels` if isinstance(self.columns, pandas.MultiIndex): col_positions = np.zeros(len(col_labels), dtype="int64") # we can't use .get_locs(col_labels) because the function # requires a different format for row_labels for idx, label in enumerate(col_labels): if isinstance(label, str): label = [label] # get_loc can return slice that _take_2d_positional can't handle col_positions[idx] = self.columns.get_locs(label)[0] else: col_positions = self.columns.get_indexer_for(col_labels) return self._take_2d_positional(row_positions, col_positions) 
def _get_sorted_positions(self, positions):
    """
    Sort positions if necessary.

    Range-like positions with a positive step are returned as is,
    everything else is sorted with ``np.sort``.

    Parameters
    ----------
    positions : Sequence[int]

    Returns
    -------
    Sequence[int]
    """
    # Helper for take_2d_positional
    if is_range_like(positions) and positions.step > 0:
        sorted_positions = positions
    else:
        sorted_positions = np.sort(positions)
    return sorted_positions

def _get_new_lengths(self, partitions_dict, *, axis: int) -> List[int]:
    """
    Find lengths of new partitions.

    Parameters
    ----------
    partitions_dict : dict
        Mapping from partition index to the indexer (slice or sized sequence)
        applied to that partition.
    axis : int
        0 to use the row lengths, any other value to use the column widths.

    Returns
    -------
    list[int]
        One length per item of `partitions_dict`, in its iteration order.
    """
    # Helper for take_2d_positional
    if axis == 0:
        axis_lengths = self.row_lengths
    else:
        axis_lengths = self.column_widths

    new_lengths = [
        len(
            # Row lengths for slice are calculated as the length of the slice
            # on the partition. Often this will be the same length as the current
            # length, but sometimes it is different, thus the extra calculation.
            range(*part_indexer.indices(axis_lengths[part_idx]))
            if isinstance(part_indexer, slice)
            else part_indexer
        )
        for part_idx, part_indexer in partitions_dict.items()
    ]
    return new_lengths

def _get_new_index_obj(
    self, positions, sorted_positions, axis: int
) -> tuple[pandas.Index, slice | npt.NDArray[np.intp]]:
    """
    Find the new Index object for take_2d_positional result.

    Parameters
    ----------
    positions : Sequence[int]
        The requested positions in the original order.
    sorted_positions : Sequence[int]
        The same positions sorted in ascending order.
    axis : int
        0 to index the row labels, any other value to index the column labels.

    Returns
    -------
    pandas.Index
        The labels of the specified axis taken at the monotonic indexer.
    slice or Sequence[int]
        The monotonic indexer that was used to take the labels.
    """
    # Helper for take_2d_positional
    # Use the slice to calculate the new columns
    if axis == 0:
        idx = self.index
    else:
        idx = self.columns

    # TODO: Support fast processing of negative-step ranges
    if is_range_like(positions) and positions.step > 0:
        # pandas Index is more likely to preserve its metadata if the indexer
        # is slice
        monotonic_idx = slice(positions.start, positions.stop, positions.step)
    else:
        monotonic_idx = np.asarray(sorted_positions, dtype=np.intp)

    new_idx = idx[monotonic_idx]
    return new_idx, monotonic_idx

def _take_2d_positional(
    self,
    row_positions: Optional[List[int]] = None,
    col_positions: Optional[List[int]] = None,
) -> PandasDataframe:
    """
    Lazily select columns or rows from given indices.

    Parameters
    ----------
    row_positions : list-like of ints, optional
        The row positions to extract.
    col_positions : list-like of ints, optional
        The column positions to extract.

    Returns
    -------
    PandasDataframe
        A new PandasDataframe from the mask provided.

    Notes
    -----
    A ``None`` indexer means a full-axis access. If both indexers are
    (or are normalized to) ``None``, a copy of the frame is returned.
    """
    # Normalize both indexers: full-axis ranges become `None`, python ranges
    # become `pandas.RangeIndex` and python lists become NumPy arrays
    indexers = []
    for axis, indexer in enumerate((row_positions, col_positions)):
        if is_range_like(indexer):
            if indexer.step == 1 and len(indexer) == len(self.get_axis(axis)):
                # By this function semantics, `None` indexer is a full-axis access
                indexer = None
            elif indexer is not None and not isinstance(indexer, pandas.RangeIndex):
                # Pure python's range is not fully compatible with a list of ints,
                # converting it to ``pandas.RangeIndex`` that is compatible.
                indexer = pandas.RangeIndex(
                    indexer.start, indexer.stop, indexer.step
                )
        else:
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=not (indexer is None or is_list_like(indexer)),
                extra_log="Mask takes only list-like numeric indexers, "
                + f"received: {type(indexer)}",
            )
            if isinstance(indexer, list):
                indexer = np.array(indexer, dtype=np.int64)
        indexers.append(indexer)
    row_positions, col_positions = indexers

    if col_positions is None and row_positions is None:
        return self.copy()

    # quite fast check that allows skip sorting
    must_sort_row_pos = row_positions is not None and not np.all(
        row_positions[1:] >= row_positions[:-1]
    )
    must_sort_col_pos = col_positions is not None and not np.all(
        col_positions[1:] >= col_positions[:-1]
    )

    if col_positions is None and row_positions is not None:
        # Check if the optimization that first takes part of the data using the mask
        # operation so that later less data is concatenated into a whole column is useful.
        # In the case when only a small portion of the data is discarded, the overhead of the
        # engine (for putting data in and out of storage) can exceed the resulting speedup.
        all_rows = None
        if self.has_materialized_index:
            all_rows = len(self.index)
        elif self._row_lengths_cache or must_sort_row_pos:
            all_rows = sum(self.row_lengths)
        # NOTE: `all_rows` is always computed when `must_sort_row_pos` is True,
        # so the second operand of the `or` below never sees `all_rows=None`

        # 'base_num_cols' specifies the number of columns that the dataframe should have
        # in order to jump to 'reordered_labels' in case of len(row_positions) / len(self) >= base_ratio;
        # these variables may be a subject to change in order to tune performance more accurately
        base_num_cols = 10
        base_ratio = 0.2
        # Example:
        #   len(self.columns): 10 == base_num_cols -> min ratio to jump to reorder_labels: 0.2 == base_ratio
        #   len(self.columns): 15 -> min ratio to jump to reorder_labels: 0.3
        #   len(self.columns): 20 -> min ratio to jump to reorder_labels: 0.4
        #   ...
        #   len(self.columns): 49 -> min ratio to jump to reorder_labels: 0.98
        #   len(self.columns): 50 -> min ratio to jump to reorder_labels: 1.0
        #   len(self.columns): 55 -> min ratio to jump to reorder_labels: 1.0
        #   ...
        if (all_rows and len(row_positions) > 0.9 * all_rows) or (
            must_sort_row_pos
            and len(row_positions) * base_num_cols
            >= min(
                all_rows * len(self.columns) * base_ratio,
                len(row_positions) * base_num_cols,
            )
        ):
            return self._reorder_labels(
                row_positions=row_positions, col_positions=col_positions
            )
    sorted_row_positions = sorted_col_positions = None

    if row_positions is not None:
        if must_sort_row_pos:
            sorted_row_positions = self._get_sorted_positions(row_positions)
        else:
            sorted_row_positions = row_positions
        # Get dict of row_parts as {row_index: row_internal_indices}
        row_partitions_dict = self._get_dict_of_block_index(
            0, sorted_row_positions, are_indices_sorted=True
        )
        new_row_lengths = self._get_new_lengths(row_partitions_dict, axis=0)
        new_index, _ = self._get_new_index_obj(
            row_positions, sorted_row_positions, axis=0
        )
    else:
        # Full-axis access: take every row partition as a whole and reuse the metadata
        row_partitions_dict = {i: slice(None) for i in range(len(self._partitions))}
        new_row_lengths = self._row_lengths_cache
        new_index = self.copy_index_cache(copy_lengths=True)

    if col_positions is not None:
        if must_sort_col_pos:
            sorted_col_positions = self._get_sorted_positions(col_positions)
        else:
            sorted_col_positions = col_positions
        # Get dict of col_parts as {col_index: col_internal_indices}
        col_partitions_dict = self._get_dict_of_block_index(
            1, sorted_col_positions, are_indices_sorted=True
        )
        new_col_widths = self._get_new_lengths(col_partitions_dict, axis=1)
        new_columns, monotonic_col_idx = self._get_new_index_obj(
            col_positions, sorted_col_positions, axis=1
        )

        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=sum(new_col_widths) != len(new_columns),
            extra_log=f"{sum(new_col_widths)} != {len(new_columns)}.\n"
            + f"{col_positions}\n{self.column_widths}\n{col_partitions_dict}",
        )

        if self.has_materialized_dtypes:
            new_dtypes = self.dtypes.iloc[monotonic_col_idx]
        elif isinstance(self._dtypes, ModinDtypes):
            try:
                supported_monotonic_col_idx = monotonic_col_idx
                if isinstance(monotonic_col_idx, slice):
                    # `lazy_get` is given an explicit list of positions instead of a slice
                    supported_monotonic_col_idx = pandas.RangeIndex(
                        monotonic_col_idx.start,
                        monotonic_col_idx.stop,
                        monotonic_col_idx.step,
                    ).to_list()
                new_dtypes = self._dtypes.lazy_get(
                    supported_monotonic_col_idx, numeric_index=True
                )
            # can raise either on missing cache or on duplicated labels
            except (ValueError, NotImplementedError):
                new_dtypes = None
        else:
            new_dtypes = None
    else:
        # Full-axis access: take every column partition as a whole and reuse the metadata
        col_partitions_dict = {
            i: slice(None) for i in range(len(self._partitions.T))
        }
        new_col_widths = self._column_widths_cache
        new_columns = self.copy_columns_cache(copy_lengths=True)
        new_dtypes = self.copy_dtypes_cache()

    new_partitions = np.array(
        [
            [
                self._partitions[row_idx][col_idx].mask(
                    row_internal_indices, col_internal_indices
                )
                for col_idx, col_internal_indices in col_partitions_dict.items()
            ]
            for row_idx, row_internal_indices in row_partitions_dict.items()
        ]
    )
    intermediate = self.__constructor__(
        new_partitions,
        new_index,
        new_columns,
        new_row_lengths,
        new_col_widths,
        new_dtypes,
        pandas_backend=self._pandas_backend,
    )

    # The data was taken in sorted order, restore the requested order if needed
    return self._maybe_reorder_labels(
        intermediate,
        row_positions,
        col_positions,
    )

def _maybe_reorder_labels(
    self,
    intermediate: PandasDataframe,
    row_positions,
    col_positions,
) -> PandasDataframe:
    """
    Call re-order labels on take_2d_labels_or_positional result if necessary.

    Parameters
    ----------
    intermediate : PandasDataframe
        The frame with the data taken in ascending order of positions.
    row_positions : list-like of ints, optional
        The row positions to extract.
    col_positions : list-like of ints, optional
        The column positions to extract.

    Returns
    -------
    PandasDataframe
        `intermediate` itself if the requested positions are already ordered,
        otherwise a re-ordered frame.
    """
    # Check if monotonically increasing, return if it is. Fast track code path for
    # common case to keep it fast.
    if (
        row_positions is None
        # Fast range processing of non-positive-step ranges is not yet supported
        or (is_range_like(row_positions) and row_positions.step > 0)
        or len(row_positions) == 1
        or np.all(row_positions[1:] >= row_positions[:-1])
    ) and (
        col_positions is None
        # Fast range processing of non-positive-step ranges is not yet supported
        or (is_range_like(col_positions) and col_positions.step > 0)
        or len(col_positions) == 1
        or np.all(col_positions[1:] >= col_positions[:-1])
    ):
        return intermediate

    # The new labels are often smaller than the old labels, so we can't reuse the
    # original order values because those were mapped to the original data. We have
    # to reorder here based on the expected order from within the data.
    # To do so, we "unsort" the indices by using np.argsort() twice, as inspired by
    # https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python,
    # meaning that `new_row_order` must be so `np.sort(row_positions)[new_row_order] == row_positions`
    # This is achieved by first calculating the indices which would sort `row_positions`,
    # and then by calculating new indices that would sort "sorting indices" themselves.
    # First argsort brings us to the proper "index space" (according to smaller labels count),
    # and the second re-orders them to match the original data.
    new_row_order, new_col_order = None, None
    if is_range_like(row_positions):
        if row_positions.step < 0:
            # do not need to re-order positive-step-ranges
            new_row_order = pandas.RangeIndex(len(row_positions) - 1, -1, -1)
    elif row_positions is not None:
        new_row_order = np.argsort(
            np.argsort(np.asarray(row_positions, dtype=np.intp))
        )

    if is_range_like(col_positions):
        if col_positions.step < 0:
            # do not need to re-order positive-step-ranges
            new_col_order = pandas.RangeIndex(len(col_positions) - 1, -1, -1)
    elif col_positions is not None:
        new_col_order = np.argsort(
            np.argsort(np.asarray(col_positions, dtype=np.intp))
        )
    return intermediate._reorder_labels(
        row_positions=new_row_order, col_positions=new_col_order
    )

@lazy_metadata_decorator(apply_axis="rows")
def from_labels(self) -> PandasDataframe:
    """
    Convert the row labels to a column of data, inserted at the first position.

    Gives result by similar way as `pandas.DataFrame.reset_index`. Each level
    of `self.index` will be added as separate column of data.

    Returns
    -------
    PandasDataframe
        A PandasDataframe with new columns from index labels.
    """
    new_row_labels = pandas.RangeIndex(len(self.index))
    # Unnamed levels get the "level_<i>" names; a single unnamed level is called
    # "index" unless such a column already exists
    if self.index.nlevels > 1:
        level_names = [
            (
                self.index.names[i]
                if self.index.names[i] is not None
                else "level_{}".format(i)
            )
            for i in range(self.index.nlevels)
        ]
    else:
        level_names = [
            (
                self.index.names[0]
                if self.index.names[0] is not None
                else (
                    "index"
                    if "index" not in self.columns
                    else "level_{}".format(0)
                )
            )
        ]
    names = tuple(level_names) if len(level_names) > 1 else level_names[0]
    new_dtypes = self.index.to_frame(name=names).dtypes
    try:
        new_dtypes = ModinDtypes.concat([new_dtypes, self._dtypes])
    except NotImplementedError:
        # can raise on duplicated labels
        new_dtypes = None

    # We will also use the `new_column_names` in the calculation of the internal metadata, so this is a
    # lightweight way of ensuring the metadata matches.
    if self.columns.nlevels > 1:
        # Column labels are different for multilevel index.
        new_column_names = pandas.MultiIndex.from_tuples(
            # Set level names on the 1st columns level and fill up empty level names with empty string.
            # Expand tuples in level names. This is how reset_index works when col_level col_fill are not specified.
            [
                tuple(
                    list(level) + [""] * (self.columns.nlevels - len(level))
                    if isinstance(level, tuple)
                    else [level] + [""] * (self.columns.nlevels - 1)
                )
                for level in level_names
            ],
            names=self.columns.names,
        )
    else:
        new_column_names = pandas.Index(level_names, tupleize_cols=False)
    new_columns = new_column_names.append(self.columns)

    def from_labels_executor(
        df: pandas.DataFrame, **kwargs
    ) -> pandas.DataFrame:  # pragma: no cover
        # Setting the names here ensures that external and internal metadata always match.
        df.index.names = new_column_names

        # Handling of a case when columns have the same name as one of index levels names.
        # In this case `df.reset_index` provides errors related to columns duplication.
        # This case is possible because columns metadata updating is deferred. To workaround
        # `df.reset_index` error we allow columns duplication in "if" branch via `concat`.
        if any(name_level in df.columns for name_level in df.index.names):
            columns_to_add = df.index.to_frame()
            columns_to_add.reset_index(drop=True, inplace=True)
            df = df.reset_index(drop=True)
            result = pandas.concat([columns_to_add, df], axis=1, copy=False)
        else:
            result = df.reset_index()
        # Put the index back to the original due to GH#4394
        result.index = df.index
        return result

    # Only the column partitions at position 0 are processed by the executor,
    # the remaining ones are kept
    new_parts = self._partition_mgr_cls.apply_func_to_select_indices(
        0,
        self._partitions,
        from_labels_executor,
        [0],
        keep_remaining=True,
    )
    # The first column partition grows by the number of the index levels
    new_column_widths = [
        self.index.nlevels + self.column_widths[0]
    ] + self.column_widths[1:]
    result = self.__constructor__(
        new_parts,
        new_row_labels,
        new_columns,
        row_lengths=self._row_lengths_cache,
        column_widths=new_column_widths,
        dtypes=new_dtypes,
        pandas_backend=self._pandas_backend,
    )
    # Set flag for propagating deferred row labels across dataframe partitions
    result.synchronize_labels(axis=0)
    return result

def to_labels(self, column_list: List[Hashable]) -> PandasDataframe:
    """
    Move one or more columns into the row labels. Previous labels are dropped.

    Parameters
    ----------
    column_list : list of hashable
        The list of column names to place as the new row labels.

    Returns
    -------
    PandasDataframe
        A new PandasDataframe that has the updated labels.

    Notes
    -----
    The selected columns are materialized with ``to_pandas()`` in order to
    build the new index.
    """
    extracted_columns = self.take_2d_labels_or_positional(
        col_labels=column_list
    ).to_pandas()

    if len(column_list) == 1:
        new_labels = pandas.Index(
            extracted_columns.squeeze(axis=1), name=column_list[0]
        )
    else:
        new_labels = pandas.MultiIndex.from_frame(
            extracted_columns, names=column_list
        )
    # Keep only the columns that were not moved to the index
    result = self.take_2d_labels_or_positional(
        col_labels=[i for i in self.columns if i not in extracted_columns.columns]
    )
    result.index = new_labels
    return result

@lazy_metadata_decorator(apply_axis="both")
def _reorder_labels(self, row_positions=None, col_positions=None):
    """
    Reorder the column and or rows in this DataFrame.

    Parameters
    ----------
    row_positions : list of int, optional
        The ordered list of new row orders such that each position within the list
        indicates the new position.
    col_positions : list of int, optional
        The ordered list of new column orders such that each position within the
        list indicates the new position.

    Returns
    -------
    PandasDataframe
        A new PandasDataframe with reordered columns and/or rows.
    """
    new_dtypes = self.copy_dtypes_cache()
    if row_positions is not None:
        # We want to preserve the frame's partitioning so passing in ``keep_partitioning=True``
        # in order to use the cached `row_lengths` values for the new frame.
        # If the frame's is re-partitioned using the "standard" partitioning,
        # then knowing that, we can compute new row lengths.
        ordered_rows = self._partition_mgr_cls.map_axis_partitions(
            0,
            self._partitions,
            lambda df: df.iloc[row_positions],
            keep_partitioning=True,
        )
        row_idx = self.index[row_positions]

        if len(row_idx) != len(self.index):
            # The frame was re-partitioned along the 0 axis during reordering using
            # the "standard" partitioning. Knowing the standard partitioning scheme
            # we are able to compute new row lengths.
            new_lengths = get_length_list(
                axis_len=len(row_idx),
                num_splits=ordered_rows.shape[0],
                min_block_size=MinRowPartitionSize.get(),
            )
        else:
            # If the frame's partitioning was preserved then
            # we can use previous row lengths cache
            new_lengths = self._row_lengths_cache
    else:
        ordered_rows = self._partitions
        row_idx = self.copy_index_cache(copy_lengths=True)
        new_lengths = self._row_lengths_cache
    if col_positions is not None:
        # We want to preserve the frame's partitioning so passing in ``keep_partitioning=True``
        # in order to use the cached `column_widths` values for the new frame.
        # If the frame's is re-partitioned using the "standard" partitioning,
        # then knowing that, we can compute new column widths.
        ordered_cols = self._partition_mgr_cls.map_axis_partitions(
            1,
            ordered_rows,
            lambda df: df.iloc[:, col_positions],
            keep_partitioning=True,
        )
        col_idx = self.columns[col_positions]
        if self.has_materialized_dtypes:
            new_dtypes = self.dtypes.iloc[col_positions]
        elif isinstance(self._dtypes, ModinDtypes):
            try:
                new_dtypes = self._dtypes.lazy_get(col_idx)
            # can raise on duplicated labels
            except NotImplementedError:
                new_dtypes = None

        if len(col_idx) != len(self.columns):
            # The frame was re-partitioned along the 1 axis during reordering using
            # the "standard" partitioning. Knowing the standard partitioning scheme
            # we are able to compute new column widths.
            new_widths = get_length_list(
                axis_len=len(col_idx),
                num_splits=ordered_cols.shape[1],
                min_block_size=MinColumnPartitionSize.get(),
            )
        else:
            # If the frame's partitioning was preserved then
            # we can use previous column widths cache
            new_widths = self._column_widths_cache
    else:
        ordered_cols = ordered_rows
        col_idx = self.copy_columns_cache(copy_lengths=True)
        new_widths = self._column_widths_cache
    return self.__constructor__(
        ordered_cols,
        row_idx,
        col_idx,
        new_lengths,
        new_widths,
        new_dtypes,
        pandas_backend=self._pandas_backend,
    )

@lazy_metadata_decorator(apply_axis=None)
def copy(self):
    """
    Copy this object.

    The partitions array and the lengths caches are passed to the new frame as is,
    while the index, columns and dtypes caches are copied.

    Returns
    -------
    PandasDataframe
        A copied version of this object.
    """
    return self.__constructor__(
        self._partitions,
        self.copy_index_cache(copy_lengths=True),
        self.copy_columns_cache(copy_lengths=True),
        self._row_lengths_cache,
        self._column_widths_cache,
        self.copy_dtypes_cache(),
        pandas_backend=self._pandas_backend,
    )

@lazy_metadata_decorator(apply_axis="both")
def astype(self, col_dtypes, errors: str = "raise"):
    """
    Convert the columns dtypes to given dtypes.

    Parameters
    ----------
    col_dtypes : dictionary of {col: dtype,...} or str
        Where col is the column name and dtype is a NumPy dtype.
    errors : {'raise', 'ignore'}, default: 'raise'
        Control raising of exceptions on invalid data for provided dtype.

    Returns
    -------
    BaseDataFrame
        Dataframe with updated dtypes. If no dtype has to be changed,
        a copy of the frame is returned.
    """
    # Stays `None` until the first column that actually changes its dtype is found
    new_dtypes = None
    self_dtypes = self.dtypes
    # When casting to "category" we have to make up the whole axis partition
    # to get the properly encoded table of categories. Every block partition
    # will store the encoded table. That can lead to higher memory footprint.
    # TODO: Revisit if this hurts users.
    use_full_axis_cast = False
    if isinstance(col_dtypes, dict):
        for column, dtype in col_dtypes.items():
            if not is_dtype_equal(dtype, self_dtypes[column]):
                if new_dtypes is None:
                    new_dtypes = self_dtypes.copy()
                # Update the new dtype series to the proper pandas dtype
                new_dtype = pandas.api.types.pandas_dtype(dtype)
                if self.engine == "Dask" and hasattr(dtype, "_is_materialized"):
                    # FIXME: https://github.com/dask/distributed/issues/8585
                    _ = dtype._materialize_categories()

                # We cannot infer without computing the dtype if new dtype is categorical
                if isinstance(new_dtype, pandas.CategoricalDtype):
                    new_dtypes[column] = LazyProxyCategoricalDtype._build_proxy(
                        # Actual parent will substitute `None` at `.set_dtypes_cache`
                        parent=None,
                        column_name=column,
                        materializer=lambda parent, column: parent._compute_dtypes(
                            columns=[column]
                        )[column],
                    )
                    use_full_axis_cast = True
                else:
                    new_dtypes[column] = new_dtype

        def astype_builder(df):
            """Compute new partition frame with dtypes updated."""
            # Cast only those columns that are present in this partition
            return df.astype(
                {k: v for k, v in col_dtypes.items() if k in df}, errors=errors
            )

    else:
        # Assume that the dtype is a scalar.
        if not (self_dtypes == col_dtypes).all():
            new_dtypes = self_dtypes.copy()
            new_dtype = pandas.api.types.pandas_dtype(col_dtypes)
            if self.engine == "Dask" and hasattr(new_dtype, "_is_materialized"):
                # FIXME: https://github.com/dask/distributed/issues/8585
                _ = new_dtype._materialize_categories()
            if isinstance(new_dtype, pandas.CategoricalDtype):
                new_dtypes[:] = new_dtypes.to_frame().apply(
                    lambda column: LazyProxyCategoricalDtype._build_proxy(
                        # Actual parent will substitute `None` at `.set_dtypes_cache`
                        parent=None,
                        column_name=column.index[0],
                        materializer=lambda parent, column: parent._compute_dtypes(
                            columns=[column]
                        )[column],
                    )
                )[0]
                use_full_axis_cast = True
            else:
                new_dtypes[:] = new_dtype

        def astype_builder(df):
            """Compute new partition frame with dtypes updated."""
            return df.astype(col_dtypes, errors=errors)

    if new_dtypes is None:
        # Nothing to cast
        return self.copy()

    if use_full_axis_cast:
        new_frame = self._partition_mgr_cls.map_axis_partitions(
            0, self._partitions, astype_builder, keep_partitioning=True
        )
    else:
        new_frame = self._partition_mgr_cls.lazy_map_partitions(
            self._partitions, astype_builder
        )
    return self.__constructor__(
        new_frame,
        self.copy_index_cache(copy_lengths=True),
        self.copy_columns_cache(copy_lengths=True),
        self._row_lengths_cache,
        self._column_widths_cache,
        new_dtypes,
        pandas_backend=get_pandas_backend(new_dtypes),
    )

def numeric_columns(self, include_bool=True):
    """
    Return the names of numeric columns in the frame.

    Parameters
    ----------
    include_bool : bool, default: True
        Whether to consider boolean columns as numeric.

    Returns
    -------
    list
        List of column names.
    """
    columns = []
    for col, dtype in zip(self.columns, self.dtypes):
        # NOTE(review): the `not include_bool` test is redundant after `include_bool or`
        if is_numeric_dtype(dtype) and (
            include_bool or (not include_bool and dtype != np.bool_)
        ):
            columns.append(col)
    return columns

def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False):
    """
    Convert indices to an ordered dict mapping partition (or block) index to internal indices in said partition.

    Parameters
    ----------
    axis : {0, 1}
        The axis along which to get the indices (0 - rows, 1 - columns).
    indices : list of int, slice
        A list of global indices to convert.
    are_indices_sorted : bool, default: False
        Flag indicating whether the `indices` sequence is sorted by ascending or not.
        Note: the internal algorithm requires for the `indices` to be sorted, this
        flag is used for optimization in order to not sort already sorted data.
        Be careful when passing ``True`` for this flag, if the data appears to be unsorted
        with the flag set to ``True`` this would lead to undefined behavior.

    Returns
    -------
    dict
        A mapping from partition index to the internal indices which correspond to
        `indices` in each partition. The values are slices on the slice fast path
        and NumPy arrays otherwise.
    """
    # TODO: Support handling of slices with specified 'step'. For now, converting them into a range
    if isinstance(indices, slice) and (
        indices.step is not None and indices.step != 1
    ):
        indices = range(*indices.indices(len(self.get_axis(axis))))
    # Fasttrack slices
    if isinstance(indices, slice) or (is_range_like(indices) and indices.step == 1):
        # Converting range-like indexer to slice
        indices = slice(indices.start, indices.stop, indices.step)
        if is_full_grab_slice(indices, sequence_len=len(self.get_axis(axis))):
            return dict(
                zip(
                    range(self._partitions.shape[axis]),
                    [slice(None)] * self._partitions.shape[axis],
                )
            )
        # Empty selection case
        if indices.start == indices.stop and indices.start is not None:
            return dict()
        if indices.start is None or indices.start == 0:
            # Slice from the beginning: locate only the partition that holds `stop`
            last_part, last_idx = list(
                self._get_dict_of_block_index(axis, [indices.stop]).items()
            )[0]
            dict_of_slices = dict(zip(range(last_part), [slice(None)] * last_part))
            dict_of_slices.update({last_part: slice(last_idx[0])})
            return dict_of_slices
        elif indices.stop is None or indices.stop >= len(self.get_axis(axis)):
            # Slice up to the end: locate only the partition that holds `start`
            first_part, first_idx = list(
                self._get_dict_of_block_index(axis, [indices.start]).items()
            )[0]
            dict_of_slices = dict({first_part: slice(first_idx[0], None)})
            num_partitions = np.size(self._partitions, axis=axis)
            part_list = range(first_part + 1, num_partitions)
            dict_of_slices.update(
                dict(zip(part_list, [slice(None)] * len(part_list)))
            )
            return dict_of_slices
        else:
            # Slice in the middle: locate the partitions of both `start` and `stop`
            first_part, first_idx = list(
                self._get_dict_of_block_index(axis, [indices.start]).items()
            )[0]
            last_part, last_idx = list(
                self._get_dict_of_block_index(axis, [indices.stop]).items()
            )[0]
            if first_part == last_part:
                return dict({first_part: slice(first_idx[0], last_idx[0])})
            else:
                if last_part - first_part == 1:
                    return dict(
                        # FIXME: this dictionary creation feels wrong - it might not maintain the order
                        {
                            first_part: slice(first_idx[0], None),
                            last_part: slice(None, last_idx[0]),
                        }
                    )
                else:
                    dict_of_slices = dict({first_part: slice(first_idx[0], None)})
                    part_list = range(first_part + 1, last_part)
                    dict_of_slices.update(
                        dict(zip(part_list, [slice(None)] * len(part_list)))
                    )
                    dict_of_slices.update({last_part: slice(None, last_idx[0])})
                    return dict_of_slices
    if isinstance(indices, list):
        # Converting python list to numpy for faster processing
        indices = np.array(indices, dtype=np.int64)
    # Fasttrack empty numpy array
    if isinstance(indices, np.ndarray) and indices.size == 0:
        # This will help preserve metadata stored in empty dataframes (indexes and dtypes)
        # Otherwise, we will get an empty `new_partitions` array, from which it will
        #  no longer be possible to obtain metadata
        return dict([(0, np.array([], dtype=np.int64))])
    negative_mask = np.less(indices, 0)
    has_negative = np.any(negative_mask)
    if has_negative:
        # We're going to modify 'indices' inplace in a numpy way, so doing a copy/converting indices to numpy.
        indices = (
            indices.copy()
            if isinstance(indices, np.ndarray)
            else np.array(indices, dtype=np.int64)
        )
        # Map negative positions to their non-negative equivalents
        indices[negative_mask] = indices[negative_mask] % len(self.get_axis(axis))
    # If the `indices` array was modified because of the negative indices conversion
    # then the original order was broken and so we have to sort anyway:
    if has_negative or not are_indices_sorted:
        indices = np.sort(indices)
    if axis == 0:
        bins = np.array(self.row_lengths)
    else:
        bins = np.array(self.column_widths)
    # INT_MAX to make sure we don't try to compute on partitions that don't exist.
    cumulative = np.append(bins[:-1].cumsum(), np.iinfo(bins.dtype).max)

    def internal(block_idx: int, global_index):
        """Transform global index to internal one for given block (identified by its index)."""
        return (
            global_index
            if not block_idx
            else np.subtract(
                global_index, cumulative[min(block_idx, len(cumulative) - 1) - 1]
            )
        )

    # Bin every global index by the cumulative partition boundaries
    partition_ids = np.digitize(indices, cumulative)
    # Running total of how many indices fall into partitions 0..i
    count_for_each_partition = np.array(
        [(partition_ids == i).sum() for i in range(len(cumulative))]
    ).cumsum()
    # Compute the internal indices and pair those with the partition index.
    # If the first partition has any values we need to return, compute those
    # first to make the list comprehension easier. Otherwise, just append the
    # rest of the values to an empty list.
    if count_for_each_partition[0] > 0:
        first_partition_indices = [
            (0, internal(0, indices[slice(count_for_each_partition[0])]))
        ]
    else:
        first_partition_indices = []
    partition_ids_with_indices = first_partition_indices + [
        (
            i,
            internal(
                i,
                indices[
                    slice(
                        count_for_each_partition[i - 1],
                        count_for_each_partition[i],
                    )
                ],
            ),
        )
        for i in range(1, len(count_for_each_partition))
        # skip the partitions that received no indices
        if count_for_each_partition[i] > count_for_each_partition[i - 1]
    ]
    return dict(partition_ids_with_indices)

@staticmethod
def _join_index_objects(axis, indexes, how, sort, fill_value=None):
    """
    Join the pair of index objects (columns or rows) by a given strategy.
Unlike Index.join() in pandas, if `axis` is 1, `sort` is False, and `how` is "outer", the result will _not_ be sorted. Parameters ---------- axis : {0, 1} The axis index object to join (0 - rows, 1 - columns). indexes : list(Index) The indexes to join on. how : {'left', 'right', 'inner', 'outer', None} The type of join to join to make. If `None` then joined index considered to be the first index in the `indexes` list. sort : boolean Whether or not to sort the joined index. fill_value : any, optional Value to use for missing values. Returns ------- (Index, func) Joined index with make_reindexer func. """ assert isinstance(indexes, list) # define helper functions def merge(left_index, right_index): """Combine a pair of indices depending on `axis`, `how` and `sort` from outside.""" if axis == 1 and how == "outer" and not sort: return left_index.union(right_index, sort=False) else: return left_index.join(right_index, how=how, sort=sort) # define condition for joining indexes all_indices_equal = all(indexes[0].equals(index) for index in indexes[1:]) do_join_index = how is not None and not all_indices_equal # define condition for joining indexes with getting indexers need_indexers = ( axis == 0 and not all_indices_equal and any(not index.is_unique for index in indexes) ) indexers = None # perform joining indexes if do_join_index: if len(indexes) == 2 and need_indexers: # in case of count of indexes > 2 we should perform joining all indexes # after that get indexers # in the fast path we can obtain joined_index and indexers in one call indexers = [None, None] joined_index, indexers[0], indexers[1] = indexes[0].join( indexes[1], how=how, sort=sort, return_indexers=True ) else: joined_index = indexes[0] # TODO: revisit for performance for index in indexes[1:]: joined_index = merge(joined_index, index) else: joined_index = indexes[0].copy() if need_indexers and indexers is None: indexers = [index.get_indexer_for(joined_index) for index in indexes] def 
make_reindexer(do_reindex: bool, frame_idx: int): """Create callback that reindexes the dataframe using newly computed index.""" # the order of the frames must match the order of the indexes if not do_reindex: return lambda df: df if need_indexers: assert indexers is not None return lambda df: df._reindex_with_indexers( {0: [joined_index, indexers[frame_idx]]}, copy=True, allow_dups=True, fill_value=fill_value, ) return lambda df: df.reindex(joined_index, axis=axis, fill_value=fill_value) return joined_index, make_reindexer # Internal methods # These methods are for building the correct answer in a modular way. # Please be careful when changing these! def _build_treereduce_func(self, axis, func): """ Properly formats a TreeReduce result so that the partitioning is correct. Parameters ---------- axis : int The axis along which to apply the function. func : callable The function to apply. Returns ------- callable A function to be shipped to the partitions to be executed. Notes ----- This should be used for any TreeReduce style operation that results in a reduced data dimensionality (dataframe -> series). """ def _tree_reduce_func(df, *args, **kwargs): """Tree-reducer function itself executing `func`, presenting the resulting pandas.Series as pandas.DataFrame.""" series_result = func(df, *args, **kwargs) if axis == 0 and isinstance(series_result, pandas.Series): # In the case of axis=0, we need to keep the shape of the data # consistent with what we have done. In the case of a reduce, the # data for axis=0 should be a single value for each column. By # transposing the data after we convert to a DataFrame, we ensure that # the columns of the result line up with the columns from the data. # axis=1 does not have this requirement because the index already will # line up with the index of the data based on how pandas creates a # DataFrame from a Series. 
result = pandas.DataFrame(series_result).T result.index = [MODIN_UNNAMED_SERIES_LABEL] else: result = pandas.DataFrame(series_result) if isinstance(series_result, pandas.Series): result.columns = [MODIN_UNNAMED_SERIES_LABEL] return result return _tree_reduce_func def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None): """ Compute the metadata for the result of reduce function. Parameters ---------- axis : int The axis on which reduce function was applied. new_parts : NumPy 2D array Partitions with the result of applied function. dtypes : str, optional The data types for the result. This is an optimization because there are functions that always result in a particular data type, and this allows us to avoid (re)computing it. Returns ------- PandasDataframe Modin series (1xN frame) containing the reduced data. """ new_axes, new_axes_lengths = [0, 0], [0, 0] new_axes[axis] = [MODIN_UNNAMED_SERIES_LABEL] new_axes[axis ^ 1] = self.get_axis(axis ^ 1) new_axes_lengths[axis] = [1] new_axes_lengths[axis ^ 1] = self._get_axis_lengths(axis ^ 1) if dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None: dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_axes[1]), index=new_axes[1], ) result = self.__constructor__( new_parts, *new_axes, *new_axes_lengths, dtypes, pandas_backend=self._pandas_backend, ) return result @lazy_metadata_decorator(apply_axis="both") def reduce( self, axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, ) -> PandasDataframe: """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. Requires knowledge of the full axis for the reduction. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to perform the reduce over. function : callable(row|col) -> single value The reduce function to apply to each column. dtypes : str, optional The data types for the result. 
This is an optimization because there are functions that always result in a particular data type, and this allows us to avoid (re)computing it. Returns ------- PandasDataframe Modin series (1xN frame) containing the reduced data. Notes ----- The user-defined function must reduce to a single value. """ axis = Axis(axis) function = self._build_treereduce_func(axis.value, function) new_parts = self._partition_mgr_cls.map_axis_partitions( axis.value, self._partitions, function ) return self._compute_tree_reduce_metadata(axis.value, new_parts, dtypes=dtypes) @lazy_metadata_decorator(apply_axis="opposite", axis_arg=0) def tree_reduce( self, axis: Union[int, Axis], map_func: Callable, reduce_func: Optional[Callable] = None, dtypes: Optional[str] = None, ) -> PandasDataframe: """ Apply function that will reduce the data to a pandas Series. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to perform the tree reduce over. map_func : callable(row|col) -> row|col Callable function to map the dataframe. reduce_func : callable(row|col) -> single value, optional Callable function to reduce the dataframe. If none, then apply map_func twice. dtypes : str, optional The data types for the result. This is an optimization because there are functions that always result in a particular data type, and this allows us to avoid (re)computing it. Returns ------- PandasDataframe A new dataframe. 
""" axis = Axis(axis) map_func = self._build_treereduce_func(axis.value, map_func) if reduce_func is None: reduce_func = map_func else: reduce_func = self._build_treereduce_func(axis.value, reduce_func) map_parts = self._partition_mgr_cls.map_partitions(self._partitions, map_func) reduce_parts = self._partition_mgr_cls.map_axis_partitions( axis.value, map_parts, reduce_func ) return self._compute_tree_reduce_metadata( axis.value, reduce_parts, dtypes=dtypes ) @lazy_metadata_decorator(apply_axis=None) def map( self, func: Callable, dtypes: Optional[str] = None, new_columns: Optional[pandas.Index] = None, func_args=None, func_kwargs=None, lazy=False, ) -> PandasDataframe: """ Perform a function that maps across the entire dataset. Parameters ---------- func : callable(row|col|cell) -> row|col|cell The function to apply. dtypes : dtypes of the result, optional The data types for the result. This is an optimization because there are functions that always result in a particular data type, and this allows us to avoid (re)computing it. new_columns : pandas.Index, optional New column labels of the result, its length has to be identical to the older columns. If not specified, old column labels are preserved. func_args : iterable, optional Positional arguments for the 'func' callable. func_kwargs : dict, optional Keyword arguments for the 'func' callable. lazy : bool, default: False Whether to prefer lazy execution or not. Returns ------- PandasDataframe A new dataframe. 
""" map_fn = ( self._partition_mgr_cls.lazy_map_partitions if lazy else self._partition_mgr_cls.map_partitions ) new_partitions = map_fn(self._partitions, func, func_args, func_kwargs) if new_columns is not None and self.has_materialized_columns: assert len(new_columns) == len( self.columns ), "New column's length must be identical to the previous columns" elif new_columns is None: new_columns = self.copy_columns_cache(copy_lengths=True) if isinstance(dtypes, str) and dtypes == "copy": dtypes = self.copy_dtypes_cache() elif dtypes is not None and not isinstance(dtypes, pandas.Series): if isinstance(new_columns, ModinIndex): # Materializing lazy columns in order to build dtype's index new_columns = new_columns.get(return_lengths=False) dtypes = pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) return self.__constructor__( new_partitions, self.copy_index_cache(copy_lengths=True), new_columns, self._row_lengths_cache, self._column_widths_cache, dtypes=dtypes, pandas_backend=self._pandas_backend, ) def window( self, axis: Union[int, Axis], reduce_fn: Callable, window_size: int, result_schema: Optional[Dict[Hashable, type]] = None, ) -> PandasDataframe: """ Apply a sliding window operator that acts as a GROUPBY on each window, and reduces down to a single row (column) per window. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to slide over. reduce_fn : callable(rowgroup|colgroup) -> row|col The reduce function to apply over the data. window_size : int The number of row/columns to pass to the function. (The size of the sliding window). result_schema : dict, optional Mapping from column labels to data types that represents the types of the output dataframe. Returns ------- PandasDataframe A new PandasDataframe with the reduce function applied over windows of the specified axis. 
Notes ----- The user-defined reduce function must reduce each window’s column (row if axis=1) down to a single value. """ pass @lazy_metadata_decorator(apply_axis="both") def fold(self, axis, func, new_index=None, new_columns=None, shape_preserved=False): """ Perform a function across an entire axis. Parameters ---------- axis : int The axis to apply over. func : callable The function to apply. new_index : list-like, optional The index of the result. new_columns : list-like, optional The columns of the result. shape_preserved : bool, default: False Whether the shape of the dataframe is preserved or not after applying a function. Returns ------- PandasDataframe A new dataframe. """ new_row_lengths = None new_column_widths = None if shape_preserved: if new_index is None: new_index = self.copy_index_cache(copy_lengths=True) if new_columns is None: new_columns = self.copy_columns_cache(copy_lengths=True) new_row_lengths = self._row_lengths_cache new_column_widths = self._column_widths_cache new_partitions = self._partition_mgr_cls.map_axis_partitions( axis, self._partitions, func, keep_partitioning=True ) return self.__constructor__( new_partitions, new_index, new_columns, row_lengths=new_row_lengths, column_widths=new_column_widths, pandas_backend=self._pandas_backend, ) def infer_objects(self) -> PandasDataframe: """ Attempt to infer better dtypes for object columns. Attempts soft conversion of object-dtyped columns, leaving non-object and unconvertible columns unchanged. The inference rules are the same as during normal Series/DataFrame construction. Returns ------- PandasDataframe A new PandasDataframe with the inferred schema. """ obj_cols = [ col for col, dtype in enumerate(self.dtypes) if is_object_dtype(dtype) ] return self.infer_types(obj_cols) def infer_types(self, col_labels: List[str]) -> PandasDataframe: """ Determine the compatible type shared by all values in the specified columns, and coerce them to that type. 
Parameters ---------- col_labels : list List of column labels to infer and induce types over. Returns ------- PandasDataframe A new PandasDataframe with the inferred schema. """ # Compute dtypes on the specified columns, and then set those dtypes on a new frame new_cols = self.take_2d_labels_or_positional(col_labels=col_labels) new_cols_dtypes = new_cols.tree_reduce(0, pandas.DataFrame.infer_objects).dtypes new_dtypes = self.dtypes.copy() new_dtypes[col_labels] = new_cols_dtypes return self.__constructor__( self._partitions, self.copy_index_cache(copy_lengths=True), self.copy_columns_cache(copy_lengths=True), self._row_lengths_cache, self._column_widths_cache, new_dtypes, pandas_backend=self._pandas_backend, ) def join( self, axis: Union[int, Axis], condition: Callable, other: ModinDataframe, join_type: Union[str, JoinType], ) -> PandasDataframe: """ Join this dataframe with the other. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to perform the join on. condition : callable Function that determines which rows should be joined. The condition can be a simple equality, e.g. "left.col1 == right.col1" or can be arbitrarily complex. other : ModinDataframe The other data to join with, i.e. the right dataframe. join_type : string {"inner", "left", "right", "outer"} or modin.core.dataframe.base.utils.JoinType The type of join to perform. Returns ------- PandasDataframe A new PandasDataframe that is the result of applying the specified join over the two dataframes. Notes ----- During the join, this dataframe is considered the left, while the other is treated as the right. Only inner joins, left outer, right outer, and full outer joins are currently supported. Support for other join types (e.g. natural join) may be implemented in the future. 
""" pass def rename( self, new_row_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, new_col_labels: Optional[Union[Dict[Hashable, Hashable], Callable]] = None, ) -> PandasDataframe: """ Replace the row and column labels with the specified new labels. Parameters ---------- new_row_labels : dictionary or callable, optional Mapping or callable that relates old row labels to new labels. new_col_labels : dictionary or callable, optional Mapping or callable that relates old col labels to new labels. Returns ------- PandasDataframe A new PandasDataframe with the new row and column labels. """ result = self.copy() if new_row_labels is not None: if callable(new_row_labels): new_row_labels = result.index.map(new_row_labels) result.index = new_row_labels if new_col_labels is not None: if callable(new_col_labels): new_col_labels = result.columns.map(new_col_labels) result.columns = new_col_labels return result def combine_and_apply( self, func, new_index=None, new_columns=None, new_dtypes=None ): """ Combine all partitions into a single big one and apply the passed function to it. Use this method with care as it collects all the data on the same worker, it's only recommended to use this method on small or reduced datasets. Parameters ---------- func : callable(pandas.DataFrame) -> pandas.DataFrame A function to apply to the combined partition. new_index : sequence, optional Index of the result. new_columns : sequence, optional Columns of the result. new_dtypes : dict-like, optional Dtypes of the result. 
Returns ------- PandasDataframe """ if self._partitions.shape[1] > 1: new_partitions = self._partition_mgr_cls.row_partitions(self._partitions) new_partitions = np.array([[partition] for partition in new_partitions]) modin_frame = self.__constructor__( new_partitions, self.copy_index_cache(copy_lengths=True), self.copy_columns_cache(), self._row_lengths_cache, [len(self.columns)] if self.has_materialized_columns else None, self.copy_dtypes_cache(), pandas_backend=self._pandas_backend, ) else: modin_frame = self return modin_frame.apply_full_axis( axis=0, func=func, new_index=new_index, new_columns=new_columns, dtypes=new_dtypes, ) @lazy_metadata_decorator(apply_axis="both") def _apply_func_to_range_partitioning( self, key_columns, func, ascending=True, preserve_columns=False, data=None, data_key_columns=None, level=None, shuffle_func_cls=ShuffleSortFunctions, **kwargs, ): """ Reshuffle data so it would be range partitioned and then apply the passed function row-wise. Parameters ---------- key_columns : list of hashables Columns to build the range partitioning for. Can't be specified along with `level`. func : callable(pandas.DataFrame) -> pandas.DataFrame Function to apply against partitions. ascending : bool, default: True Whether the range should be built in ascending or descending order. preserve_columns : bool, default: False If the columns cache should be preserved (specify this flag if `func` doesn't change column labels). data : PandasDataframe, optional Dataframe to range-partition along with the `self` frame. If specified, the `func` will recieve a dataframe with an additional MultiIndex level in columns that separates `self` and `data`: ``df["grouper"] # self`` and ``df["data"] # data``. data_key_columns : list of hashables, optional Additional key columns from `data`. Will be combined with `key_columns`. level : list of ints or labels, optional Index level(s) to build the range partitioning for. Can't be specified along with `key_columns`. 
shuffle_func_cls : cls, default: ShuffleSortFunctions A class implementing ``modin.core.dataframe.pandas.utils.ShuffleFunctions`` to be used as a shuffle function. **kwargs : dict Additional arguments to forward to the range builder function. Returns ------- PandasDataframe A new dataframe. """ if data is not None: # adding an extra MultiIndex level in order to separate `self grouper` from the `data` # after concatenation new_grouper_cols = pandas.MultiIndex.from_tuples( [ ("grouper", *col) if isinstance(col, tuple) else ("grouper", col) for col in self.columns ] ) grouper = self.copy() grouper.columns = new_grouper_cols new_data_cols = pandas.MultiIndex.from_tuples( [ ("data", *col) if isinstance(col, tuple) else ("data", col) for col in data.columns ] ) data = data.copy() data.columns = new_data_cols grouper = grouper.concat(axis=1, others=[data], how="right", sort=False) # since original column names were modified, have to modify 'key_columns' as well key_columns = [ ("grouper", *col) if isinstance(col, tuple) else ("grouper", col) for col in key_columns ] if data_key_columns is None: data_key_columns = [] else: data_key_columns = [ ("data", *col) if isinstance(col, tuple) else ("data", col) for col in data_key_columns ] key_columns += data_key_columns else: grouper = self # If there's only one row partition can simply apply the function row-wise without the need to reshuffle if grouper._partitions.shape[0] == 1: result = grouper.apply_full_axis( axis=1, func=func, new_columns=grouper.copy_columns_cache() if preserve_columns else None, ) if preserve_columns: result._set_axis_lengths_cache(grouper._column_widths_cache, axis=1) return result # don't want to inherit over-partitioning so doing this 'min' check ideal_num_new_partitions = min(len(grouper._partitions), NPartitions.get()) m = len(grouper) / ideal_num_new_partitions sampling_probability = (1 / m) * np.log(ideal_num_new_partitions * len(grouper)) # If this df is overpartitioned, we try to sample each 
partition with probability # greater than 1, which leads to an error. In this case, we can do one of the following # two things. If there is only enough rows for one partition, and we have only 1 column # partition, we can just combine the overpartitioned df into one partition, and sort that # partition. If there is enough data for more than one partition, we can tell the sorting # algorithm how many partitions we want to end up with, so it samples and finds pivots # according to that. if sampling_probability >= 1: from modin.config import MinRowPartitionSize ideal_num_new_partitions = round(len(grouper) / MinRowPartitionSize.get()) if len(grouper) < MinRowPartitionSize.get() or ideal_num_new_partitions < 2: # If the data is too small, we shouldn't try reshuffling/repartitioning but rather # simply combine all partitions and apply the sorting to the whole dataframe return grouper.combine_and_apply(func=func) if ideal_num_new_partitions < len(grouper._partitions): if len(grouper._partitions) % ideal_num_new_partitions == 0: joining_partitions = np.split( grouper._partitions, ideal_num_new_partitions ) else: step = round(len(grouper._partitions) / ideal_num_new_partitions) joining_partitions = np.split( grouper._partitions, range(step, len(grouper._partitions), step), ) new_partitions = np.array( [ grouper._partition_mgr_cls.column_partitions( ptn_grp, full_axis=False ) for ptn_grp in joining_partitions ] ) else: new_partitions = grouper._partitions else: new_partitions = grouper._partitions shuffling_functions = shuffle_func_cls( grouper, key_columns, ascending[0] if is_list_like(ascending) else ascending, ideal_num_new_partitions, level=level, **kwargs, ) if key_columns: # here we want to get indices of those partitions that hold the key columns key_indices = grouper.columns.get_indexer_for(key_columns) partition_indices = np.unique( np.digitize(key_indices, np.cumsum(grouper.column_widths)) ) elif level is not None: # each partition contains an index, so taking 
the first one partition_indices = [0] else: raise ValueError("Must specify either 'level' or 'key_columns'") new_partitions = grouper._partition_mgr_cls.shuffle_partitions( new_partitions, partition_indices, shuffling_functions, func, ) result = grouper.__constructor__(new_partitions) if preserve_columns: result.set_columns_cache(grouper.copy_columns_cache()) # We perform the final steps of the sort on full axis partitions, so we know that the # length of each partition is the full length of the dataframe. if grouper.has_materialized_columns: result._set_axis_lengths_cache([len(grouper.columns)], axis=1) return result @lazy_metadata_decorator(apply_axis="both") def sort_by( self, axis: Union[int, Axis], columns: Union[str, List[str]], ascending: bool = True, **kwargs, ) -> PandasDataframe: """ Logically reorder rows (columns if axis=1) lexicographically by the data in a column or set of columns. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to perform the sort over. columns : string or list Column label(s) to use to determine lexicographical ordering. ascending : boolean, default: True Whether to sort in ascending or descending order. **kwargs : dict Keyword arguments to pass when sorting partitions. Returns ------- PandasDataframe A new PandasDataframe sorted into lexicographical order by the specified column(s). """ if not isinstance(columns, list): columns = [columns] def sort_function(df): # pragma: no cover # When we do a sort on the result of Series.value_counts, we don't rename the index until # after everything is done, which causes an error when sorting the partitions, since the # index and the column share the same name, when in actuality, the index's name should be # None. This fixes the indexes name beforehand in that case, so that the sort works. 
index_renaming = None if any(name in df.columns for name in df.index.names): index_renaming = df.index.names df.index = df.index.set_names([None] * len(df.index.names)) df = df.sort_values(by=columns, ascending=ascending, **kwargs) if index_renaming is not None: df.index = df.index.set_names(index_renaming) return df # If this df is empty, we don't want to try and shuffle or sort. if len(self.get_axis(1)) == 0 or len(self) == 0: return self.copy() axis = Axis(axis) if axis != Axis.ROW_WISE: raise NotImplementedError( f"Algebra sort only implemented row-wise. {axis.name} sort not implemented yet!" ) result = self._apply_func_to_range_partitioning( key_columns=[columns[0]], func=sort_function, ascending=ascending, preserve_columns=True, **kwargs, ) result.set_dtypes_cache(self.copy_dtypes_cache()) if kwargs.get("ignore_index", False): result.index = RangeIndex(len(self.get_axis(axis.value))) # Since the strategy to pick our pivots involves random sampling # we could end up picking poor pivots, leading to skew in our partitions. # We should add a fix to check if there is skew in the partitions and rebalance # them if necessary. Calling `rebalance_partitions` won't do this, since it only # resolves the case where there isn't the right amount of partitions - not where # there is skew across the lengths of partitions. return result @lazy_metadata_decorator(apply_axis="both") def filter(self, axis: Union[Axis, int], condition: Callable) -> PandasDataframe: """ Filter data based on the function provided along an entire axis. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to filter over. condition : callable(row|col) -> bool The function to use for the filter. This function should filter the data itself. Returns ------- PandasDataframe A new filtered dataframe. 
""" axis = Axis(axis) assert axis in ( Axis.ROW_WISE, Axis.COL_WISE, ), "Axis argument to filter operator must be 0 (rows) or 1 (columns)" new_partitions = self._partition_mgr_cls.map_axis_partitions( axis.value, self._partitions, condition, keep_partitioning=True ) new_axes, new_lengths = [0, 0], [0, 0] new_axes[axis.value] = self.copy_axis_cache(axis.value, copy_lengths=True) new_lengths[axis.value] = ( self._row_lengths_cache if axis.value == 0 else self._column_widths_cache ) new_axes[axis.value ^ 1], new_lengths[axis.value ^ 1] = None, None return self.__constructor__( new_partitions, *new_axes, *new_lengths, self.copy_dtypes_cache() if axis == Axis.COL_WISE else None, pandas_backend=self._pandas_backend, ) def filter_by_types(self, types: List[Hashable]) -> PandasDataframe: """ Allow the user to specify a type or set of types by which to filter the columns. Parameters ---------- types : list The types to filter columns by. Returns ------- PandasDataframe A new PandasDataframe from the filter provided. """ return self.take_2d_labels_or_positional( col_positions=[i for i, dtype in enumerate(self.dtypes) if dtype in types] ) @lazy_metadata_decorator(apply_axis="both") def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDataframe: """ Explode list-like entries along an entire axis. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis specifying how to explode. If axis=1, explode according to columns. func : callable The function to use to explode a single element. Returns ------- PandasFrame A new filtered dataframe. 
""" axis = Axis(axis) partitions = self._partition_mgr_cls.map_axis_partitions( axis.value, self._partitions, func, keep_partitioning=True ) if axis == Axis.COL_WISE: new_index, row_lengths = self._compute_axis_labels_and_lengths( 0, partitions ) new_columns, column_widths = self.columns, self._column_widths_cache else: new_index, row_lengths = self.index, self._row_lengths_cache new_columns, column_widths = self._compute_axis_labels_and_lengths( 1, partitions ) return self.__constructor__( partitions, new_index, new_columns, row_lengths, column_widths, pandas_backend=self._pandas_backend, ) def combine(self) -> PandasDataframe: """ Create a single partition PandasDataframe from the partitions of the current dataframe. Returns ------- PandasDataframe A single partition PandasDataframe. """ new_index = None new_columns = None if self._deferred_index: new_index = self.index if self._deferred_column: new_columns = self.columns partitions = self._partition_mgr_cls.combine( self._partitions, new_index, new_columns ) result = self.__constructor__( partitions, index=self.copy_index_cache(), columns=self.copy_columns_cache(), row_lengths=( [sum(self._row_lengths_cache)] if self._row_lengths_cache is not None else None ), column_widths=( [sum(self._column_widths_cache)] if self._column_widths_cache is not None else None ), dtypes=self.copy_dtypes_cache(), pandas_backend=self._pandas_backend, ) return result @lazy_metadata_decorator(apply_axis="both") def apply_full_axis( self, axis, func, new_index=None, new_columns=None, apply_indices=None, enumerate_partitions: bool = False, dtypes=None, keep_partitioning=True, num_splits=None, sync_labels=True, pass_axis_lengths_to_partitions=False, ) -> PandasDataframe: """ Perform a function across an entire axis. Parameters ---------- axis : {0, 1} The axis to apply over (0 - rows, 1 - columns). func : callable The function to apply. new_index : list-like, optional The index of the result. 
We may know this in advance, and if not provided it must be computed. new_columns : list-like, optional The columns of the result. We may know this in advance, and if not provided it must be computed. apply_indices : list-like, optional Indices of `axis ^ 1` to apply function over. enumerate_partitions : bool, default: False Whether pass partition index into applied `func` or not. Note that `func` must be able to obtain `partition_idx` kwarg. dtypes : list-like or scalar, optional The data types of the result. This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. keep_partitioning : boolean, default: True The flag to keep partition boundaries for Modin Frame if possible. Setting it to True disables shuffling data from one partition to another in case the resulting number of splits is equal to the initial number of splits. num_splits : int, optional The number of partitions to split the result into across the `axis`. If None, then the number of splits will be infered automatically. If `num_splits` is None and `keep_partitioning=True` then the number of splits is preserved. sync_labels : boolean, default: True Synchronize external indexes (`new_index`, `new_columns`) with internal indexes. This could be used when you're certain that the indices in partitions are equal to the provided hints in order to save time on syncing them. pass_axis_lengths_to_partitions : bool, default: False Whether pass partition lengths along `axis ^ 1` to the kernel `func`. Note that `func` must be able to obtain `df, *axis_lengths`. Returns ------- PandasDataframe A new dataframe. Notes ----- The data shape may change as a result of the function. 
@lazy_metadata_decorator(apply_axis="both")
def apply_full_axis_select_indices(
    self,
    axis,
    func,
    apply_indices=None,
    numeric_indices=None,
    new_index=None,
    new_columns=None,
    keep_remaining=False,
    new_dtypes: Optional[Union[pandas.Series, ModinDtypes]] = None,
):
    """
    Run a function over a full axis, restricted to a selection of the data.

    Parameters
    ----------
    axis : int
        The axis to apply over.
    func : callable
        The function to apply.
    apply_indices : list-like, optional
        Labels to apply over. When given, they take precedence over
        `numeric_indices`.
    numeric_indices : list-like, optional
        Positions to apply over. At least one of `apply_indices` and
        `numeric_indices` must be provided.
    new_index : list-like, optional
        Index of the result, if known in advance. When omitted, the current
        index is reused for ``axis == 1`` and left to be computed otherwise.
    new_columns : list-like, optional
        Columns of the result, if known in advance. When omitted, the current
        columns are reused for ``axis == 0`` and left to be computed otherwise.
    keep_remaining : boolean, default: False
        Whether or not to drop the data that is not computed over.
    new_dtypes : ModinDtypes or pandas.Series, optional
        Dtypes of the result, if known in advance, to avoid (re)computing them.

    Returns
    -------
    PandasDataframe
        A new dataframe.
    """
    assert not (apply_indices is None and numeric_indices is None)
    # The selection is expressed along the axis opposite to the one being
    # applied over, so labels are resolved against that opposite axis.
    selection_labels = self.index if axis else self.columns
    if apply_indices is not None:
        numeric_indices = selection_labels.get_indexer_for(apply_indices)
    block_positions = self._get_dict_of_block_index(axis ^ 1, numeric_indices)
    apply_along_full_axis = (
        self._partition_mgr_cls.apply_func_to_select_indices_along_full_axis
    )
    result_partitions = apply_along_full_axis(
        axis,
        self._partitions,
        func,
        block_positions,
        keep_remaining=keep_remaining,
    )
    # TODO Infer columns and index from `keep_remaining` and `apply_indices`
    if new_index is None and axis == 1:
        new_index = self.index
    if new_columns is None and axis == 0:
        new_columns = self.columns
    return self.__constructor__(
        result_partitions,
        new_index,
        new_columns,
        None,
        None,
        dtypes=new_dtypes,
        pandas_backend=self._pandas_backend,
    )
keep_remaining : boolean, default: False Whether or not to drop the data that is not computed over. item_to_distribute : np.ndarray or scalar, default: no_default The item to split up so it can be applied over both axes. Returns ------- PandasDataframe A new dataframe. """ # TODO Infer columns and index from `keep_remaining` and `apply_indices` if new_index is None: new_index = self.index if axis == 1 else None if new_columns is None: new_columns = self.columns if axis == 0 else None if new_columns is not None and isinstance(new_dtypes, pandas.Series): assert new_dtypes.index.equals( new_columns ), f"{new_dtypes=} doesn't have the same columns as in {new_columns=}" if axis is not None: assert apply_indices is not None # Convert indices to numeric indices old_index = self.index if axis else self.columns numeric_indices = old_index.get_indexer_for(apply_indices) # Get indices being applied to (opposite of indices being applied over) dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) new_partitions = self._partition_mgr_cls.apply_func_to_select_indices( axis, self._partitions, func, dict_indices, keep_remaining=keep_remaining, ) # Length objects for new object creation. This is shorter than if..else # This object determines the lengths and widths based on the given # parameters and builds a dictionary used in the constructor below. 0 gives # the row lengths and 1 gives the column widths. Since the dimension of # `axis` given may have changed, we currently just recompute it. 
@lazy_metadata_decorator(apply_axis="both")
def broadcast_apply(
    self,
    axis,
    func,
    other,
    join_type="left",
    copartition=True,
    labels="keep",
    dtypes=None,
):
    """
    Apply a function to partitions of `self` with axis partitions of `other` broadcast to them.

    Parameters
    ----------
    axis : {0, 1}
        Axis to broadcast over.
    func : callable
        Function to apply.
    other : PandasDataframe
        Modin DataFrame to broadcast.
    join_type : str, default: "left"
        Type of join used when copartitioning.
    copartition : bool, default: True
        Whether to align indices/partitioning of `self` and `other` first.
        When disabled, the partitions of both frames are used as they are, so
        the caller must guarantee that labels and partitioning along the
        broadcasting axis are already identical.
    labels : {"keep", "replace", "drop"}, default: "keep"
        What to do with the labels along `axis`: keep the ones of `self`,
        replace them with the joined labels, or drop them so that they are
        computed lazily later. Labels of the other axis are always kept.
    dtypes : "copy", pandas.Series or None, optional
        Dtypes of the result. "copy" reuses the dtypes cache of `self`,
        None leaves them to be computed on demand.

    Returns
    -------
    PandasDataframe
        New Modin DataFrame.
    """
    if not copartition:
        left_parts, right_parts = self._partitions, other._partitions
        partition_sizes_along_axis = self._get_axis_lengths_cache(axis)
        joined_index = self.copy_axis_cache(axis)
    else:
        aligned = self._copartition(axis, other, join_type)
        left_parts, right_groups, joined_index, partition_sizes_along_axis = aligned
        # `_copartition` returns a list of right-hand partition arrays;
        # a single frame was passed, so take its only entry.
        right_parts = right_groups[0]

    result_partitions = self._partition_mgr_cls.broadcast_apply(
        axis, func, left_parts, right_parts
    )
    if isinstance(dtypes, str) and dtypes == "copy":
        dtypes = self.copy_dtypes_cache()

    def _labels_and_sizes(copy_cache, sizes_cache, along_axis):
        # Shape caches are passed around instead of computed values so that no
        # shape computation is triggered here.
        if not along_axis or labels == "keep":
            return copy_cache(), sizes_cache
        if labels == "replace":
            return joined_index, partition_sizes_along_axis
        assert labels == "drop", f"Unexpected `labels`: {labels}"
        return None, None

    new_index, new_row_lengths = _labels_and_sizes(
        self.copy_index_cache, self._row_lengths_cache, axis == 0
    )
    new_columns, new_column_widths = _labels_and_sizes(
        self.copy_columns_cache, self._column_widths_cache, axis != 0
    )
    return self.__constructor__(
        result_partitions,
        new_index,
        new_columns,
        new_row_lengths,
        new_column_widths,
        dtypes=dtypes,
        pandas_backend=self._pandas_backend,
    )
@lazy_metadata_decorator(apply_axis="both")
def broadcast_apply_select_indices(
    self,
    axis,
    func,
    other: PandasDataframe,
    apply_indices=None,
    numeric_indices=None,
    keep_remaining=False,
    broadcast_all=True,
    new_index=None,
    new_columns=None,
) -> PandasDataframe:
    """
    Apply a function to select indices at specified axis and broadcast partitions of `other` Modin DataFrame.

    Parameters
    ----------
    axis : {0, 1}
        Axis to apply function along.
    func : callable
        Function to apply.
    other : PandasDataframe
        Partitions of which should be broadcasted. If None, the call is
        forwarded to ``apply_select_indices``.
    apply_indices : list, optional
        List of labels along ``axis ^ 1`` to apply over
        (if `numeric_indices` are not specified).
    numeric_indices : list, optional
        Positions along ``axis ^ 1`` to apply over
        (if `apply_indices` are not specified).
    keep_remaining : bool, default: False
        Whether drop the data that is not computed over or not.
    broadcast_all : bool, default: True
        Whether broadcast the whole axis of right frame to every
        partition or just a subset of it.
    new_index : pandas.Index, optional
        Index of the result. We may know this in advance,
        and if not provided it must be computed.
    new_columns : pandas.Index, optional
        Columns of the result. We may know this in advance,
        and if not provided it must be computed.

    Returns
    -------
    PandasDataframe
        New Modin DataFrame.
    """
    assert (
        apply_indices is not None or numeric_indices is not None
    ), "Indices to apply must be specified!"

    if other is None:
        if apply_indices is None:
            # `apply_select_indices` resolves `apply_indices` against the
            # labels of the axis opposite to `axis` (columns for axis=0,
            # index for axis=1), so the positions must be translated to
            # labels along that same opposite axis.
            apply_indices = self.get_axis(axis ^ 1)[numeric_indices]
        return self.apply_select_indices(
            axis=axis,
            func=func,
            apply_indices=apply_indices,
            keep_remaining=keep_remaining,
            new_index=new_index,
            new_columns=new_columns,
        )

    if numeric_indices is None:
        old_index = self.index if axis else self.columns
        numeric_indices = old_index.get_indexer_for(apply_indices)

    # Map the selected positions onto partitions of the opposite axis, then
    # work out which partitions of `other` go to each of them.
    dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices)
    broadcasted_dict = other._prepare_frame_to_broadcast(
        axis, dict_indices, broadcast_all=broadcast_all
    )
    new_partitions = self._partition_mgr_cls.broadcast_apply_select_indices(
        axis,
        func,
        self._partitions,
        other._partitions,
        dict_indices,
        broadcasted_dict,
        keep_remaining,
    )
    return self.__constructor__(
        new_partitions,
        index=new_index,
        columns=new_columns,
        pandas_backend=self._pandas_backend,
    )
This is an optimization because there are functions that always result in a particular data type, and allows us to avoid (re)computing it. keep_partitioning : boolean, default: True The flag to keep partition boundaries for Modin Frame if possible. Setting it to True disables shuffling data from one partition to another in case the resulting number of splits is equal to the initial number of splits. num_splits : int, optional The number of partitions to split the result into across the `axis`. If None, then the number of splits will be infered automatically. If `num_splits` is None and `keep_partitioning=True` then the number of splits is preserved. sync_labels : boolean, default: True Synchronize external indexes (`new_index`, `new_columns`) with internal indexes. This could be used when you're certain that the indices in partitions are equal to the provided hints in order to save time on syncing them. pass_axis_lengths_to_partitions : bool, default: False Whether pass partition lengths along `axis ^ 1` to the kernel `func`. Note that `func` must be able to obtain `df, *axis_lengths`. Returns ------- PandasDataframe New Modin DataFrame. 
""" if other is not None: if not isinstance(other, list): other = [other] other = [o._extract_partitions() for o in other] if len(other) else None if apply_indices is not None: numeric_indices = self.get_axis(axis ^ 1).get_indexer_for(apply_indices) apply_indices = self._get_dict_of_block_index( axis ^ 1, numeric_indices ).keys() apply_func_args = None if pass_axis_lengths_to_partitions: if axis == 0: apply_func_args = ( self._column_widths_cache if self._column_widths_cache is not None else [part.width(materialize=False) for part in self._partitions[0]] ) else: apply_func_args = ( self._row_lengths_cache if self._row_lengths_cache is not None else [ part.length(materialize=False) for part in self._partitions.T[0] ] ) new_partitions = self._partition_mgr_cls.broadcast_axis_partitions( axis=axis, left=self._partitions, right=other, apply_func=self._build_treereduce_func(axis, func), apply_indices=apply_indices, enumerate_partitions=enumerate_partitions, keep_partitioning=keep_partitioning, num_splits=num_splits, apply_func_args=apply_func_args, ) kw = {"row_lengths": None, "column_widths": None} if isinstance(dtypes, str) and dtypes == "copy": kw["dtypes"] = self.copy_dtypes_cache() elif isinstance(dtypes, DtypesDescriptor): kw["dtypes"] = ModinDtypes(dtypes) elif dtypes is not None: if isinstance(dtypes, (pandas.Series, ModinDtypes)): kw["dtypes"] = dtypes.copy() else: if new_columns is None: assert not is_list_like(dtypes) dtype = pandas.api.types.pandas_dtype(dtypes) kw["dtypes"] = ModinDtypes(DtypesDescriptor(remaining_dtype=dtype)) else: kw["dtypes"] = ( pandas.Series(dtypes, index=new_columns) if is_list_like(dtypes) else pandas.Series( [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns), index=new_columns, ) ) is_index_materialized = ModinIndex.is_materialized_index(new_index) is_columns_materialized = ModinIndex.is_materialized_index(new_columns) if axis == 0: if ( is_columns_materialized and len(new_partitions.shape) > 1 and new_partitions.shape[1] 
== 1 ): kw["column_widths"] = [len(new_columns)] elif axis == 1: if is_index_materialized and new_partitions.shape[0] == 1: kw["row_lengths"] = [len(new_index)] if not keep_partitioning: if kw["row_lengths"] is None and is_index_materialized: if axis == 0: kw["row_lengths"] = get_length_list( axis_len=len(new_index), num_splits=new_partitions.shape[0], min_block_size=MinRowPartitionSize.get(), ) elif axis == 1: if self._row_lengths_cache is not None and len(new_index) == sum( self._row_lengths_cache ): kw["row_lengths"] = self._row_lengths_cache if kw["column_widths"] is None and is_columns_materialized: if axis == 1: kw["column_widths"] = get_length_list( axis_len=len(new_columns), num_splits=new_partitions.shape[1], min_block_size=MinColumnPartitionSize.get(), ) elif axis == 0: if self._column_widths_cache is not None and len( new_columns ) == sum(self._column_widths_cache): kw["column_widths"] = self._column_widths_cache else: if axis == 0: if ( kw["row_lengths"] is None and self._row_lengths_cache is not None and is_index_materialized and len(new_index) == sum(self._row_lengths_cache) # to avoid problems that may arise when filtering empty dataframes and all(r != 0 for r in self._row_lengths_cache) ): kw["row_lengths"] = self._row_lengths_cache elif axis == 1: if ( kw["column_widths"] is None and self._column_widths_cache is not None and is_columns_materialized and len(new_columns) == sum(self._column_widths_cache) # to avoid problems that may arise when filtering empty dataframes and all(w != 0 for w in self._column_widths_cache) ): kw["column_widths"] = self._column_widths_cache result = self.__constructor__( new_partitions, index=new_index, columns=new_columns, **kw, pandas_backend=self._pandas_backend, ) if sync_labels and new_index is not None: result.synchronize_labels(axis=0) if sync_labels and new_columns is not None: result.synchronize_labels(axis=1) return result def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> bool: """ 
Check whether indices/partitioning along the specified `axis` are identical when compared with `other`. Parameters ---------- other : PandasDataframe Dataframe to compare indices/partitioning with. axis : int, default: 0 Returns ------- bool """ if self.has_axis_cache(axis) and other.has_axis_cache(axis): self_cache, other_cache = self._get_axis_cache(axis), other._get_axis_cache( axis ) equal_indices = self_cache.equals(other_cache) if equal_indices: equal_lengths = self_cache.compare_partition_lengths_if_possible( other_cache ) if isinstance(equal_lengths, bool): return equal_lengths return self._get_axis_lengths(axis) == other._get_axis_lengths(axis) return False return self.get_axis(axis).equals( other.get_axis(axis) ) and self._get_axis_lengths(axis) == other._get_axis_lengths(axis) def _copartition( self, axis, other, how, sort=None, force_repartition=False, fill_value=None ): """ Copartition two Modin DataFrames. Perform aligning of partitions, index and partition blocks. Parameters ---------- axis : {0, 1} Axis to copartition along (0 - rows, 1 - columns). other : PandasDataframe Other Modin DataFrame(s) to copartition against. how : str How to manage joining the index object ("left", "right", etc.). sort : bool, default: None Whether sort the joined index or not. If ``None``, sort is defined in depend on labels equality along the axis. force_repartition : bool, default: False Whether force the repartitioning or not. By default, this method will skip repartitioning if it is possible. This is because reindexing is extremely inefficient. Because this method is used to `join` or `append`, it is vital that the internal indices match. fill_value : any, optional Value to use for missing values. 
Returns ------- tuple Tuple containing: 1) 2-d NumPy array of aligned left partitions 2) list of 2-d NumPy arrays of aligned right partitions 3) joined index along ``axis``, may be ``ModinIndex`` if not materialized 4) If materialized, list with sizes of partitions along axis that partitioning was done on, otherwise ``None``. This list will be empty if and only if all the frames are empty. """ if isinstance(other, type(self)): other = [other] if not force_repartition and all( o._check_if_axes_identical(self, axis) for o in other ): return ( self._partitions, [o._partitions for o in other], self.copy_axis_cache(axis, copy_lengths=True), self._get_axis_lengths_cache(axis), ) if sort is None: sort = not all(self.get_axis(axis).equals(o.get_axis(axis)) for o in other) self_index = self.get_axis(axis) others_index = [o.get_axis(axis) for o in other] joined_index, make_reindexer = self._join_index_objects( axis, [self_index] + others_index, how, sort, fill_value ) frames = [self] + other non_empty_frames_idx = [ i for i, o in enumerate(frames) if o._partitions.size != 0 ] # If all frames are empty if len(non_empty_frames_idx) == 0: return ( self._partitions, [o._partitions for o in other], joined_index, # There are no partition sizes because the resulting dataframe # has no partitions. [], ) base_frame_idx = non_empty_frames_idx[0] other_frames = frames[base_frame_idx + 1 :] # Picking first non-empty frame base_frame = frames[non_empty_frames_idx[0]] base_index = base_frame.get_axis(axis) # define conditions for reindexing and repartitioning `self` frame do_reindex_base = not base_index.equals(joined_index) do_repartition_base = force_repartition or do_reindex_base # Perform repartitioning and reindexing for `base_frame` if needed. # Also define length of base and frames. We will need to know the # lengths for alignment. 
@lazy_metadata_decorator(apply_axis="both")
def n_ary_op(
    self,
    op,
    right_frames: list[PandasDataframe],
    join_type="outer",
    sort=None,
    copartition_along_columns=True,
    labels="replace",
    dtypes: Optional[pandas.Series] = None,
) -> PandasDataframe:
    """
    Perform an n-ary operation by joining with other Modin DataFrame(s).

    Parameters
    ----------
    op : callable
        Function to apply after the join.
    right_frames : list of PandasDataframe
        Modin DataFrames to join with.
    join_type : str, default: "outer"
        Type of join to apply.
    sort : bool, default: None
        Whether to sort index and columns or not.
    copartition_along_columns : bool, default: True
        Whether to copartition along columns as well. The frames are always
        copartitioned along rows; for some ops the column pass isn't needed
        (e.g., `fillna`).
    labels : {"replace", "drop"}, default: "replace"
        Whether to use the labels of the joined frames or drop them (together
        with partition sizes) so that they are computed lazily later.
    dtypes : pandas.Series, optional
        Dtypes of the resulting dataframe, passed when they were precomputed.

    Returns
    -------
    PandasDataframe
        New Modin DataFrame.
    """
    left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition(
        0, right_frames, join_type, sort=sort
    )

    if not copartition_along_columns:
        joined_columns = self.copy_columns_cache(copy_lengths=True)
        column_widths = self._column_widths_cache
    else:

        def _row_aligned(parts, source):
            # Wrap row-aligned partitions into a frame carrying the column
            # metadata of the frame they originate from.
            return self.__constructor__(
                parts,
                joined_index,
                source.copy_columns_cache(copy_lengths=True),
                row_lengths,
                source._column_widths_cache,
                pandas_backend=self._pandas_backend,
            )

        aligned_left = _row_aligned(left_parts, self)
        aligned_rights = [
            _row_aligned(parts, frame)
            for parts, frame in zip(list_of_right_parts, right_frames)
        ]
        (
            left_parts,
            list_of_right_parts,
            joined_columns,
            column_widths,
        ) = aligned_left._copartition(1, aligned_rights, join_type, sort=sort)

    # Skip the operation when any operand ended up without partitions.
    if len(left_parts) == 0 or any(
        len(parts) == 0 for parts in list_of_right_parts
    ):
        result_partitions = np.array([])
    else:
        result_partitions = self._partition_mgr_cls.n_ary_operation(
            left_parts, op, list_of_right_parts
        )

    if labels == "drop":
        joined_index = None
        joined_columns = None
        row_lengths = None
        column_widths = None

    return self.__constructor__(
        result_partitions,
        joined_index,
        joined_columns,
        row_lengths,
        column_widths,
        dtypes,
        pandas_backend=self._pandas_backend,
    )
left_parts, right_parts ) if new_lengths is None: new_lengths = new_lengths2 new_dtypes = None new_index = None new_columns = None if axis == Axis.ROW_WISE: if all(obj.has_materialized_index for obj in (self, *others)): new_index = self.index.append([other.index for other in others]) new_columns = joined_index frames = [self] + others # TODO: should we wrap all `concat` call into "try except" block? # `ModinDtypes.concat` can throw exception in case of duplicate values new_dtypes = ModinDtypes.concat([frame._dtypes for frame in frames], axis=1) # If we have already cached the length of each row in at least one # of the row's partitions, we can build new_lengths for the new # frame. Typically, if we know the length for any partition in a # row, we know the length for the first partition in the row. So # just check the lengths of the first column of partitions. if not new_lengths: new_lengths = [] if new_partitions.size > 0: if all( part._length_cache is not None for part in new_partitions.T[0] ): new_lengths = self._get_lengths(new_partitions.T[0], axis) else: new_lengths = None else: if all(obj.has_materialized_columns for obj in (self, *others)): new_columns = self.columns.append([other.columns for other in others]) new_index = joined_index try: new_dtypes = ModinDtypes.concat( [self.copy_dtypes_cache()] + [o.copy_dtypes_cache() for o in others] ) except NotImplementedError: new_dtypes = None # If we have already cached the width of each column in at least one # of the column's partitions, we can build new_widths for the new # frame. Typically, if we know the width for any partition in a # column, we know the width for the first partition in the column. # So just check the widths of the first row of partitions. 
if not new_widths: new_widths = [] if new_partitions.size > 0: if all(part._width_cache is not None for part in new_partitions[0]): new_widths = self._get_lengths(new_partitions[0], axis) else: new_widths = None return self.__constructor__( new_partitions, new_index, new_columns, new_lengths, new_widths, new_dtypes, pandas_backend=self._pandas_backend, ) def _apply_func_to_range_partitioning_broadcast( self, right, func, key, new_index=None, new_columns=None, new_dtypes: Optional[Union[ModinDtypes, pandas.Series]] = None, ): """ Apply `func` against two dataframes using range-partitioning implementation. The method first builds range-partitioning for both dataframes using the data from `self[key]`, after that, it applies `func` row-wise to `self` frame and broadcasts row-parts of `right` to `self`. Parameters ---------- right : PandasDataframe func : callable(left : pandas.DataFrame, right : pandas.DataFrame) -> pandas.DataFrame key : list of labels Columns to use to build range-partitioning. Must present in both dataframes. new_index : pandas.Index, optional Index values to write to the result's cache. new_columns : pandas.Index, optional Column values to write to the result's cache. new_dtypes : pandas.Series or ModinDtypes, optional Dtype values to write to the result's cache. 
Returns ------- PandasDataframe """ if self._partitions.shape[0] == 1: result = self.broadcast_apply_full_axis( axis=1, func=func, new_columns=new_columns, dtypes=new_dtypes, other=right, ) return result if not isinstance(key, list): key = [key] shuffling_functions = ShuffleSortFunctions( self, key, ascending=True, ideal_num_new_partitions=self._partitions.shape[0], ) # here we want to get indices of those partitions that hold the key columns key_indices = self.columns.get_indexer_for(key) partition_indices = np.unique( np.digitize(key_indices, np.cumsum(self.column_widths)) ) new_partitions = self._partition_mgr_cls.shuffle_partitions( self._partitions, partition_indices, shuffling_functions, func, right_partitions=right._partitions, ) return self.__constructor__( new_partitions, index=new_index, columns=new_columns, dtypes=new_dtypes, pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") def groupby( self, axis: Union[int, Axis], internal_by: List[str], external_by: List[PandasDataframe], by_positions: List[int], operator: Callable, result_schema: Optional[Dict[Hashable, type]] = None, align_result_columns: bool = False, series_groupby: bool = False, add_missing_cats: bool = False, **kwargs: dict, ) -> PandasDataframe: """ Generate groups based on values in the input column(s) and perform the specified operation on each. Parameters ---------- axis : int or modin.core.dataframe.base.utils.Axis The axis to apply the grouping over. internal_by : list of strings One or more column labels from the `self` dataframe to use for grouping. external_by : list of PandasDataframes PandasDataframes to group by (may be specified along with or without `internal_by`). by_positions : list of ints Specifies the order of grouping by `internal_by` and `external_by` columns. Each element in `by_positions` specifies an index from either `external_by` or `internal_by`. Indices for `external_by` are positive and start from 0. 
Indices for `internal_by` are negative and start from -1 (so in order to convert them to a valid indices one should do ``-idx - 1``). ''' by_positions = [0, -1, 1, -2, 2, 3] internal_by = ["col1", "col2"] external_by = [sr1, sr2, sr3, sr4] df.groupby([sr1, "col1", sr2, "col2", sr3, sr4]) '''. operator : callable(pandas.core.groupby.DataFrameGroupBy) -> pandas.DataFrame The operation to carry out on each of the groups. The operator is another algebraic operator with its own user-defined function parameter, depending on the output desired by the user. result_schema : dict, optional Mapping from column labels to data types that represents the types of the output dataframe. align_result_columns : bool, default: False Whether to manually align columns between all the resulted row partitions. This flag is helpful when dealing with UDFs as they can change the partition's shape and labeling unpredictably, resulting in an invalid dataframe. series_groupby : bool, default: False Whether to convert a one-column DataFrame to a Series before performing groupby. add_missing_cats : bool, default: False Whether to add missing categories from `by` columns to the result. **kwargs : dict Additional arguments to pass to the ``df.groupby`` method (besides the 'by' argument). Returns ------- PandasDataframe A new PandasDataframe containing the groupings specified, with the operator applied to each group. Notes ----- No communication between groups is allowed in this algebra implementation. The number of rows (columns if axis=1) returned by the user-defined function passed to the groupby may be at most the number of rows in the group, and may be as small as a single row. Unlike the pandas API, an intermediate "GROUP BY" object is not present in this algebra implementation. """ axis = Axis(axis) if axis != Axis.ROW_WISE: raise NotImplementedError( f"Algebra groupby only implemented row-wise. {axis.name} axis groupby not implemented yet!" 
) has_external_grouper = len(external_by) > 0 skip_on_aligning_flag = "__skip_me_on_aligning__" duplicated_suffix = "__duplicated_suffix__" duplicated_pattern = r"_[\d]*__duplicated_suffix__" kwargs["observed"] = True level = kwargs.get("level") if level is not None and not isinstance(level, list): level = [level] def apply_func(df): # pragma: no cover if has_external_grouper: external_grouper = df["grouper"] external_grouper = [ # `df.groupby()` can only take a list of Series'es, so splitting # the df into a list of individual Series'es external_grouper.iloc[:, i] for i in range(len(external_grouper.columns)) ] # renaming 'None' and duplicated names back to their original names for obj in external_grouper: if not isinstance(obj, pandas.Series): continue name = obj.name if isinstance(name, str): if name.startswith(MODIN_UNNAMED_SERIES_LABEL): name = None elif name.endswith(duplicated_suffix): name = re.sub(duplicated_pattern, "", name) elif isinstance(name, tuple): if name[-1].endswith(duplicated_suffix): name = ( *name[:-1], re.sub(duplicated_pattern, "", name[-1]), ) obj.name = name df = df["data"] else: external_grouper = [] by = [] # restoring original order of 'by' columns for idx in by_positions: if idx >= 0: by.append(external_grouper[idx]) else: by.append(internal_by[-idx - 1]) if series_groupby: df = df.squeeze(axis=1) if kwargs.get("level") is not None: assert len(by) == 0 # passing an empty list triggers an error by = None result = operator(df.groupby(by, **kwargs)) if align_result_columns and df.empty and result.empty: # We want to align columns only of those frames that actually performed # some groupby aggregation, if an empty frame was originally passed # (an empty bin on reshuffling was created) then there were no groupby # executed over this partition and so it has incorrect columns # that shouldn't be considered on the aligning phase result.attrs[skip_on_aligning_flag] = True return result if has_external_grouper: grouper = ( external_by[0] if 
len(external_by) == 1 else external_by[0].concat( axis=1, others=external_by[1:], how="left", sort=False ) ) new_grouper_cols = [] columns_were_changed = False same_columns = {} # duplicated names break range-partitioning mechanism, so renaming them. # original names will be reverted in the actual groupby kernel for col in grouper.columns: suffix = same_columns.get(col) if suffix is None: same_columns[col] = 0 else: same_columns[col] += 1 col = ( (*col[:-1], f"{col[-1]}_{suffix}{duplicated_suffix}") if isinstance(col, tuple) else f"{col}_{suffix}{duplicated_suffix}" ) columns_were_changed = True new_grouper_cols.append(col) if columns_were_changed: grouper.columns = pandas.Index(new_grouper_cols) grouper_key_columns = grouper.columns data = self data_key_columns = internal_by else: grouper = self grouper_key_columns = internal_by data, data_key_columns = None, None result = grouper._apply_func_to_range_partitioning( key_columns=grouper_key_columns, func=apply_func, data=data, data_key_columns=data_key_columns, level=level, ) # no need aligning columns if there's only one row partition if add_missing_cats or align_result_columns and result._partitions.shape[0] > 1: # FIXME: the current reshuffling implementation guarantees us that there's only one column # partition in the result, so we should never hit this exception for now, however # in the future, we might want to make this implementation more broader if result._partitions.shape[1] > 1: raise NotImplementedError( "Aligning columns is not yet implemented for multiple column partitions." ) # There're two implementations: # 1. The first one work faster, but may stress the network a lot in cluster mode since # it gathers all the dataframes in a single ray-kernel. # 2. The second one works slower, but only gathers light pandas.Index objects, # so there should be less stress on the network. 
if add_missing_cats or not IsRayCluster.get(): if self.has_materialized_dtypes: original_dtypes = pandas.Series( { # lazy proxies hold a reference to another modin's DataFrame which can be # a problem during serialization, in this scenario we don't need actual # categorical values, so a "category" string will be enough name: ( "category" if isinstance(dtype, LazyProxyCategoricalDtype) else dtype ) for name, dtype in self.dtypes.items() } ) else: original_dtypes = None def compute_aligned_columns(*dfs, initial_columns=None, by=None): """Take row partitions, filter empty ones, and return joined columns for them.""" if align_result_columns: valid_dfs = [ df for df in dfs if not df.attrs.get(skip_on_aligning_flag, False) ] if len(valid_dfs) == 0 and len(dfs) != 0: valid_dfs = dfs # Using '.concat()' on empty-slices instead of 'Index.join()' # in order to get identical behavior to pandas when it joins # results of different groups combined_cols = pandas.concat( [df.iloc[:0] for df in valid_dfs], axis=0, join="outer" ).columns else: combined_cols = dfs[0].columns masks = None if add_missing_cats: masks, combined_cols = add_missing_categories_to_groupby( dfs, by, operator, initial_columns, combined_cols, is_udf_agg=align_result_columns, kwargs=kwargs.copy(), initial_dtypes=original_dtypes, ) return ( (combined_cols, masks) if align_result_columns else (None, masks) ) external_by_cols = [ None if col.startswith(MODIN_UNNAMED_SERIES_LABEL) else col for obj in external_by for col in obj.columns ] by = [] # restoring original order of 'by' columns for idx in by_positions: if idx >= 0: by.append(external_by_cols[idx]) else: by.append(internal_by[-idx - 1]) # Passing all partitions to the 'compute_aligned_columns' kernel to get # aligned columns parts = result._partitions.flatten() aligned_columns = parts[0].apply( compute_aligned_columns, *[part._data for part in parts[1:]], initial_columns=pandas.Index(external_by_cols).append(self.columns), by=by, ) def apply_aligned(df, 
args, partition_idx): combined_cols, mask = args if mask is not None and mask.get(partition_idx) is not None: values = mask[partition_idx] original_names = df.index.names # TODO: inserting 'values' based on 'searchsorted' result might be more efficient # in cases of small amount of 'values' df = pandas.concat([df, values]) if kwargs["sort"]: df = df.sort_index(axis=0) df.index.names = original_names if combined_cols is not None: df = df.reindex(columns=combined_cols) return df # Lazily applying aligned columns to partitions new_partitions = self._partition_mgr_cls.lazy_map_partitions( result._partitions, apply_aligned, func_args=(aligned_columns._data,), enumerate_partitions=True, ) else: def join_cols(df, *cols): """Join `cols` and apply the joined columns to `df`.""" valid_cols = [ pandas.DataFrame(columns=col) for col in cols if col is not None ] if len(valid_cols) == 0: return df # Using '.concat()' on empty-slices instead of 'Index.join()' # in order to get identical behavior to pandas when it joins # results of different groups result_col = pandas.concat(valid_cols, axis=0, join="outer").columns return df.reindex(columns=result_col) # Getting futures for columns of non-empty partitions cols = [ part.apply( lambda df: ( None if df.attrs.get(skip_on_aligning_flag, False) else df.columns ) )._data for part in result._partitions.flatten() ] # Lazily joining and applying the aligned columns new_partitions = self._partition_mgr_cls.lazy_map_partitions( result._partitions, join_cols, func_args=cols, ) result = self.__constructor__( new_partitions, index=result.copy_index_cache(), row_lengths=result._row_lengths_cache, pandas_backend=self._pandas_backend, ) if ( not result.has_materialized_index and not has_external_grouper and level is None ): by_dtypes = ModinDtypes(self._dtypes).lazy_get(internal_by) if by_dtypes.is_materialized: new_index = ModinIndex(value=result, axis=0, dtypes=by_dtypes) result.set_index_cache(new_index) if result_schema is not None: 
new_dtypes = pandas.Series(result_schema) result.set_dtypes_cache(new_dtypes) result.set_columns_cache(new_dtypes.index) return result @lazy_metadata_decorator(apply_axis="both") def groupby_reduce( self, axis, by, map_func, reduce_func, new_index=None, new_columns=None, apply_indices=None, ): """ Groupby another Modin DataFrame dataframe and aggregate the result. Parameters ---------- axis : {0, 1} Axis to groupby and aggregate over. by : PandasDataframe or None A Modin DataFrame to group by. map_func : callable Map component of the aggregation. reduce_func : callable Reduce component of the aggregation. new_index : pandas.Index, optional Index of the result. We may know this in advance, and if not provided it must be computed. new_columns : pandas.Index, optional Columns of the result. We may know this in advance, and if not provided it must be computed. apply_indices : list-like, optional Indices of `axis ^ 1` to apply groupby over. Returns ------- PandasDataframe New Modin DataFrame. """ by_parts = by if by is None else by._partitions if by is None: self._propagate_index_objs(axis=0) if apply_indices is not None: numeric_indices = self.get_axis(axis ^ 1).get_indexer_for(apply_indices) apply_indices = list( self._get_dict_of_block_index(axis ^ 1, numeric_indices).keys() ) if by_parts is not None: # inplace operation if by_parts.shape[axis] != self._partitions.shape[axis]: self._filter_empties(compute_metadata=False) new_partitions = self._partition_mgr_cls.groupby_reduce( axis, self._partitions, by_parts, map_func, reduce_func, apply_indices ) return self.__constructor__( new_partitions, index=new_index, columns=new_columns, pandas_backend=self._pandas_backend, ) @classmethod def from_pandas(cls, df): """ Create a Modin DataFrame from a pandas DataFrame. Parameters ---------- df : pandas.DataFrame A pandas DataFrame. Returns ------- PandasDataframe New Modin DataFrame. 
""" new_index = df.index new_columns = df.columns new_dtypes = df.dtypes new_frame, pandas_backend, new_lengths, new_widths = ( cls._partition_mgr_cls.from_pandas(df, True) ) return cls( new_frame, new_index, new_columns, new_lengths, new_widths, dtypes=new_dtypes, pandas_backend=pandas_backend, ) @classmethod def from_arrow(cls, at): """ Create a Modin DataFrame from an Arrow Table. Parameters ---------- at : pyarrow.table Arrow Table. Returns ------- PandasDataframe New Modin DataFrame. """ new_frame, pandas_backend, new_lengths, new_widths = ( cls._partition_mgr_cls.from_arrow(at, return_dims=True) ) new_columns = Index.__new__(Index, data=at.column_names, dtype="O") new_index = Index.__new__(RangeIndex, data=range(at.num_rows)) new_dtypes = pandas.Series( [cls._arrow_type_to_dtype(col.type) for col in at.columns], index=at.column_names, ) return cls( partitions=new_frame, index=new_index, columns=new_columns, row_lengths=new_lengths, column_widths=new_widths, dtypes=new_dtypes, pandas_backend=pandas_backend, ) @classmethod def _arrow_type_to_dtype(cls, arrow_type): """ Convert an arrow data type to a pandas data type. Parameters ---------- arrow_type : arrow dtype Arrow data type to be converted to a pandas data type. Returns ------- object Any dtype compatible with pandas. """ import pyarrow try: # TODO: should we map arrow types to pyarrow-backed pandas types? # It seems like this might help avoid the expense of transferring # data between backends (numpy and pyarrow), but we need to be sure # how this fits into the type inference system in pandas. 
res = arrow_type.to_pandas_dtype() # Conversion to pandas is not implemented for some arrow types, # perform manual conversion for them: except NotImplementedError: if pyarrow.types.is_time(arrow_type): res = np.dtype(datetime.time) else: raise if not isinstance(res, (np.dtype, str)): return np.dtype(res) return res @lazy_metadata_decorator(apply_axis="both") def to_pandas(self): """ Convert this Modin DataFrame to a pandas DataFrame. Returns ------- pandas.DataFrame """ df = self._partition_mgr_cls.to_pandas(self._partitions) if df.empty: df = pandas.DataFrame(columns=self.columns, index=self.index) if len(df.columns) and self.has_materialized_dtypes: df = df.astype(self.dtypes) else: for axis, has_external_index in enumerate( ["has_materialized_index", "has_materialized_columns"] ): # no need to check external and internal axes since in that case # external axes will be computed from internal partitions if getattr(self, has_external_index): external_index = self.columns if axis else self.index ErrorMessage.catch_bugs_and_request_email( not df.axes[axis].equals(external_index), f"Internal and external indices on axis {axis} do not match.", ) # have to do this in order to assign some potentially missing metadata, # the ones that were set to the external index but were never propagated # into the internal ones df = df.set_axis(axis=axis, labels=external_index, copy=False) return df def to_numpy(self, **kwargs): """ Convert this Modin DataFrame to a NumPy array. Parameters ---------- **kwargs : dict Additional keyword arguments to be passed in `to_numpy`. 
Returns ------- np.ndarray """ arr = self._partition_mgr_cls.to_numpy(self._partitions, **kwargs) ErrorMessage.catch_bugs_and_request_email( self.has_materialized_index and len(arr) != len(self.index) or self.has_materialized_columns and len(arr[0]) != len(self.columns) ) return arr @lazy_metadata_decorator(apply_axis=None, transpose=True) def transpose(self): """ Transpose the index and columns of this Modin DataFrame. Reflect this Modin DataFrame over its main diagonal by writing rows as columns and vice-versa. Returns ------- PandasDataframe New Modin DataFrame. """ new_partitions = self._partition_mgr_cls.lazy_map_partitions( self._partitions, lambda df: df.T ).T if self.has_materialized_dtypes: new_dtypes = pandas.Series( np.full(len(self.index), find_common_type(self.dtypes.values)), index=self.index, ) else: new_dtypes = None return self.__constructor__( new_partitions, self.copy_columns_cache(copy_lengths=True), self.copy_index_cache(copy_lengths=True), self._column_widths_cache, self._row_lengths_cache, dtypes=new_dtypes, pandas_backend=self._pandas_backend, ) @lazy_metadata_decorator(apply_axis="both") def finalize(self): """ Perform all deferred calls on partitions. This makes `self` Modin Dataframe independent of a history of queries that were used to build it. """ self._partition_mgr_cls.finalize(self._partitions) def wait_computations(self): """Wait for all computations to complete without materializing data.""" self._partition_mgr_cls.wait_partitions(self._partitions.flatten()) def support_materialization_in_worker_process(self) -> bool: """ Whether it's possible to call function `to_pandas` during the pickling process, at the moment of recreating the object. Returns ------- bool """ return True def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): """ Get a Modin DataFrame that implements the dataframe exchange protocol. See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html. 
    def case_when(self, caselist):
        """
        Replace values where the conditions are True.

        This is Series.case_when() implementation and, thus, it's designed to work
        only with single-column DataFrames.

        Parameters
        ----------
        caselist : list of tuples
            Each tuple holds a condition followed by a replacement. An element may be
            a frame of the same type as `self`, a callable, a ``pandas.Series``,
            a list-like or any other value (passed to the kernel as is).

        Returns
        -------
        PandasDataframe
        """
        # The import is here to avoid an incorrect module initialization when running tests.
        # This module is loaded before `pytest_configure()` is called. If `pytest_configure()`
        # changes the engine, the `remote_function` decorator will not be valid.
        from modin.core.execution.utils import remote_function

        @remote_function
        def remote_fn(df, name, caselist):  # pragma: no cover
            # Squeeze every DataFrame in the case list to a Series, run
            # `Series.case_when()` and wrap the result into a one-column frame
            caselist = [
                tuple(
                    (
                        data.squeeze(axis=1)
                        if isinstance(data, pandas.DataFrame)
                        else data
                    )
                    for data in case_tuple
                )
                for case_tuple in caselist
            ]
            return pandas.DataFrame({name: df.squeeze(axis=1).case_when(caselist)})

        cls = type(self)
        # Stays True only while no element requires splitting between partitions
        use_map = True
        # Computed lazily, on the first pandas.Series met in the case list
        is_trivial_idx = None
        name = self.columns[0]
        # Lists of modin frames: first for conditions, second for replacements
        modin_lists = [[], []]
        # Fill values for conditions and replacements respectively
        fill_values = [True, None]
        new_caselist = []
        # First pass: normalize every element of every case tuple
        for case_tuple in caselist:
            new_case = []
            for data, modin_list, fill_value in zip(
                case_tuple, modin_lists, fill_values
            ):
                if isinstance(data, cls):
                    # Collected here, copartitioned with `self` below
                    modin_list.append(data)
                elif callable(data):
                    data = remote_function(data)
                elif isinstance(data, pandas.Series):
                    use_map = False
                    if is_trivial_idx is None:
                        self_idx = self.index
                        length = len(self_idx)
                        is_trivial_idx = is_trivial_index(self_idx)
                    if is_trivial_idx and is_trivial_index(data.index):
                        # Both indices are trivial: align positionally by cutting
                        # the series or padding it with `fill_value`
                        data = data[:length]
                        diff = length - len(data)
                        if diff > 0:
                            data = pandas.concat(
                                [data, pandas.Series([fill_value] * diff)],
                                ignore_index=True,
                            )
                    else:
                        # Otherwise align by labels
                        data = data.reindex(self_idx, fill_value=fill_value)
                elif use_map and is_list_like(data):
                    use_map = False
                new_case.append(data)
            new_caselist.append(tuple(new_case))

        if modin_lists[0] or modin_lists[1]:
            # Copartition modin frames
            use_map = False
            columns = self.columns
            column_widths = [1]

            for modin_list, fill_value in zip(modin_lists, fill_values):
                _, list_of_right_parts, joined_index, row_lengths = self._copartition(
                    Axis.ROW_WISE.value,
                    modin_list,
                    how="left",
                    sort=False,
                    fill_value=fill_value,
                )
                # Replace the collected frames in place with frames built from
                # the copartitioned partitions
                modin_list.clear()
                modin_list.extend(
                    self.__constructor__(
                        part,
                        joined_index,
                        columns,
                        row_lengths,
                        column_widths,
                        pandas_backend=self._pandas_backend,
                    )
                    for part in list_of_right_parts
                )

            # Replace modin frames with copartitioned
            caselist = new_caselist
            new_caselist = []
            # The frames were collected in the order of their appearance in the
            # case list, so consuming the iterators in the same order restores
            # the original positions
            for i in range(2):
                modin_lists[i] = iter(modin_lists[i])
            for case_tuple in caselist:
                new_case = tuple(
                    next(modin_list) if isinstance(data, cls) else data
                    for data, modin_list in zip(case_tuple, modin_lists)
                )
                new_caselist.append(new_case)

        # If all the conditions are callable and the replacements are either
        # callable or scalar, use map().
        if use_map:
            return self.map(func=remote_fn, func_args=[name, new_caselist], lazy=True)

        # Get the chunk of data corresponding to the specified partition
        def map_data(
            part_idx,
            part_len,
            data,
            data_offset,
            fill_value,
        ):
            # NOTE(review): `fill_value` is accepted but never read in this helper
            if isinstance(data, cls):
                return data._partitions[part_idx][0]._data
            if isinstance(data, pandas.Series):
                return data[data_offset : data_offset + part_len]
            return (
                data[data_offset : data_offset + part_len]
                if is_list_like(data)
                else data
            )

        # Only the first column partition of every row is used
        parts = [p[0] for p in self._partitions]
        lengths = self.row_lengths
        new_parts = []
        data_offset = 0

        # Split the data and apply the remote function to each partition
        # with the corresponding chunk of data
        for i, part, part_len in zip(range(len(parts)), parts, lengths):
            cases = [
                tuple(
                    map_data(i, part_len, data, data_offset, fill_value)
                    for data, fill_value in zip(c, (True, None))
                )
                for c in new_caselist
            ]
            new_parts.append(
                part.add_to_apply_calls(
                    remote_fn,
                    name,
                    cases,
                    length=part_len,
                    width=1,
                )
            )
            data_offset += part_len
        # Shape the partitions back into a 2D grid with a single column
        new_parts = np.array([[p] for p in new_parts])
        return self.__constructor__(
            new_parts,
            columns=self.columns,
            index=self.index,
            row_lengths=lengths,
            column_widths=[1],
            pandas_backend=self._pandas_backend,
        )
class ShuffleFunctions:
    """
    Interface of the stages required to build a range-partitioning.

    An implementation provides three stages: sampling of a partition, picking
    quantiles out of the samples, and splitting a partition into the ranges.

    Parameters
    ----------
    modin_frame : PandasDataframe
        The frame to build the range-partitioning for.
    columns : str or list of strings
        The column/columns to use as a key.
    ascending : bool
        Whether the ranges should be in ascending or descending order.
    ideal_num_new_partitions : int
        The ideal number of new partitions.
    **kwargs : dict
        Additional keyword arguments.
    """

    # NOTE: the class doesn't use ``abc.ABCMeta``, so the ``abstractmethod`` markers
    # below only document the contract and don't prevent instantiation.
    def __init__(
        self,
        modin_frame,
        columns,
        ascending,
        ideal_num_new_partitions,
        **kwargs,
    ):
        # Nothing to store at the interface level
        return None

    @abc.abstractmethod
    def sample_fn(self, partition: pandas.DataFrame) -> pandas.DataFrame:
        """
        Pick samples over the given partition.

        Parameters
        ----------
        partition : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame:
            The samples for the partition.
        """

    @abc.abstractmethod
    def pivot_fn(self, samples: "list[pandas.DataFrame]") -> int:
        """
        Determine quantiles from the given samples and save it for the future ``.split_fn()`` calls.

        Parameters
        ----------
        samples : list of pandas.DataFrames

        Returns
        -------
        int
            The number of bins the ``.split_fn()`` will return.
        """

    @abc.abstractmethod
    def split_fn(self, partition: pandas.DataFrame) -> "tuple[pandas.DataFrame, ...]":
        """
        Split the given dataframe into the range-partitions defined by the preceding call of the ``.pivot_fn()``.

        Parameters
        ----------
        partition : pandas.DataFrame

        Returns
        -------
        tuple of pandas.DataFrames

        Notes
        -----
        In order to call this method you must call the ``.pivot_fn()`` first.
        """
True: bins[i - 1] < x <= bins[i] False: bins[i - 1] <= x < bins[i] **kwargs : dict Additional keyword arguments. """ def __init__( self, modin_frame: "PandasDataframe", columns: Optional[Union[str, list]], ascending: Union[list, bool], ideal_num_new_partitions: int, level: Optional[list[Union[str, int]]] = None, closed_on_right: bool = False, **kwargs: dict, ): self.frame_len = len(modin_frame) self.ideal_num_new_partitions = ideal_num_new_partitions self.columns = columns if is_list_like(columns) else [columns] self.ascending = ascending self.kwargs = kwargs.copy() self.level = level self.columns_info = None self.closed_on_right = closed_on_right def sample_fn(self, partition: pandas.DataFrame) -> pandas.DataFrame: if self.level is not None: partition = self._index_to_df_zero_copy(partition, self.level) else: partition = partition[self.columns] return self.pick_samples_for_quantiles( partition, self.ideal_num_new_partitions, self.frame_len ) def pivot_fn(self, samples: "list[pandas.DataFrame]") -> int: key = self.kwargs.get("key", None) samples = pandas.concat(samples, axis=0, copy=False) columns_info: "list[ColumnInfo]" = [] number_of_groups = 1 cols = [] for i, col in enumerate(samples.columns): num_pivots = int(self.ideal_num_new_partitions / number_of_groups) if num_pivots < 2 and len(columns_info): break column_val = samples[col] cols.append(col) is_numeric = is_numeric_dtype(column_val.dtype) # When we are not sorting numbers, we need our quantiles to not do arithmetic on the values method = "linear" if is_numeric else "inverted_cdf" pivots = self.pick_pivots_from_samples_for_sort( column_val, num_pivots, method, key ) columns_info.append( ColumnInfo( self.level[i] if self.level is not None else col, pivots, is_numeric, ) ) number_of_groups *= len(pivots) + 1 self.columns_info = columns_info return number_of_groups def split_fn( self, partition: pandas.DataFrame, ) -> "tuple[pandas.DataFrame, ...]": ErrorMessage.catch_bugs_and_request_email( 
failure_condition=self.columns_info is None, extra_log="The 'split_fn' doesn't have proper metadata, the probable reason is that it was called before 'pivot_fn'", ) return self.split_partitions_using_pivots_for_sort( partition, self.columns_info, self.ascending, keys_are_index_levels=self.level is not None, closed_on_right=self.closed_on_right, **self.kwargs, ) @staticmethod def _find_quantiles( df: Union[pandas.DataFrame, pandas.Series], quantiles: list, method: str ) -> np.ndarray: """ Find quantiles of a given dataframe using the specified method. We use this method to provide backwards compatibility with NumPy versions < 1.23 (e.g. when the user is using Modin in compat mode). This is basically a wrapper around `np.quantile` that ensures we provide the correct `method` argument - i.e. if we are dealing with objects (which may or may not support algebra), we do not want to use a method to find quantiles that will involve algebra operations (e.g. mean) between the objects, since that may fail. Parameters ---------- df : pandas.DataFrame or pandas.Series The data to pick quantiles from. quantiles : list[float] The quantiles to compute. method : str The method to use. `linear` if dealing with numeric types, otherwise `inverted_cdf`. Returns ------- np.ndarray A NumPy array with the quantiles of the data. """ if method == "linear": # This is the default method for finding quantiles, so it does not need to be specified, # which keeps backwards compatibility with older versions of NumPy that do not have a # `method` keyword argument in np.quantile. return np.unique(np.quantile(df, quantiles)) else: try: return np.unique(np.quantile(df, quantiles, method=method)) except Exception: # In this case, we're dealing with an array of objects, but the current version of # NumPy does not have a `method` kwarg. We need to use the older kwarg, `interpolation` # instead. 
return np.unique(np.quantile(df, quantiles, interpolation="lower")) @staticmethod def pick_samples_for_quantiles( df: pandas.DataFrame, num_partitions: int, length: int, ) -> pandas.DataFrame: """ Pick samples over the given partition. This function picks samples from the given partition using the TeraSort algorithm - each value is sampled with probability 1 / m * ln(n * t) where m = total_length / num_partitions, t = num_partitions, and n = total_length. Parameters ---------- df : pandas.Dataframe The masked dataframe to pick samples from. num_partitions : int The number of partitions. length : int The total length. Returns ------- pandas.DataFrame: The samples for the partition. Notes ----- This sampling algorithm is inspired by TeraSort. You can find more information about TeraSort and the sampling algorithm at https://www.cse.cuhk.edu.hk/~taoyf/paper/sigmod13-mr.pdf. """ m = length / num_partitions probability = (1 / m) * np.log(num_partitions * length) return df.sample(frac=probability) def pick_pivots_from_samples_for_sort( self, samples: pandas.Series, ideal_num_new_partitions: int, method: str = "linear", key: Optional[Callable] = None, ) -> np.ndarray: """ Determine quantiles from the given samples. This function takes as input the quantiles calculated over all partitions from `sample_func` defined above, and determines a final NPartitions.get() quantiles to use to roughly sort the entire dataframe. It does so by collating all the samples and computing NPartitions.get() quantiles for the overall set. Parameters ---------- samples : pandas.Series The samples computed by ``get_partition_quantiles_for_sort``. ideal_num_new_partitions : int The ideal number of new partitions. method : str, default: linear The method to use when picking quantiles. key : Callable, default: None The key to use on the samples when picking pivots. Returns ------- np.ndarray A list of overall quantiles. 
""" samples = samples.to_numpy() # We don't call `np.unique` on the samples, since if a quantile shows up in multiple # partition's samples, this is probably an indicator of skew in the dataset, and we # want our final partitions to take this into account. if key is not None: samples = key(samples) # We don't want to pick very many quantiles if we have a very small dataframe. num_quantiles = ideal_num_new_partitions quantiles = [i / num_quantiles for i in range(1, num_quantiles)] # If we only desire 1 partition, we need to ensure that we're not trying to find quantiles # from an empty list of pivots. if len(quantiles) > 0: return self._find_quantiles(samples, quantiles, method) return np.array([]) @staticmethod def split_partitions_using_pivots_for_sort( df: pandas.DataFrame, columns_info: "list[ColumnInfo]", ascending: bool, keys_are_index_levels: bool = False, closed_on_right: bool = False, **kwargs: dict, ) -> "tuple[pandas.DataFrame, ...]": """ Split the given dataframe into the partitions specified by `pivots` in `columns_info`. This function takes as input a row-axis partition, as well as the quantiles determined by the `pivot_func` defined above. It then splits the input dataframe into NPartitions.get() dataframes, with the elements in the i-th split belonging to the i-th partition, as determined by the quantiles we're using. Parameters ---------- df : pandas.Dataframe The partition to split. columns_info : list of ColumnInfo Information regarding keys and pivots for range partitioning. ascending : bool The ascending flag. keys_are_index_levels : bool, default: False Whether `columns_info` describes index levels or actual columns from `df`. closed_on_right : bool, default: False Whether to include the right limit in range-partitioning. True: bins[i - 1] < x <= bins[i] False: bins[i - 1] <= x < bins[i] **kwargs : dict Additional keyword arguments. Returns ------- tuple[pandas.DataFrame] A tuple of the splits from this partition. 
""" if len(columns_info) == 0: # We can return the dataframe with zero changes if there were no pivots passed return (df,) key_data = ( ShuffleSortFunctions._index_to_df_zero_copy( df, [col_info.name for col_info in columns_info] ) if keys_are_index_levels else df[[col_info.name for col_info in columns_info]] ) na_index = key_data.isna().squeeze(axis=1) if na_index.ndim == 2: na_index = na_index.any(axis=1) na_rows = df[na_index] non_na_rows = df[~na_index] def get_group(grp, key, df): """Get a group with the `key` from the `grp`, if it doesn't exist return an empty slice of `df`.""" try: return grp.get_group(key) except KeyError: return pandas.DataFrame(index=df.index[:0], columns=df.columns).astype( df.dtypes ) groupby_codes = [] group_keys = [] for col_info in columns_info: pivots = col_info.pivots if len(pivots) == 0: continue # If `ascending=False` and we are dealing with a numeric dtype, we can pass in a reversed list # of pivots, and `np.digitize` will work correctly. For object dtypes, we use `np.searchsorted` # which breaks when we reverse the pivots. if not ascending and col_info.is_numeric: # `key` is already applied to `pivots` in the `pick_pivots_from_samples_for_sort` function. pivots = pivots[::-1] group_keys.append(range(len(pivots) + 1)) key = kwargs.pop("key", None) cols_to_digitize = ( non_na_rows.index.get_level_values(col_info.name) if keys_are_index_levels else non_na_rows[col_info.name] ) if key is not None: cols_to_digitize = key(cols_to_digitize) if cols_to_digitize.ndim == 2: cols_to_digitize = cols_to_digitize.squeeze() if col_info.is_numeric: groupby_col = np.digitize( cols_to_digitize, pivots, right=closed_on_right ) # `np.digitize` returns results based off of the sort order of the pivots it is passed. 
# When we only have one unique value in our pivots, `np.digitize` assumes that the pivots # are sorted in ascending order, and gives us results based off of that assumption - so if # we actually want to sort in descending order, we need to swap the new indices. if not ascending and len(np.unique(pivots)) == 1: groupby_col = len(pivots) - groupby_col else: groupby_col = np.searchsorted( pivots, cols_to_digitize, side="left" if closed_on_right else "right", ) # Since np.searchsorted requires the pivots to be in ascending order, if we want to sort # in descending order, we need to swap the new indices. if not ascending: groupby_col = len(pivots) - groupby_col groupby_codes.append(groupby_col) if len(group_keys) == 0: # We can return the dataframe with zero changes if there were no pivots passed return (df,) elif len(group_keys) == 1: group_keys = group_keys[0] else: group_keys = pandas.MultiIndex.from_product(group_keys) if len(non_na_rows) == 1: groups = [ # taking an empty slice for an index's metadata ( pandas.DataFrame(index=df.index[:0], columns=df.columns).astype( df.dtypes ) if key != groupby_codes[0] else non_na_rows ) for key in group_keys ] else: grouped = non_na_rows.groupby(groupby_codes) groups = [get_group(grouped, key, df) for key in group_keys] index_to_insert_na_vals = ( -1 if kwargs.get("na_position", "last") == "last" else 0 ) groups[index_to_insert_na_vals] = pandas.concat( [groups[index_to_insert_na_vals], na_rows] ).astype(df.dtypes) return tuple(groups) @staticmethod def _index_to_df_zero_copy( df: pandas.DataFrame, levels: list[Union[str, int]] ) -> pandas.DataFrame: """ Convert index `level` of `df` to a ``pandas.DataFrame``. Parameters ---------- df : pandas.DataFrame levels : list of labels or ints Index level to convert to a dataframe. Returns ------- pandas.DataFrame The columns in the resulting dataframe use the same data arrays as the index levels in the original `df`, so no copies. 
# Range-partitioning functions specialized for resampling: pivots are taken from the
# edges of the resample bins, and every split remembers the bounds of its bins.
# NOTE(review): the class and its overriding methods are left without docstrings so the
# `_inherit_docstrings` decorator can presumably supply the parent's ones — confirm.
@_inherit_docstrings(ShuffleSortFunctions)
class ShuffleResample(ShuffleSortFunctions):
    def __init__(
        self,
        modin_frame: "PandasDataframe",
        columns: Union[str, list],
        ascending: Union[list, bool],
        ideal_num_new_partitions: int,
        resample_kwargs: dict,
        **kwargs: dict,
    ):
        # work on a copy, as the dictionary is modified below
        resample_kwargs = resample_kwargs.copy()
        rule = resample_kwargs.pop("rule")

        if resample_kwargs["closed"] is None:
            # this rule regarding the default value of 'closed' is inherited
            # from pandas documentation for 'pandas.DataFrame.resample'
            if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"):
                resample_kwargs["closed"] = "right"
            else:
                resample_kwargs["closed"] = "left"

        super().__init__(
            modin_frame,
            columns,
            ascending,
            ideal_num_new_partitions,
            closed_on_right=resample_kwargs["closed"] == "right",
            **kwargs,
        )

        resample_kwargs["freq"] = to_offset(rule)
        self.resample_kwargs = resample_kwargs

    @staticmethod
    def pick_samples_for_quantiles(
        df: pandas.DataFrame,
        num_partitions: int,
        length: int,
    ) -> pandas.DataFrame:
        # to build proper bins we need min and max timestamp of the whole DatetimeIndex,
        # so computing it in each partition
        return pandas.concat([df.min().to_frame().T, df.max().to_frame().T])

    def pick_pivots_from_samples_for_sort(
        self,
        samples: pandas.Series,
        ideal_num_new_partitions: int,
        method: str = "linear",
        key: Optional[Callable] = None,
    ) -> list:
        # `samples` has to be a datetime-like series here, as its `.dt` accessor is used
        if key is not None:
            raise NotImplementedError(key)

        max_value = samples.max()

        first, last = _get_timestamp_range_edges(
            samples.min(),
            max_value,
            self.resample_kwargs["freq"],
            unit=samples.dt.unit,
            closed=self.resample_kwargs["closed"],
            origin=self.resample_kwargs["origin"],
            offset=self.resample_kwargs["offset"],
        )

        all_bins = pandas.date_range(
            start=first,
            end=last,
            freq=self.resample_kwargs["freq"],
            ambiguous=True,
            nonexistent="shift_forward",
            unit=samples.dt.unit,
        )

        all_bins = self._adjust_bin_edges(
            all_bins,
            max_value,
            freq=self.resample_kwargs["freq"],
            closed=self.resample_kwargs["closed"],
        )

        # take pivot values with an even interval
        step = 1 / ideal_num_new_partitions
        bins = [
            all_bins[int(len(all_bins) * i * step)]
            for i in range(1, ideal_num_new_partitions)
        ]
        return bins

    def _adjust_bin_edges(
        self,
        binner: pandas.DatetimeIndex,
        end_timestamp,
        freq,
        closed,
    ) -> pandas.DatetimeIndex:
        """
        Adjust bin edges.

        This function was copied & simplified from ``pandas.core.resample.TimeGrouper._adjust_bin_edges()``.

        Parameters
        ----------
        binner : pandas.DatetimeIndex
            Bin edges to adjust.
        end_timestamp : pandas.Timestamp
            Value the second-to-last edge is compared with in order to decide
            whether the last edge has to be dropped.
        freq : offset object
            Resampling frequency; only its ``.name`` is used here.
        closed : str
            Which side of a bin is closed; the edges are only shifted for "right".

        Returns
        -------
        pandas.DatetimeIndex
        """
        # Some hacks for > daily data, see pandas-dev/pandas#1471, pandas-dev/pandas#1458, pandas-dev/pandas#1483
        if freq.name not in ("BME", "ME", "W") and freq.name.split("-")[0] not in (
            "BQE",
            "BYE",
            "QE",
            "YE",
            "W",
        ):
            return binner

        # If the right end-point is on the last day of the month, roll forwards
        # until the last moment of that day. Note that we only do this for offsets
        # which correspond to the end of a super-daily period - "month start", for
        # example, is excluded.
        if closed == "right":
            # GH 21459, GH 9119: Adjust the bins relative to the wall time
            edges_dti = binner.tz_localize(None)
            edges_dti = (
                edges_dti
                + pandas.Timedelta(days=1, unit=edges_dti.unit).as_unit(edges_dti.unit)
                - pandas.Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit)
            )
            binner = edges_dti.tz_localize(binner.tz)

        # intraday values on last day
        if binner[-2] > end_timestamp:
            binner = binner[:-1]
        return binner

    @staticmethod
    def split_partitions_using_pivots_for_sort(
        df: pandas.DataFrame,
        columns_info: "list[ColumnInfo]",
        ascending: bool,
        closed_on_right: bool = True,
        **kwargs: dict,
    ) -> "tuple[pandas.DataFrame, ...]":
        def add_attr(df, timestamp):
            # append `timestamp` to the bounds that are already attached to the split
            if "bin_bounds" in df.attrs:
                df.attrs["bin_bounds"] = (*df.attrs["bin_bounds"], timestamp)
            else:
                df.attrs["bin_bounds"] = (timestamp,)
            return df

        # `closed_on_right` is a named parameter of this override, so it never ends up in
        # `kwargs` and must be forwarded explicitly. Without that the parent would always
        # split with its own default (closed on the left), regardless of the closed side
        # of the resample bins.
        result = ShuffleSortFunctions.split_partitions_using_pivots_for_sort(
            df, columns_info, ascending, closed_on_right=closed_on_right, **kwargs
        )
        if len(columns_info) == 0:
            # no keys means no pivots, so there are no bounds to attach
            return result
        # it's required for each bin to know its bounds in order for resampling to work
        # properly when down-sampling occurs. Reach here for an example:
        # https://github.com/modin-project/modin/pull/7140#discussion_r1549246505
        # We're writing the bounds as 'attrs' to avoid duplications in the final partition
        for i, pivot in enumerate(columns_info[0].pivots):
            add_attr(result[i], pivot - pandas.Timedelta(1, unit="ns"))
            # `i + 1` is a valid position only while it is strictly less than the length
            if i + 1 < len(result):
                add_attr(result[i + 1], pivot + pandas.Timedelta(1, unit="ns"))
        return result
Parameters ---------- apply_axis : str, default: None The axes on which to apply the reindexing operations to the `self._partitions` lazily. Case None: No lazy metadata propagation. Case "both": Add reindexing operations on both axes to partition queue. Case "opposite": Add reindexing operations complementary to given axis. Case "rows": Add reindexing operations on row axis to partition queue. axis_arg : int, default: -1 The index or column axis. transpose : bool, default: False Boolean for if a transpose operation is being used. Returns ------- Wrapped Function. """ def decorator(f): from functools import wraps @wraps(f) def run_f_on_minimally_updated_metadata(self, *args, **kwargs): from .dataframe import PandasDataframe for obj in ( [self] + [o for o in args if isinstance(o, PandasDataframe)] + [v for v in kwargs.values() if isinstance(v, PandasDataframe)] + [ d for o in args if isinstance(o, list) for d in o if isinstance(d, PandasDataframe) ] + [ d for _, o in kwargs.items() if isinstance(o, list) for d in o if isinstance(d, PandasDataframe) ] ): if apply_axis == "both": if obj._deferred_index and obj._deferred_column: obj._propagate_index_objs(axis=None) elif obj._deferred_index: obj._propagate_index_objs(axis=0) elif obj._deferred_column: obj._propagate_index_objs(axis=1) elif apply_axis == "opposite": if "axis" not in kwargs: axis = args[axis_arg] else: axis = kwargs["axis"] if axis == 0 and obj._deferred_column: obj._propagate_index_objs(axis=1) elif axis == 1 and obj._deferred_index: obj._propagate_index_objs(axis=0) elif apply_axis == "rows": obj._propagate_index_objs(axis=0) result = f(self, *args, **kwargs) if apply_axis is None and not transpose: result._deferred_index = self._deferred_index result._deferred_column = self._deferred_column elif apply_axis is None and transpose: result._deferred_index = self._deferred_column result._deferred_column = self._deferred_index elif apply_axis == "opposite": if axis == 0: result._deferred_index = 
def add_missing_categories_to_groupby(
    dfs,
    by,
    operator,
    initial_columns,
    combined_cols,
    is_udf_agg,
    kwargs,
    initial_dtypes=None,
):
    """
    Generate values for missing categorical values to be inserted into groupby result.

    This function is used to emulate behavior of ``groupby(observed=False)`` parameter,
    it takes groupby result that was computed using ``groupby(observed=True)``
    and computes results for categorical values that are not presented in `dfs`.

    Parameters
    ----------
    dfs : list of pandas.DataFrames
        Row partitions containing groupby results.
    by : list of hashable
        Column labels that were used to perform groupby.
    operator : callable
        Aggregation function that was used during groupby.
    initial_columns : pandas.Index
        Column labels of the original dataframe.
    combined_cols : pandas.Index
        Column labels of the groupby result.
    is_udf_agg : bool
        Whether ``operator`` is a UDF.
    kwargs : dict
        Parameters that were passed to ``groupby(by, **kwargs)``. Must contain the ``sort``
        key. The passed dictionary is not modified.
    initial_dtypes : pandas.Series, optional
        Dtypes of the original dataframe. If not specified, assume it's ``int64``.

    Returns
    -------
    masks : dict[int, pandas.DataFrame]
        Mapping between partition idx and a dataframe with results for missing categorical values
        to insert to this partition.
    new_combined_cols : pandas.Index
        New column labels of the groupby result. If ``is_udf_agg is True``, then ``operator``
        may change the resulted columns.
    """
    # Taking a copy instead of writing 'observed' into the caller's dictionary, so the
    # caller's groupby parameters stay as they were passed
    kwargs = {**kwargs, "observed": False}
    new_combined_cols = combined_cols

    ### At first we need to compute missing categorical values

    indices = [df.index for df in dfs]
    # total_index contains all categorical values that resided in the result,
    # missing values are computed differently depending on whether we're grouping
    # on multiple groupers or not
    total_index = indices[0].append(indices[1:])

    if isinstance(total_index, pandas.MultiIndex):
        if all(
            not isinstance(level, pandas.CategoricalIndex)
            for level in total_index.levels
        ):
            return {}, new_combined_cols
        missing_cats_dtype = {
            name: (
                level.dtype
                if isinstance(level.dtype, pandas.CategoricalDtype)
                # it's a bit confusing but we have to convert the remaining 'by' columns to categoricals
                # in order to compute a proper fill value later in the code
                else pandas.CategoricalDtype(level)
            )
            for level, name in zip(total_index.levels, total_index.names)
        }
        # if we're grouping on multiple groupers, then the missing categorical values is a
        # cartesian product of (actual_missing_categorical_values X all_values_of_another_groupers)
        complete_index = pandas.MultiIndex.from_product(
            [
                value.categories.astype(total_level.dtype)
                for total_level, value in zip(
                    total_index.levels, missing_cats_dtype.values()
                )
            ],
            names=by,
        )
        missing_index = complete_index[~complete_index.isin(total_index)]
    else:
        if not isinstance(total_index, pandas.CategoricalIndex):
            return {}, new_combined_cols
        # if we're grouping on a single grouper then we simply compute the difference
        # between categorical values in the result and the values defined in categorical dtype
        missing_index = total_index.categories.difference(total_index.values)
        missing_cats_dtype = {by[0]: pandas.CategoricalDtype(missing_index)}
        missing_index.names = by

    if len(missing_index) == 0:
        return {}, new_combined_cols

    ### At this stage we want to get a fill_value for missing categorical values
    if is_udf_agg and isinstance(total_index, pandas.MultiIndex):
        # if grouping on multiple columns and aggregating with an UDF, then the
        # fill value is always `np.nan`
        missing_values = pandas.DataFrame({0: [np.nan]})
    else:
        # In case of a udf aggregation we're forced to run the operator against each
        # missing category, as in theory it can return different results for each
        # empty group. In other cases it's enough to run the operator against a single
        # missing categorical and then broadcast the fill value to each missing value
        if not is_udf_agg:
            missing_cats_dtype = {
                key: pandas.CategoricalDtype(value.categories[:1])
                for key, value in missing_cats_dtype.items()
            }

        empty_df = pandas.DataFrame(columns=initial_columns)
        # HACK: default 'object' dtype doesn't fit our needs, as most of the aggregations
        # fail on a non-numeric columns, ideally, we need dtypes of the original dataframe,
        # however, 'int64' also works fine here if the original schema is not available
        empty_df = empty_df.astype(
            "int64" if initial_dtypes is None else initial_dtypes
        )
        empty_df = empty_df.astype(missing_cats_dtype)
        missing_values = operator(empty_df.groupby(by, **kwargs))

    if is_udf_agg and not isinstance(total_index, pandas.MultiIndex):
        missing_values = missing_values.drop(columns=by, errors="ignore")
        new_combined_cols = pandas.concat(
            [
                pandas.DataFrame(columns=combined_cols),
                missing_values.iloc[:0],
            ],
            axis=0,
            join="outer",
        ).columns
    else:
        # HACK: If the aggregation has failed, the result would be empty. Assuming the
        # fill value to be `np.nan` here (this may not always be correct!!!)
        fill_value = np.nan if len(missing_values) == 0 else missing_values.iloc[0, 0]
        missing_values = pandas.DataFrame(
            fill_value, index=missing_index, columns=combined_cols
        )

    # restoring original categorical dtypes for the indices (MultiIndex already have proper dtypes)
    if not isinstance(missing_values.index, pandas.MultiIndex):
        missing_values.index = missing_values.index.astype(total_index.dtype)

    ### Then we decide to which missing categorical values should go to which partition
    if not kwargs["sort"]:
        # If the result is allowed to be unsorted, simply insert all the missing
        # categories to the last partition
        mask = {len(indices) - 1: missing_values}
        return mask, new_combined_cols

    # If the result has to be sorted, we have to assign missing categoricals to proper partitions.
    # For that purpose we define bins with corner values of each partition and then using either
    # np.digitize or np.searchsorted find correct bins for each missing categorical value.
    # Example: part0-> [0, 1, 2]; part1-> [3, 4, 10, 12]; part2-> [15, 17, 20, 100]
    #          bins -> [2, 12] # took last values of each partition excluding the last partition
    #          (every value that's matching 'x > part[-2][-1]' should go to the
    #           last partition, meaning that including the last value of the last
    #           partitions doesn't make sense)
    #          missing_cats -> [-2, 5, 6, 14, 21, 120]
    #          np.digitize(missing_cats, bins) -> [ 0,  1,  1,  2,  2, 2]
    #                                               ^-- mapping between values and partition idx to insert
    bins = []
    old_bins_to_new = {}
    offset = 0
    # building bins by taking last values of each partition excluding the last partition
    for idx in indices[:-1]:
        if len(idx) == 0:
            # if a partition is empty, we can't use its values to define a bin, thus we simply
            # skip it and remember the number of skipped partitions as an 'offset'
            offset += 1
            continue
        # remember the number of skipped partitions before this bin, in order to restore original
        # indexing at the end
        old_bins_to_new[len(bins)] = offset
        # for MultiIndices we always use the very first level for bins as using multiple levels
        # doesn't affect the result
        bins.append(idx[-1][0] if isinstance(idx, pandas.MultiIndex) else idx[-1])
    # the bin past the last edge also needs to know how many partitions were skipped
    old_bins_to_new[len(bins)] = offset

    if len(bins) == 0:
        # insert values to the first non-empty partition
        return {old_bins_to_new.get(0, 0): missing_values}, new_combined_cols

    # we used the very first level of MultiIndex to build bins, meaning that we also have
    # to use values of the first index's level for 'digitize'
    lvl_zero = (
        missing_values.index.levels[0]
        if isinstance(missing_values.index, pandas.MultiIndex)
        else missing_values.index
    )
    if pandas.api.types.is_any_real_numeric_dtype(lvl_zero):
        part_idx = np.digitize(lvl_zero, bins, right=True)
    else:
        part_idx = np.searchsorted(bins, lvl_zero)

    ### In the end we build a dictionary mapping partition index to a dataframe with missing categoricals
    ### to be inserted into this partition
    masks = {}
    if isinstance(total_index, pandas.MultiIndex):
        for idx, values in pandas.RangeIndex(len(lvl_zero)).groupby(part_idx).items():
            masks[idx] = missing_values[
                pandas.Index(missing_values.index.codes[0]).isin(values)
            ]
    else:
        frame_idx = missing_values.index.to_frame()
        for idx, values in lvl_zero.groupby(part_idx).items():
            masks[idx] = missing_values[frame_idx.iloc[:, 0].isin(values)]

    # Restore the original indexing by adding the amount of skipped missing partitions
    masks = {key + old_bins_to_new[key]: value for key, value in masks.items()}

    return masks, new_combined_cols
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe functionality related to data exchange protocols and optimized for pandas storage format.""" ================================================ FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Base Modin Dataframe functionality related to the dataframe exchange protocol and optimized for pandas storage format. See more in https://data-apis.org/dataframe-protocol/latest/index.html. """ ================================================ FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/buffer.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
@_inherit_docstrings(ProtocolBuffer)
class PandasProtocolBuffer(ProtocolBuffer):
    """
    A contiguous block of memory backing a column of the exchange protocol.

    The buffer itself carries no dtype attribute and can be seen as plain memory.
    If the column the buffer belongs to has a dtype that DLPack supports and
    ``__dlpack__`` is implemented, the dtype information is contained in the
    return value of ``__dlpack__``.

    Keeping these two things apart makes it possible to support both (a) data
    exchange via DLPack on a buffer and (b) dtypes like variable-length strings
    which do not have a fixed number of bytes per element.

    Parameters
    ----------
    x : np.ndarray
        Data to be held by ``Buffer``.
    allow_copy : bool, default: True
        Whether the library may copy the data. A copy is required when `x` is
        strided, since the protocol specifies contiguous buffers. If a copy is
        needed while the flag is ``False``, a ``RuntimeError`` is raised.
    """

    def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
        # only a 1-D array whose single stride equals the item size passes this check
        is_contiguous = x.strides == (x.dtype.itemsize,)
        if not is_contiguous:
            # The protocol does not support strided buffers, so the data has to be
            # copied; when copying is forbidden the export must fail instead.
            if not allow_copy:
                raise RuntimeError(
                    "Exports cannot be zero-copy in the case "
                    + "of a non-contiguous buffer"
                )
            x = x.copy()

        # The array is kept privately; the public attributes are derived from it
        self._x = x

    @property
    def bufsize(self) -> int:
        # number of elements multiplied by the size of a single element
        held = self._x
        return held.size * held.dtype.itemsize

    @property
    def ptr(self) -> int:
        # first item of the "data" entry of the NumPy array interface
        interface = self._x.__array_interface__
        return interface["data"][0]

    def __dlpack__(self):
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
        class Device(enum.IntEnum):
            CPU = 1

        # the second element of the pair is always ``None`` here
        return (Device.CPU, None)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular ``PandasProtocolBuffer``.

        Returns
        -------
        str
        """
        summary = {
            "bufsize": self.bufsize,
            "ptr": self.ptr,
            "device": self.__dlpack_device__()[0].name,
        }
        return "Buffer(" + str(summary) + ")"
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Dataframe exchange protocol implementation. See more in https://data-apis.org/dataframe-protocol/latest/index.html. Notes ----- - Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to do in pure Python. It's more general but definitely less friendly than having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), this is worth looking at again. 
""" from __future__ import annotations from functools import cached_property from typing import Any, Dict, Iterable, Optional, Tuple import numpy as np import pandas from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( CategoricalDescription, ProtocolColumn, ) from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( ColumnNullType, DTypeKind, pandas_dtype_to_arrow_c, ) from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.utils import _inherit_docstrings from .buffer import PandasProtocolBuffer from .exception import NoOffsetsBuffer, NoValidityBuffer _NO_VALIDITY_BUFFER = { ColumnNullType.NON_NULLABLE: "This column is non-nullable so does not have a mask", ColumnNullType.USE_NAN: "This column uses NaN as null so does not have a separate mask", ColumnNullType.USE_SENTINEL: "This column uses a sentinel value so does not have a mask", } @_inherit_docstrings(ProtocolColumn) class PandasProtocolColumn(ProtocolColumn): """ A column object, with only the methods and properties required by the interchange protocol defined. A column can contain one or more chunks. Each chunk can contain up to three buffers - a data buffer, a mask buffer (depending on null representation), and an offsets buffer (if variable-size binary; e.g., variable-length strings). TBD: Arrow has a separate "null" dtype, and has no separate mask concept. Instead, it seems to use "children" for both columns with a bit mask, and for nested dtypes. Unclear whether this is elegant or confusing. This design requires checking the null representation explicitly. The Arrow design requires checking: 1. the ARROW_FLAG_NULLABLE (for sentinel values) 2. if a column has two children, combined with one of those children having a null dtype. Making the mask concept explicit seems useful. One null dtype would not be enough to cover both bit and byte masks, so that would mean even more checking if we did it the Arrow way. 
TBD: there's also the "chunk" concept here, which is implicit in Arrow as multiple buffers per array (= column here). Semantically it may make sense to have both: chunks were meant for example for lazy evaluation of data which doesn't fit in memory, while multiple buffers per column could also come from doing a selection operation on a single contiguous buffer. Given these concepts, one would expect chunks to be all of the same size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), while multiple buffers could have data-dependent lengths. Not an issue in pandas if one column is backed by a single NumPy array, but in Arrow it seems possible. Are multiple chunks *and* multiple buffers per column necessary for the purposes of this interchange protocol, or must producers either reuse the chunk concept for this or copy the data? Parameters ---------- column : PandasDataframe A ``PandasDataframe`` object. allow_copy : bool, default: True A keyword that defines whether or not the library is allowed to make a copy of the data. For example, copying data would be necessary if a library supports strided buffers, given that this protocol specifies contiguous buffers. Currently, if the flag is set to ``False`` and a copy is needed, a ``RuntimeError`` will be raised. Notes ----- This Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. 
""" def __init__(self, column: PandasDataframe, allow_copy: bool = True) -> None: if not isinstance(column, PandasDataframe): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") self._col = column self._allow_copy = allow_copy def size(self) -> int: return len(self._col.index) @property def offset(self) -> int: return 0 @cached_property def dtype(self) -> Tuple[DTypeKind, int, str, str]: dtype = self._col.dtypes.iloc[0] if isinstance(dtype, pandas.CategoricalDtype): pandas_series = self._col.to_pandas().squeeze(axis=1) codes = pandas_series.values.codes ( _, bitwidth, c_arrow_dtype_f_str, _, ) = self._dtype_from_primitive_pandas_dtype(codes.dtype) dtype_cache = ( DTypeKind.CATEGORICAL, bitwidth, c_arrow_dtype_f_str, "=", ) elif pandas.api.types.is_string_dtype(dtype): dtype_cache = (DTypeKind.STRING, 8, pandas_dtype_to_arrow_c(dtype), "=") else: dtype_cache = self._dtype_from_primitive_pandas_dtype(dtype) return dtype_cache def _dtype_from_primitive_pandas_dtype( self, dtype ) -> Tuple[DTypeKind, int, str, str]: """ Deduce dtype specific for the protocol from pandas dtype. See `self.dtype` for details. Parameters ---------- dtype : any A pandas dtype. Returns ------- tuple """ _np_kinds = { "i": DTypeKind.INT, "u": DTypeKind.UINT, "f": DTypeKind.FLOAT, "b": DTypeKind.BOOL, "M": DTypeKind.DATETIME, } kind = _np_kinds.get(dtype.kind, None) if kind is None: raise NotImplementedError( f"Data type {dtype} not supported by the dataframe exchange protocol" ) return ( kind, dtype.itemsize * 8, pandas_dtype_to_arrow_c(dtype), dtype.byteorder, ) @property def describe_categorical(self) -> CategoricalDescription: if self.dtype[0] != DTypeKind.CATEGORICAL: raise TypeError( "`describe_categorical only works on a column with " + "categorical dtype!" 
) pandas_series = self._col.to_pandas().squeeze(axis=1) cat_frame = type(self._col).from_pandas( pandas.DataFrame({"cat": pandas_series.cat.categories}) ) return { "is_ordered": pandas_series.cat.ordered, "is_dictionary": True, "categories": PandasProtocolColumn(cat_frame, self._allow_copy), } @property def describe_null(self) -> Tuple[int, Any]: nulls = { DTypeKind.FLOAT: (ColumnNullType.USE_NAN, None), DTypeKind.DATETIME: (ColumnNullType.USE_NAN, None), DTypeKind.INT: (ColumnNullType.NON_NULLABLE, None), DTypeKind.UINT: (ColumnNullType.NON_NULLABLE, None), DTypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None), # Null values for categoricals are stored as `-1` sentinel values # in the category date (e.g., `col.values.codes` is int8 np.ndarray) DTypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1), # follow Arrow in using 1 as valid value and 0 for missing/null value DTypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0), } kind = self.dtype[0] try: null, value = nulls[kind] except KeyError: raise NotImplementedError(f"Data type {kind} not yet supported") return null, value @cached_property def null_count(self) -> int: def map_func(df): return df.isna() def reduce_func(df): return pandas.DataFrame(df.sum()) intermediate_df = self._col.tree_reduce(0, map_func, reduce_func) # Set ``pandas.RangeIndex(1)`` to index and column labels because # 1) We internally use `MODIN_UNNAMED_SERIES_LABEL` for labels of a reduced axis # 2) The return value of `reduce_func` is a pandas DataFrame with # index and column labels set to ``pandas.RangeIndex(1)`` # 3) We further use `to_pandas().squeeze()` to get an integer value of the null count. 
# Otherwise, we get mismatching internal and external indices for both axes intermediate_df.index = pandas.RangeIndex(1) intermediate_df.columns = pandas.RangeIndex(1) return intermediate_df.to_pandas().squeeze(axis=1).item() @property def metadata(self) -> Dict[str, Any]: return {"modin.index": self._col.index} def num_chunks(self) -> int: return self._col._partitions.shape[0] def get_chunks( self, n_chunks: Optional[int] = None ) -> Iterable["PandasProtocolColumn"]: cur_n_chunks = self.num_chunks() n_rows = self.size() if n_chunks is None or n_chunks == cur_n_chunks: cum_row_lengths = np.cumsum([0] + self._col.row_lengths) for i in range(len(cum_row_lengths) - 1): yield PandasProtocolColumn( self._col.take_2d_labels_or_positional( row_positions=range(cum_row_lengths[i], cum_row_lengths[i + 1]), col_positions=None, ), allow_copy=self._col._allow_copy, ) return if n_chunks % cur_n_chunks != 0: raise RuntimeError( "The passed `n_chunks` must be a multiple of `self.num_chunks()`." ) if n_chunks > n_rows: raise RuntimeError( "The passed `n_chunks` value is bigger than `self.num_rows()`." 
) chunksize = n_rows // n_chunks new_lengths = [chunksize] * n_chunks new_lengths[-1] = n_rows % n_chunks + new_lengths[-1] new_partitions = self._col._partition_mgr_cls.map_axis_partitions( 0, self._col._partitions, lambda df: df, keep_partitioning=False, lengths=new_lengths, ) new_df = self._col.__constructor__( new_partitions, self._col.index, self._col.columns, new_lengths, self._col.column_widths, ) cum_row_lengths = np.cumsum([0] + new_df.row_lengths) for i in range(len(cum_row_lengths) - 1): yield PandasProtocolColumn( new_df.take_2d_labels_or_positional( row_positions=range(cum_row_lengths[i], cum_row_lengths[i + 1]), col_positions=None, ), allow_copy=self._allow_copy, ) def get_buffers(self) -> Dict[str, Any]: buffers = {} buffers["data"] = self._get_data_buffer() try: buffers["validity"] = self._get_validity_buffer() except NoValidityBuffer: buffers["validity"] = None try: buffers["offsets"] = self._get_offsets_buffer() except NoOffsetsBuffer: buffers["offsets"] = None return buffers _data_buffer_cache = None def _get_data_buffer( self, ) -> Tuple[PandasProtocolBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. Returns ------- tuple The data buffer. 
""" if self._data_buffer_cache is not None: return self._data_buffer_cache dtype = self.dtype if dtype[0] in ( DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL, DTypeKind.DATETIME, ): buffer = PandasProtocolBuffer( self._col.to_numpy().flatten(), allow_copy=self._allow_copy ) elif dtype[0] == DTypeKind.CATEGORICAL: pandas_series = self._col.to_pandas().squeeze(axis=1) codes = pandas_series.values.codes buffer = PandasProtocolBuffer(codes, allow_copy=self._allow_copy) dtype = self._dtype_from_primitive_pandas_dtype(codes.dtype) elif dtype[0] == DTypeKind.STRING: # Marshal the strings from a NumPy object array into a byte array buf = self._col.to_numpy().flatten() b = bytearray() # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later for i in range(buf.size): if type(buf[i]) is str: b.extend(buf[i].encode(encoding="utf-8")) # Convert the byte array to a pandas "buffer" using a NumPy array as the backing store buffer = PandasProtocolBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer dtype = ( DTypeKind.STRING, 8, "u", "=", ) # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype[0]} not handled yet") self._data_buffer_cache = (buffer, dtype) return self._data_buffer_cache _validity_buffer_cache = None def _get_validity_buffer(self) -> Tuple[PandasProtocolBuffer, Any]: """ Get the validity buffer. The buffer contains the mask values indicating missing data and the buffer's associated dtype. Returns ------- tuple The validity buffer. Raises ------ ``NoValidityBuffer`` if null representation is not a bit or byte mask. 
""" if self._validity_buffer_cache is not None: return self._validity_buffer_cache null, invalid = self.describe_null if self.dtype[0] == DTypeKind.STRING: # For now, have the mask array be comprised of bytes, rather than a bit array buf = self._col.to_numpy().flatten() # Determine the encoding for valid values valid = invalid == 0 invalid = not valid mask = np.empty(shape=(len(buf),), dtype=np.bool_) for i, obj in enumerate(buf): mask[i] = valid if isinstance(obj, str) else invalid # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store buffer = PandasProtocolBuffer(mask) # Define the dtype of the returned buffer dtype = (DTypeKind.BOOL, 8, "b", "=") self._validity_buffer_cache = (buffer, dtype) return self._validity_buffer_cache try: msg = _NO_VALIDITY_BUFFER[null] except KeyError: raise NotImplementedError("See self.describe_null") raise NoValidityBuffer(msg) _offsets_buffer_cache = None def _get_offsets_buffer(self) -> Tuple[PandasProtocolBuffer, Any]: """ Get the offsets buffer. The buffer contains the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. Returns ------- tuple The offsets buffer. Raises ------ ``NoOffsetsBuffer`` if the data buffer does not have an associated offsets buffer. 
""" if self._offsets_buffer_cache is not None: return self._offsets_buffer_cache if self.dtype[0] == DTypeKind.STRING: # For each string, we need to manually determine the next offset values = self._col.to_numpy().flatten() ptr = 0 offsets = [ptr] + [None] * len(values) for i, v in enumerate(values): # For missing values (in this case, `np.nan` values), we don't increment the pointer) if type(v) is str: b = v.encode(encoding="utf-8") ptr += len(b) offsets[i + 1] = ptr # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) buf = np.asarray(offsets, dtype="int64") # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store buffer = PandasProtocolBuffer(buf) # Assemble the buffer dtype info dtype = ( DTypeKind.INT, 64, "l", "=", ) # note: currently only support native endianness else: raise NoOffsetsBuffer( "This column has a fixed-length dtype so does not have an offsets buffer" ) self._offsets_buffer_cache = (buffer, dtype) return self._offsets_buffer_cache ================================================ FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language # governing permissions and limitations under the License. """ Dataframe exchange protocol implementation. See more in https://data-apis.org/dataframe-protocol/latest/index.html. Notes ----- - Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to do in pure Python. It's more general but definitely less friendly than having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), this is worth looking at again. """ import collections from typing import Any, Dict, Iterable, Optional, Sequence import numpy as np from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.utils import _inherit_docstrings from .column import PandasProtocolColumn @_inherit_docstrings(ProtocolDataframe) class PandasProtocolDataframe(ProtocolDataframe): """ A data frame class, with only the methods required by the interchange protocol defined. Instances of this (private) class are returned from ``modin.pandas.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. A "data frame" represents an ordered collection of named columns. A column's "name" must be a unique string. Columns may be accessed by name or by position. This could be a public data frame class, or an object with the methods and attributes defined on this DataFrame class could be returned from the ``__dataframe__`` method of a public data frame class in a library adhering to the dataframe interchange protocol specification. Parameters ---------- df : PandasDataframe A ``PandasDataframe`` object. nan_as_null : bool, default:False A keyword intended for the consumer to tell the producer to overwrite null values in the data with ``NaN`` (or ``NaT``). 
This currently has no effect; once support for nullable extension dtypes is added, this value should be propagated to columns. allow_copy : bool, default: True A keyword that defines whether or not the library is allowed to make a copy of the data. For example, copying data would be necessary if a library supports strided buffers, given that this protocol specifies contiguous buffers. Currently, if the flag is set to ``False`` and a copy is needed, a ``RuntimeError`` will be raised. """ def __init__( self, df: PandasDataframe, nan_as_null: bool = False, allow_copy: bool = True, ) -> None: self._df = df self._nan_as_null = nan_as_null self._allow_copy = allow_copy def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): return PandasProtocolDataframe( self._df, nan_as_null=nan_as_null, allow_copy=allow_copy ) @property def metadata(self) -> Dict[str, Any]: return {"modin.index": self._df.index} def num_columns(self) -> int: return len(self._df.columns) def num_rows(self) -> int: return len(self._df.index) def num_chunks(self) -> int: return self._df._partitions.shape[0] def column_names(self) -> Iterable[str]: for col in self._df.columns: yield col def get_column(self, i: int) -> PandasProtocolColumn: return PandasProtocolColumn( self._df.take_2d_labels_or_positional( row_positions=None, col_positions=[i] ), allow_copy=self._allow_copy, ) def get_column_by_name(self, name: str) -> PandasProtocolColumn: return PandasProtocolColumn( self._df.take_2d_labels_or_positional( row_positions=None, col_labels=[name] ), allow_copy=self._allow_copy, ) def get_columns(self) -> Iterable[PandasProtocolColumn]: for name in self._df.columns: yield PandasProtocolColumn( self._df.take_2d_labels_or_positional( row_positions=None, col_labels=[name] ), allow_copy=self._allow_copy, ) def select_columns(self, indices: Sequence[int]) -> "PandasProtocolDataframe": if not isinstance(indices, collections.abc.Sequence): raise ValueError("`indices` is not a sequence") return 
PandasProtocolDataframe( self._df.take_2d_labels_or_positional( row_positions=None, col_positions=indices ), allow_copy=self._allow_copy, ) def select_columns_by_name(self, names: Sequence[str]) -> "PandasProtocolDataframe": if not isinstance(names, collections.abc.Sequence): raise ValueError("`names` is not a sequence") return PandasProtocolDataframe( self._df.take_2d_labels_or_positional(row_positions=None, col_labels=names), allow_copy=self._allow_copy, ) def get_chunks( self, n_chunks: Optional[int] = None ) -> Iterable["PandasProtocolDataframe"]: cur_n_chunks = self.num_chunks() n_rows = self.num_rows() if n_chunks is None or n_chunks == cur_n_chunks: cum_row_lengths = np.cumsum([0] + self._df.row_lengths) for i in range(len(cum_row_lengths) - 1): yield PandasProtocolDataframe( self._df.take_2d_labels_or_positional( row_positions=range(cum_row_lengths[i], cum_row_lengths[i + 1]), col_positions=None, ), allow_copy=self._allow_copy, ) return if n_chunks % cur_n_chunks != 0: raise RuntimeError( "The passed `n_chunks` must be a multiple of `self.num_chunks()`." ) if n_chunks > n_rows: raise RuntimeError( "The passed `n_chunks` value is bigger than `self.num_rows()`." 
) chunksize = n_rows // n_chunks new_lengths = [chunksize] * n_chunks new_lengths[-1] = n_rows % n_chunks + new_lengths[-1] new_partitions = self._df._partition_mgr_cls.map_axis_partitions( 0, self._df._partitions, lambda df: df, keep_partitioning=False, lengths=new_lengths, ) new_df = self._df.__constructor__( new_partitions, self._df.index, self._df.columns, new_lengths, self._df.column_widths, ) cum_row_lengths = np.cumsum([0] + new_df.row_lengths) for i in range(len(cum_row_lengths) - 1): yield PandasProtocolDataframe( new_df.take_2d_labels_or_positional( row_positions=range(cum_row_lengths[i], cum_row_lengths[i + 1]), col_positions=None, ), allow_copy=self._allow_copy, ) ================================================ FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/exception.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Exceptions that can be caught by dataframe exchange protocol implementation for pandas storage format.""" class NoValidityBuffer(Exception): """Exception to be raised if there is no validity buffer for ``PandasProtocolColumn``.""" pass class NoOffsetsBuffer(Exception): """Exception to be raised if there is no offsets buffer for ``PandasProtocolColumn``.""" pass ================================================ FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses the functions building a ``pandas.DataFrame`` from a DataFrame exchange protocol object.""" import ctypes import re from typing import Any, Optional, Tuple, Union import numpy as np import pandas from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolBuffer, ProtocolColumn, ProtocolDataframe, ) from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( ArrowCTypes, ColumnNullType, DTypeKind, Endianness, ) np_types_map = { DTypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, DTypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, DTypeKind.FLOAT: {32: np.float32, 64: np.float64}, # Consider bitmask to be a uint8 dtype to parse the bits later DTypeKind.BOOL: {1: np.uint8, 8: bool}, } def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[int] = None): """ Build a ``pandas.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. Parameters ---------- df : ProtocolDataframe Object supporting the exchange protocol, i.e. `__dataframe__` method. n_chunks : int, optional Number of chunks to split `df`. Returns ------- pandas.DataFrame """ if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") df = df.__dataframe__() if isinstance(df, dict): df = df["dataframe"] pandas_dfs = [] for chunk in df.get_chunks(n_chunks): pandas_df = protocol_df_chunk_to_pandas(chunk) pandas_dfs.append(pandas_df) pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) index_obj = df.metadata.get("modin.index", df.metadata.get("pandas.index", None)) if index_obj is not None: pandas_df.index = index_obj return pandas_df def protocol_df_chunk_to_pandas(df): """ Convert exchange protocol chunk to ``pandas.DataFrame``. 
Parameters ---------- df : ProtocolDataframe Returns ------- pandas.DataFrame """ # We need a dict of columns here, with each column being a NumPy array (at # least for now, deal with non-NumPy dtypes later). columns = dict() buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): if not isinstance(name, str): raise ValueError(f"Column {name} is not a string") if name in columns: raise ValueError(f"Column {name} is not unique") col = df.get_column_by_name(name) columns[name], buf = unpack_protocol_column(col) buffers.append(buf) pandas_df = pandas.DataFrame(columns) pandas_df._buffers = buffers return pandas_df def unpack_protocol_column( col: ProtocolColumn, ) -> Tuple[Union[np.ndarray, pandas.Series], Any]: """ Unpack an interchange protocol column to a pandas-ready column. Parameters ---------- col : ProtocolColumn Column to unpack. Returns ------- tuple Tuple of resulting column (either an ndarray or a series) and the object which keeps memory referenced by the column alive. """ dtype = col.dtype[0] if dtype in ( DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL, ): return primitive_column_to_ndarray(col) elif dtype == DTypeKind.CATEGORICAL: return categorical_column_to_series(col) elif dtype == DTypeKind.STRING: return string_column_to_ndarray(col) elif dtype == DTypeKind.DATETIME: return datetime_column_to_ndarray(col) else: raise NotImplementedError(f"Data type {dtype} not handled yet") def primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. Parameters ---------- col : ProtocolColumn Returns ------- tuple Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. 
""" buffers = col.get_buffers() data_buff, data_dtype = buffers["data"] data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size()) data = set_nulls(data, col, buffers["validity"]) return data, buffers def categorical_column_to_series(col: ProtocolColumn) -> Tuple[pandas.Series, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters ---------- col : ProtocolColumn Returns ------- tuple Tuple of pandas.Series holding the data and the memory owner object that keeps the memory alive. """ cat_descr = col.describe_categorical ordered, is_dict, categories = ( cat_descr["is_ordered"], cat_descr["is_dictionary"], cat_descr["categories"], ) if not is_dict or categories is None: raise NotImplementedError("Non-dictionary categoricals not supported yet") buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size()) # Doing module in order to not get ``IndexError`` for out-of-bounds sentinel values in `codes` cat_values, categories_buf = unpack_protocol_column(categories) values = cat_values[codes % len(cat_values)] cat = pandas.Categorical(values, categories=cat_values, ordered=ordered) data = pandas.Series(cat) data = set_nulls(data, col, buffers["validity"]) return data, [buffers, categories_buf] def _inverse_null_buf(buf: np.ndarray, null_kind: ColumnNullType) -> np.ndarray: """ Inverse the boolean value of buffer storing either bit- or bytemask. Parameters ---------- buf : np.ndarray Buffer to inverse the boolean value for. null_kind : {ColumnNullType.USE_BYTEMASK, ColumnNullType.USE_BITMASK} How to treat the buffer. Returns ------- np.ndarray Logically inversed buffer. 
""" if null_kind == ColumnNullType.USE_BITMASK: return ~buf assert ( null_kind == ColumnNullType.USE_BYTEMASK ), f"Unexpected null kind: {null_kind}" # bytemasks use 0 for `False` and anything else for `True`, so convert to bool # by direct comparison instead of bitwise reversal like we do for bitmasks return buf == 0 def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding string data to a NumPy array. Parameters ---------- col : ProtocolColumn Returns ------- tuple Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. """ null_kind, sentinel_val = col.describe_null if null_kind not in ( ColumnNullType.NON_NULLABLE, ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK, ): raise NotImplementedError( f"{null_kind} null kind is not yet supported for string columns." ) buffers = col.get_buffers() # Retrieve the data buffer containing the UTF-8 code units data_buff, protocol_data_dtype = buffers["data"] # We're going to reinterpret the buffer as uint8, so making sure we can do it safely assert protocol_data_dtype[1] == 8 # bitwidth == 8 assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8 # Convert the buffers to NumPy arrays, in order to go from STRING to an equivalent ndarray, # we claim that the buffer is uint8 (i.e., a byte array) data_dtype = ( DTypeKind.UINT, 8, ArrowCTypes.UINT8, Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size()) # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string offset_buff, offset_dtype = buffers["offsets"] # Offsets buffer contains start-stop positions of strings in the data buffer, # meaning that it has more elements than in the data buffer, do `col.size() + 1` here # to pass a proper offsets buffer size offsets = buffer_to_ndarray( offset_buff, offset_dtype, 
col.offset, length=col.size() + 1 ) null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): valid_buff, valid_dtype = buffers["validity"] null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) if sentinel_val == 0: null_pos = _inverse_null_buf(null_pos, null_kind) # Assemble the strings from the code units str_list = [None] * col.size() for i in range(col.size()): # Check for missing values if null_pos is not None and null_pos[i]: str_list[i] = np.nan continue # Extract a range of code units units = data[offsets[i] : offsets[i + 1]] # Convert the list of code units to bytes str_bytes = bytes(units) # Create the string string = str_bytes.decode(encoding="utf-8") # Add to our list of strings str_list[i] = string # Convert the string list to a NumPy array return np.asarray(str_list, dtype="object"), buffers def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding DateTime data to a NumPy array. Parameters ---------- col : ProtocolColumn Returns ------- tuple Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. 
""" buffers = col.get_buffers() _, _, format_str, _ = col.dtype dbuf, dtype = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 data = buffer_to_ndarray( dbuf, ( DTypeKind.UINT, dtype[1], getattr(ArrowCTypes, f"UINT{dtype[1]}"), Endianness.NATIVE, ), col.offset, col.size(), ) def parse_format_str(format_str, data): """Parse datetime `format_str` to interpret the `data`.""" # timestamp 'ts{unit}:tz' timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) if timestamp_meta: unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) if tz != "": raise NotImplementedError("Timezones are not supported yet") if unit != "s": # the format string describes only a first letter of the unit, add one extra # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' unit += "s" data = data.astype(f"datetime64[{unit}]") return data # date 'td{Days/Ms}' date_meta = re.match(r"td([Dm])", format_str) if date_meta: unit = date_meta.group(1) if unit == "D": # NumPy doesn't support DAY unit, so converting days to seconds # (converting to uint64 to avoid overflow) data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") elif unit == "m": data = data.astype("datetime64[ms]") else: raise NotImplementedError(f"Date unit is not supported: {unit}") return data raise NotImplementedError(f"DateTime kind is not supported: {format_str}") data = parse_format_str(format_str, data) data = set_nulls(data, col, buffers["validity"]) return data, buffers def buffer_to_ndarray( buffer: ProtocolBuffer, dtype: Tuple[DTypeKind, int, str, str], offset: int = 0, length: Optional[int] = None, ) -> np.ndarray: """ Build a NumPy array from the passed buffer. Parameters ---------- buffer : ProtocolBuffer Buffer to build a NumPy array from. dtype : tuple Data type of the buffer conforming protocol dtypes format. offset : int, default: 0 Number of elements to offset from the start of the buffer. 
def bitmask_to_bool_ndarray(
    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
) -> np.ndarray:
    """
    Convert bit-mask to a boolean NumPy array.

    Parameters
    ----------
    bitmask : np.ndarray[uint8]
        NumPy array of uint8 dtype representing the bitmask.
    mask_length : int
        Number of elements in the mask to interpret.
    first_byte_offset : int, default: 0
        Number of elements to offset from the start of the first byte.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    IndexError
        If `bitmask` holds fewer than ``first_byte_offset + mask_length`` bits.

    Notes
    -----
    Bits are read starting from the least significant bit of every byte. Only the
    bytes that carry the requested bits are read: trailing bytes of `bitmask`
    (padding, or the rest of a bigger buffer) never affect the result.
    """
    if mask_length <= 0:
        # Nothing to decode, so a possibly empty buffer is never touched;
        # ``np.zeros`` still raises ``ValueError`` for a negative length.
        return np.zeros(mask_length, dtype=bool)

    bytes_to_skip, bit_offset = divmod(first_byte_offset, 8)
    # number of bytes that contain at least one of the requested bits
    bytes_needed = (bit_offset + mask_length + 7) // 8
    window = bitmask[bytes_to_skip : bytes_to_skip + bytes_needed]
    if len(window) < bytes_needed:
        raise IndexError(
            f"The bitmask is too short: {bytes_needed} byte(s) required, got {len(window)}."
        )

    # 'little' bit order makes bit 0 of every byte come first, which is the
    # order the per-bit ``val & (1 << j)`` checks used to follow
    bits = np.unpackbits(np.asarray(window, dtype=np.uint8), bitorder="little")
    return bits[bit_offset : bit_offset + mask_length].astype(bool)
""" null_kind, sentinel_val = col.describe_null null_pos = None if null_kind == ColumnNullType.USE_SENTINEL: null_pos = data == sentinel_val elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): valid_buff, valid_dtype = validity null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) if sentinel_val == 0: null_pos = _inverse_null_buf(null_pos, null_kind) elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): pass else: raise NotImplementedError(f"Null kind {null_kind} is not yet supported.") if null_pos is not None and np.any(null_pos): if not allow_modify_inplace: data = data.copy() try: data[null_pos] = None except TypeError: # TypeError happens if the `data` dtype appears to be non-nullable in numpy notation # (bool, int, uint), if such happens, cast the `data` to nullable float dtype. data = data.astype(float) data[null_pos] = None return data ================================================ FILE: modin/core/dataframe/pandas/metadata/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Utilities and classes to handle work with metadata.""" from .dtypes import ( DtypesDescriptor, LazyProxyCategoricalDtype, ModinDtypes, extract_dtype, ) from .index import ModinIndex __all__ = [ "ModinDtypes", "ModinIndex", "LazyProxyCategoricalDtype", "DtypesDescriptor", "extract_dtype", ] ================================================ FILE: modin/core/dataframe/pandas/metadata/dtypes.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module contains class ``ModinDtypes``.""" from __future__ import annotations from typing import TYPE_CHECKING, Callable, Optional, Union import pandas from pandas._typing import DtypeObj, IndexLabel from pandas.core.dtypes.cast import find_common_type if TYPE_CHECKING: from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from .index import ModinIndex from modin.error_message import ErrorMessage class DtypesDescriptor: """ Describes partial dtypes. Parameters ---------- known_dtypes : dict[IndexLabel, DtypeObj] or pandas.Series, optional Columns that we know dtypes for. cols_with_unknown_dtypes : list[IndexLabel], optional Column names that have unknown dtypes. 
If specified together with `remaining_dtype`, must describe all columns with unknown dtypes, otherwise, the missing columns will be assigned to `remaining_dtype`. If `cols_with_unknown_dtypes` is incomplete, you must specify `know_all_names=False`. remaining_dtype : DtypeObj, optional Dtype for columns that are not present neither in `known_dtypes` nor in `cols_with_unknown_dtypes`. This parameter is intended to describe columns that we known dtypes for, but don't know their names yet. Note, that this parameter DOESN'T describe dtypes for columns from `cols_with_unknown_dtypes`. parent_df : PandasDataframe, optional Dataframe object for which we describe dtypes. This dataframe will be used to compute missing dtypes on ``.materialize()``. columns_order : dict[int, IndexLabel], optional Order of columns in the dataframe. If specified, must describe all the columns of the dataframe. know_all_names : bool, default: True Whether `known_dtypes` and `cols_with_unknown_dtypes` contain all column names for this dataframe besides those, that are being described by `remaining_dtype`. One can't pass `know_all_names=False` together with `remaining_dtype` as this creates ambiguity on how to interpret missing columns (whether they belong to `remaining_dtype` or not). _schema_is_known : bool, optional Whether `known_dtypes` describe all columns in the dataframe. This parameter intended mostly for internal use. """ def __init__( self, known_dtypes: Optional[Union[dict[IndexLabel, DtypeObj], pandas.Series]] = None, cols_with_unknown_dtypes: Optional[list[IndexLabel]] = None, remaining_dtype: Optional[DtypeObj] = None, parent_df: Optional[PandasDataframe] = None, columns_order: Optional[dict[int, IndexLabel]] = None, know_all_names: bool = True, _schema_is_known: Optional[bool] = None, ): if not know_all_names and remaining_dtype is not None: raise ValueError( "It's not allowed to pass 'remaining_dtype' and 'know_all_names=False' at the same time." 
) # columns with known dtypes self._known_dtypes: dict[IndexLabel, DtypeObj] = ( {} if known_dtypes is None else dict(known_dtypes) ) if known_dtypes is not None and len(self._known_dtypes) != len(known_dtypes): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) # columns with unknown dtypes (they're not described by 'remaining_dtype') if cols_with_unknown_dtypes is not None and len( set(cols_with_unknown_dtypes) ) != len(cols_with_unknown_dtypes): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) self._cols_with_unknown_dtypes: list[IndexLabel] = ( [] if cols_with_unknown_dtypes is None else cols_with_unknown_dtypes ) # whether 'known_dtypes' describe all columns in the dataframe self._schema_is_known: Optional[bool] = _schema_is_known if self._schema_is_known is None: self._schema_is_known = False if ( # if 'cols_with_unknown_dtypes' was explicitly specified as an empty list and # we don't have any 'remaining_dtype', then we assume that 'known_dtypes' are complete cols_with_unknown_dtypes is not None and know_all_names and remaining_dtype is None and len(self._known_dtypes) > 0 ): self._schema_is_known = len(cols_with_unknown_dtypes) == 0 self._know_all_names: bool = know_all_names # a common dtype for columns that are not present in 'known_dtypes' nor in 'cols_with_unknown_dtypes' self._remaining_dtype: Optional[DtypeObj] = remaining_dtype self._parent_df: Optional[PandasDataframe] = parent_df if columns_order is None: self._columns_order: Optional[dict[int, IndexLabel]] = None # try to compute '._columns_order' using 'parent_df' self.columns_order else: if remaining_dtype is not None: raise ValueError( "Passing 'columns_order' and 'remaining_dtype' is ambiguous. You have to manually " + "complete 'known_dtypes' using the information from 'columns_order' and 'remaining_dtype'." 
) elif not self._know_all_names: raise ValueError( "Passing 'columns_order' and 'know_all_names=False' is ambiguous. You have to manually " + "complete 'cols_with_unknown_dtypes' using the information from 'columns_order' " + "and pass 'know_all_names=True'." ) elif len(columns_order) != ( len(self._cols_with_unknown_dtypes) + len(self._known_dtypes) ): raise ValueError( "The length of 'columns_order' doesn't match to 'known_dtypes' and 'cols_with_unknown_dtypes'" ) self._columns_order: Optional[dict[int, IndexLabel]] = columns_order def update_parent(self, new_parent: PandasDataframe): """ Set new parent dataframe. Parameters ---------- new_parent : PandasDataframe """ self._parent_df = new_parent LazyProxyCategoricalDtype.update_dtypes(self._known_dtypes, new_parent) # try to compute '._columns_order' using 'new_parent' self.columns_order @property def columns_order(self) -> Optional[dict[int, IndexLabel]]: """ Get order of columns for the described dataframe if available. Returns ------- dict[int, IndexLabel] or None """ if self._columns_order is not None: return self._columns_order if self._parent_df is None or not self._parent_df.has_materialized_columns: return None actual_columns = self._parent_df.columns self._normalize_self_levels(actual_columns) self._columns_order = {i: col for i, col in enumerate(actual_columns)} # we got information about new columns and thus can potentially # extend our knowledge about missing dtypes if len(self._columns_order) > ( len(self._known_dtypes) + len(self._cols_with_unknown_dtypes) ): new_cols = [ col for col in self._columns_order.values() if col not in self._known_dtypes and col not in self._cols_with_unknown_dtypes ] if self._remaining_dtype is not None: self._known_dtypes.update( {col: self._remaining_dtype for col in new_cols} ) self._remaining_dtype = None if len(self._cols_with_unknown_dtypes) == 0: self._schema_is_known = True else: self._cols_with_unknown_dtypes.extend(new_cols) self._know_all_names = True return 
self._columns_order def __repr__(self): # noqa: GL08 return ( f"DtypesDescriptor:\n\tknown dtypes: {self._known_dtypes};\n\t" + f"remaining dtype: {self._remaining_dtype};\n\t" + f"cols with unknown dtypes: {self._cols_with_unknown_dtypes};\n\t" + f"schema is known: {self._schema_is_known};\n\t" + f"has parent df: {self._parent_df is not None};\n\t" + f"columns order: {self._columns_order};\n\t" + f"know all names: {self._know_all_names}" ) def __str__(self): # noqa: GL08 return self.__repr__() def lazy_get( self, ids: list[Union[IndexLabel, int]], numeric_index: bool = False ) -> DtypesDescriptor: """ Get dtypes descriptor for a subset of columns without triggering any computations. Parameters ---------- ids : list of index labels or positional indexers Columns for the subset. numeric_index : bool, default: False Whether `ids` are positional indixes or column labels to take. Returns ------- DtypesDescriptor Descriptor that describes dtypes for columns specified in `ids`. """ if len(set(ids)) != len(ids): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) if numeric_index: if self.columns_order is not None: ids = [self.columns_order[i] for i in ids] else: raise ValueError( "Can't lazily get columns by positional indixers if the columns order is unknown" ) result = {} unknown_cols = [] columns_order = {} for i, col in enumerate(ids): columns_order[i] = col if col in self._cols_with_unknown_dtypes: unknown_cols.append(col) continue dtype = self._known_dtypes.get(col) if dtype is None and self._remaining_dtype is None: unknown_cols.append(col) elif dtype is None and self._remaining_dtype is not None: result[col] = self._remaining_dtype else: result[col] = dtype remaining_dtype = self._remaining_dtype if len(unknown_cols) != 0 else None return DtypesDescriptor( result, unknown_cols, remaining_dtype, self._parent_df, columns_order=columns_order, ) def copy(self) -> DtypesDescriptor: """ Get a copy of this descriptor. 
Returns ------- DtypesDescriptor """ return type(self)( # should access '.columns_order' first, as it may compute columns order # and complete the metadata for 'self' columns_order=( None if self.columns_order is None else self.columns_order.copy() ), known_dtypes=self._known_dtypes.copy(), cols_with_unknown_dtypes=self._cols_with_unknown_dtypes.copy(), remaining_dtype=self._remaining_dtype, parent_df=self._parent_df, know_all_names=self._know_all_names, _schema_is_known=self._schema_is_known, ) def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> DtypesDescriptor: """ Set new column names for this descriptor. Parameters ---------- new_index : pandas.Index or ModinIndex Returns ------- DtypesDescriptor New descriptor with updated column names. Notes ----- Calling this method on a descriptor that returns ``None`` for ``.columns_order`` will result into information lose. """ if len(new_index) != len(set(new_index)): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) if self.columns_order is None: # we can't map new columns to old columns and lost all dtypes :( return DtypesDescriptor( cols_with_unknown_dtypes=new_index, columns_order={i: col for i, col in enumerate(new_index)}, parent_df=self._parent_df, know_all_names=True, ) new_self = self.copy() renamer = {old_c: new_index[i] for i, old_c in new_self.columns_order.items()} new_self._known_dtypes = { renamer[old_col]: value for old_col, value in new_self._known_dtypes.items() } new_self._cols_with_unknown_dtypes = [ renamer[old_col] for old_col in new_self._cols_with_unknown_dtypes ] new_self._columns_order = { i: renamer[old_col] for i, old_col in new_self._columns_order.items() } return new_self def equals(self, other: DtypesDescriptor) -> bool: """ Compare two descriptors for equality. 
Parameters ---------- other : DtypesDescriptor Returns ------- bool """ return ( self._known_dtypes == other._known_dtypes and set(self._cols_with_unknown_dtypes) == set(other._cols_with_unknown_dtypes) and self._remaining_dtype == other._remaining_dtype and self._schema_is_known == other._schema_is_known and self.columns_order == other.columns_order and self._know_all_names == other._know_all_names ) @property def is_materialized(self) -> bool: """ Whether this descriptor contains information about all dtypes in the dataframe. Returns ------- bool """ return self._schema_is_known def _materialize_all_names(self): """Materialize missing column names.""" if self._know_all_names: return all_cols = self._parent_df.columns self._normalize_self_levels(all_cols) for col in all_cols: if ( col not in self._known_dtypes and col not in self._cols_with_unknown_dtypes ): self._cols_with_unknown_dtypes.append(col) self._know_all_names = True def _materialize_cols_with_unknown_dtypes(self): """Compute dtypes for cols specified in `._cols_with_unknown_dtypes`.""" if ( len(self._known_dtypes) == 0 and len(self._cols_with_unknown_dtypes) == 0 and not self._know_all_names ): # here we have to compute dtypes for all columns in the dataframe, # so avoiding columns materialization by setting 'subset=None' subset = None else: if not self._know_all_names: self._materialize_all_names() subset = self._cols_with_unknown_dtypes if subset is None or len(subset) > 0: self._known_dtypes.update(self._parent_df._compute_dtypes(subset)) self._know_all_names = True self._cols_with_unknown_dtypes = [] def materialize(self): """Complete information about dtypes.""" if self.is_materialized: return if self._parent_df is None: raise RuntimeError( "It's not allowed to call '.materialize()' before '._parent_df' is specified." 
    @classmethod
    def _merge_dtypes(
        cls, values: list[Union[DtypesDescriptor, pandas.Series, None]]
    ) -> DtypesDescriptor:
        """
        Union columns described by ``values`` and compute common dtypes for them.

        Parameters
        ----------
        values : list of DtypesDescriptors, pandas.Series or Nones

        Returns
        -------
        DtypesDescriptor
            Descriptor without a parent dataframe and without a remaining dtype.

        Raises
        ------
        NotImplementedError
            If `values` contain an object of an unsupported type.

        Notes
        -----
        If at least one of the `values` is ``None``, no dtype is computed at all:
        every label collected from the other values is reported as a column with
        an unknown dtype.
        """
        known_dtypes = {}
        cols_with_unknown_dtypes = []
        know_all_names = True
        dtypes_are_unknown = False

        # index - joined column names, columns - dtypes taken from 'values'
        #        0     1     2      3
        # col1   int   bool  float  int
        # col2   int   int   int    int
        # colN   bool  bool  bool   int
        dtypes_matrix = pandas.DataFrame()

        for i, val in enumerate(values):
            if isinstance(val, cls):
                know_all_names &= val._know_all_names
                dtypes = val._known_dtypes.copy()
                # the "unknown" string is a marker that 'combine_dtypes()' below looks for
                dtypes.update({col: "unknown" for col in val._cols_with_unknown_dtypes})
                if val._remaining_dtype is not None:
                    # we can't process remaining dtypes, so just discarding them
                    know_all_names = False

                # setting a custom name to the Series to prevent duplicated names
                # in the 'dtypes_matrix'
                series = pandas.Series(dtypes, name=i)
                dtypes_matrix = pandas.concat([dtypes_matrix, series], axis=1)
                if not (val._know_all_names and val._remaining_dtype is None):
                    dtypes_matrix.fillna(
                        value={
                            # If we encountered a 'NaN' while 'val' describes all the columns, then
                            # it means, that the missing columns for this instance will be filled with NaNs (floats),
                            # otherwise, it may indicate missing columns that this 'val' has no info about,
                            # meaning that we shouldn't try computing a new dtype for this column,
                            # so marking it as 'unknown'
                            i: "unknown",
                        },
                        inplace=True,
                    )
            elif isinstance(val, pandas.Series):
                # a Series is appended as is, under whatever name it carries
                dtypes_matrix = pandas.concat([dtypes_matrix, val], axis=1)
            elif val is None:
                # one of the 'dtypes' is None, meaning that we wouldn't be able to infer a valid result dtype,
                # however, we're continuing our loop so we would at least know the columns we're missing
                # dtypes for
                dtypes_are_unknown = True
                know_all_names = False
            else:
                raise NotImplementedError(type(val))

        if dtypes_are_unknown:
            return DtypesDescriptor(
                cols_with_unknown_dtypes=dtypes_matrix.index.tolist(),
                know_all_names=know_all_names,
            )

        def combine_dtypes(row):
            # a single "unknown" entry makes the dtype of the whole column unknown
            if (row == "unknown").any():
                return "unknown"
            # the remaining NaNs stand for columns that are absent in some of the 'values',
            # such columns are going to hold NaNs there, hence treating them as floats
            row = row.fillna(pandas.api.types.pandas_dtype("float"))
            return find_common_type(list(row.values))

        # one resulting dtype (or the "unknown" marker) per joined column label
        dtypes = dtypes_matrix.apply(combine_dtypes, axis=1)

        for col, dtype in dtypes.items():
            if dtype == "unknown":
                cols_with_unknown_dtypes.append(col)
            else:
                known_dtypes[col] = dtype

        return DtypesDescriptor(
            known_dtypes,
            cols_with_unknown_dtypes,
            remaining_dtype=None,
            know_all_names=know_all_names,
        )
Returns ------- DtypesDescriptor """ if axis == 1: return cls._merge_dtypes(values) known_dtypes = {} cols_with_unknown_dtypes = [] schema_is_known = True # some default value to not mix it with 'None' remaining_dtype = "default" know_all_names = True for val in values: if isinstance(val, cls): all_new_cols = ( list(val._known_dtypes.keys()) + val._cols_with_unknown_dtypes ) if any( col in known_dtypes or col in cols_with_unknown_dtypes for col in all_new_cols ): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) know_all_names &= val._know_all_names known_dtypes.update(val._known_dtypes) cols_with_unknown_dtypes.extend(val._cols_with_unknown_dtypes) if know_all_names: if ( remaining_dtype == "default" and val._remaining_dtype is not None ): remaining_dtype = val._remaining_dtype elif ( remaining_dtype != "default" and val._remaining_dtype is not None and remaining_dtype != val._remaining_dtype ): remaining_dtype = None know_all_names = False else: remaining_dtype = None schema_is_known &= val._schema_is_known elif isinstance(val, pandas.Series): if any( col in known_dtypes or col in cols_with_unknown_dtypes for col in val.index ): raise NotImplementedError( "Duplicated column names are not yet supported by DtypesDescriptor" ) known_dtypes.update(val) elif val is None: remaining_dtype = None schema_is_known = False know_all_names = False else: raise NotImplementedError(type(val)) return cls( known_dtypes, cols_with_unknown_dtypes, None if remaining_dtype == "default" else remaining_dtype, parent_df=None, _schema_is_known=schema_is_known, know_all_names=know_all_names, ) @staticmethod def _normalize_levels(columns, reference=None): """ Normalize levels of MultiIndex column names. 
The function fills missing levels with empty strings as pandas do: ''' >>> columns = ["a", ("l1", "l2"), ("l1a", "l2a", "l3a")] >>> _normalize_levels(columns) [("a", "", ""), ("l1", "l2", ""), ("l1a", "l2a", "l3a")] >>> # with a reference >>> idx = pandas.MultiIndex(...) >>> idx.nlevels 4 >>> _normalize_levels(columns, reference=idx) [("a", "", "", ""), ("l1", "l2", "", ""), ("l1a", "l2a", "l3a", "")] ''' Parameters ---------- columns : sequence Labels to normalize. If dictionary, will replace keys with normalized columns. reference : pandas.Index, optional An index to match the number of levels with. If reference is a MultiIndex, then the reference number of levels should not be greater than the maximum number of levels in `columns`. If not specified, the `columns` themselves become a `reference`. Returns ------- sequence Column values with normalized levels. dict[hashable, hashable] Mapping from old column names to new names, only contains column names that were changed. Raises ------ ValueError When the reference number of levels is greater than the maximum number of levels in `columns`. """ if reference is None: reference = columns if isinstance(reference, pandas.Index): max_nlevels = reference.nlevels else: max_nlevels = 1 for col in reference: if isinstance(col, tuple): max_nlevels = max(max_nlevels, len(col)) # if the reference is a regular flat index, then no actions are required (the result will be # a flat index containing tuples of different lengths, this behavior fully matches pandas). # Yes, this shortcut skips the 'if max_columns_nlevels > max_nlevels' below check on purpose. 
if max_nlevels == 1: return columns, {} max_columns_nlevels = 1 for col in columns: if isinstance(col, tuple): max_columns_nlevels = max(max_columns_nlevels, len(col)) if max_columns_nlevels > max_nlevels: raise ValueError( f"The reference number of levels is greater than the maximum number of levels in columns: {max_columns_nlevels} > {max_nlevels}" ) new_columns = [] old_to_new_mapping = {} for col in columns: old_col = col if not isinstance(col, tuple): col = (col,) col = col + ("",) * (max_nlevels - len(col)) new_columns.append(col) if old_col != col: old_to_new_mapping[old_col] = col return new_columns, old_to_new_mapping def _normalize_self_levels(self, reference=None): """ Call ``self._normalize_levels()`` for known and unknown dtypes of this object. Parameters ---------- reference : pandas.Index, optional """ _, old_to_new_mapping = self._normalize_levels( self._known_dtypes.keys(), reference ) for old_col, new_col in old_to_new_mapping.items(): value = self._known_dtypes.pop(old_col) self._known_dtypes[new_col] = value self._cols_with_unknown_dtypes, _ = self._normalize_levels( self._cols_with_unknown_dtypes, reference ) class ModinDtypes: """ A class that hides the various implementations of the dtypes needed for optimization. 
Parameters ---------- value : pandas.Series, callable, DtypesDescriptor or ModinDtypes, optional """ def __init__( self, value: Optional[Union[Callable, pandas.Series, DtypesDescriptor, ModinDtypes]], ): if callable(value) or isinstance(value, pandas.Series): self._value = value elif isinstance(value, DtypesDescriptor): self._value = value.to_series() if value.is_materialized else value elif isinstance(value, type(self)): self._value = value.copy()._value elif isinstance(value, None): self._value = DtypesDescriptor() else: raise ValueError(f"ModinDtypes doesn't work with '{value}'") def __repr__(self): # noqa: GL08 return f"ModinDtypes:\n\tvalue type: {type(self._value)};\n\tvalue:\n\t{self._value}" def __str__(self): # noqa: GL08 return self.__repr__() @property def is_materialized(self) -> bool: """ Check if the internal representation is materialized. Returns ------- bool """ return isinstance(self._value, pandas.Series) def get_dtypes_set(self) -> set[DtypeObj]: """ Get a set of dtypes from the descriptor. Returns ------- set[DtypeObj] """ if isinstance(self._value, DtypesDescriptor): return self._value.get_dtypes_set() if not self.is_materialized: self.get() return set(self._value.values) def maybe_specify_new_frame_ref(self, new_parent: PandasDataframe) -> ModinDtypes: """ Set a new parent for the stored value if needed. Parameters ---------- new_parent : PandasDataframe Returns ------- ModinDtypes A copy of ``ModinDtypes`` with updated parent. """ new_self = self.copy() if new_self.is_materialized: LazyProxyCategoricalDtype.update_dtypes(new_self._value, new_parent) return new_self if isinstance(self._value, DtypesDescriptor): new_self._value.update_parent(new_parent) return new_self return new_self def lazy_get(self, ids: list, numeric_index: bool = False) -> ModinDtypes: """ Get new ``ModinDtypes`` for a subset of columns without triggering any computations. Parameters ---------- ids : list of index labels or positional indexers Columns for the subset. 
numeric_index : bool, default: False Whether `ids` are positional indixes or column labels to take. Returns ------- ModinDtypes ``ModinDtypes`` that describes dtypes for columns specified in `ids`. """ if isinstance(self._value, DtypesDescriptor): res = self._value.lazy_get(ids, numeric_index) return ModinDtypes(res) elif callable(self._value): new_self = self.copy() old_value = new_self._value new_self._value = lambda: ( old_value().iloc[ids] if numeric_index else old_value()[ids] ) return new_self ErrorMessage.catch_bugs_and_request_email( failure_condition=not self.is_materialized ) return ModinDtypes(self._value.iloc[ids] if numeric_index else self._value[ids]) @classmethod def concat(cls, values: list, axis: int = 0) -> ModinDtypes: """ Concatenate dtypes. Parameters ---------- values : list of DtypesDescriptors, pandas.Series, ModinDtypes and Nones axis : int, default: 0 If ``axis == 0``: concatenate column names. This implements the logic of how dtypes are combined on ``pd.concat([df1, df2], axis=1)``. If ``axis == 1``: perform a union join for the column names described by `values` and then find common dtypes for the columns appeared to be in an intersection. This implements the logic of how dtypes are combined on ``pd.concat([df1, df2], axis=0).dtypes``. 
Returns ------- ModinDtypes """ preprocessed_vals = [] for val in values: if isinstance(val, cls): val = val._value if isinstance(val, (DtypesDescriptor, pandas.Series)) or val is None: preprocessed_vals.append(val) else: raise NotImplementedError(type(val)) try: desc = DtypesDescriptor.concat(preprocessed_vals, axis=axis) except NotImplementedError as e: # 'DtypesDescriptor' doesn't support duplicated labels, however, if all values are pandas Series, # we still can perform concatenation using pure pandas if ( # 'pd.concat(axis=1)' fails on duplicated labels anyway, so doing this logic # only in case 'axis=0' axis == 0 and "duplicated" not in e.args[0].lower() or not all(isinstance(val, pandas.Series) for val in values) ): raise e desc = pandas.concat(values) return ModinDtypes(desc) def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> ModinDtypes: """ Set new column names for stored dtypes. Parameters ---------- new_index : pandas.Index or ModinIndex Returns ------- ModinDtypes New ``ModinDtypes`` with updated column names. """ new_self = self.copy() if self.is_materialized: new_self._value.index = new_index return new_self elif callable(self._value): old_val = new_self._value new_self._value = lambda: old_val().set_axis(new_index) return new_self elif isinstance(new_self._value, DtypesDescriptor): new_self._value = new_self._value.set_index(new_index) return new_self else: raise NotImplementedError() def get(self) -> pandas.Series: """ Get the materialized internal representation. Returns ------- pandas.Series """ if not self.is_materialized: if callable(self._value): self._value = self._value() if self._value is None: self._value = pandas.Series([]) elif isinstance(self._value, DtypesDescriptor): self._value = self._value.to_series() else: raise NotImplementedError(type(self._value)) return self._value def __len__(self): """ Redirect the 'len' request to the internal representation. 
Returns ------- int Notes ----- Executing this function materializes the data. """ if not self.is_materialized: self.get() return len(self._value) def __reduce__(self): """ Serialize an object of this class. Returns ------- tuple Notes ----- The default implementation generates a recursion error. In a short: during the construction of the object, `__getattr__` function is called, which is not intended to be used in situations where the object is not initialized. """ return (self.__class__, (self._value,)) def __getattr__(self, name): """ Redirect access to non-existent attributes to the internal representation. This is necessary so that objects of this class in most cases mimic the behavior of the ``pandas.Series``. The main limitations of the current approach are type checking and the use of this object where pandas dtypes are supposed to be used. Parameters ---------- name : str Attribute name. Returns ------- object Attribute. Notes ----- Executing this function materializes the data. """ if not self.is_materialized: self.get() return self._value.__getattribute__(name) def copy(self) -> ModinDtypes: """ Copy an object without materializing the internal representation. Returns ------- ModinDtypes """ idx_cache = self._value if not callable(idx_cache): idx_cache = idx_cache.copy() return ModinDtypes(idx_cache) def __getitem__(self, key): # noqa: GL08 if not self.is_materialized: self.get() return self._value.__getitem__(key) def __setitem__(self, key, item): # noqa: GL08 if not self.is_materialized: self.get() self._value.__setitem__(key, item) def __iter__(self): # noqa: GL08 if not self.is_materialized: self.get() return iter(self._value) def __contains__(self, key): # noqa: GL08 if not self.is_materialized: self.get() return key in self._value class LazyProxyCategoricalDtype(pandas.CategoricalDtype): """ A lazy proxy representing ``pandas.CategoricalDtype``. 
class LazyProxyCategoricalDtype(pandas.CategoricalDtype):
    """
    A lazy proxy representing ``pandas.CategoricalDtype``.

    The categorical values are not stored at construction time: they are requested
    from `parent` through `materializer` on the first access to ``._categories``.

    Parameters
    ----------
    categories : list-like, optional
    ordered : bool, default: False

    Notes
    -----
    Important note! One shouldn't use the class' constructor to instantiate a proxy instance,
    it's intended only for compatibility purposes! In order to create a new proxy instance
    use the appropriate class method `._build_proxy(...)`.
    """

    def __init__(self, categories=None, ordered=False):
        # These will be initialized later inside of the `._build_proxy()` method
        self._parent, self._column_name, self._categories_val, self._materializer = (
            None,
            None,
            None,
            None,
        )
        # Explicit default, so `._get_dtype()` never depends on the base constructor
        # going through the `._categories` setter to create this attribute.
        self._dtype = None
        super().__init__(categories, ordered)

    @staticmethod
    def update_dtypes(dtypes, new_parent):
        """
        Update a parent for categorical proxies in a dtype object.

        Parameters
        ----------
        dtypes : dict-like
            A dict-like object describing dtypes. The method will walk through every dtype
            and update parents for categorical proxies inplace.
        new_parent : object
        """
        for key, value in dtypes.items():
            if isinstance(value, LazyProxyCategoricalDtype):
                # Only existing keys are reassigned, so the container's size does not
                # change while it is being iterated.
                dtypes[key] = value._update_proxy(new_parent, column_name=key)

    def _update_proxy(self, parent, column_name):
        """
        Create a new proxy, if either parent or column name are different.

        Parameters
        ----------
        parent : object
            Source object to extract categories on demand.
        column_name : str
            Column name of the categorical column in the source object.

        Returns
        -------
        pandas.CategoricalDtype or LazyProxyCategoricalDtype
            A plain ``pandas.CategoricalDtype`` if the categories were already
            materialized, `self` if nothing changed, a new proxy otherwise.
        """
        if self._is_materialized:
            # The parent has been materialized, we don't need a proxy anymore.
            return pandas.CategoricalDtype(self.categories, ordered=self._ordered)
        elif parent is self._parent and column_name == self._column_name:
            return self
        else:
            # Forward the already known categories dtype: dropping it would force
            # `._get_dtype()` of the new proxy to materialize the categories.
            return self._build_proxy(
                parent, column_name, self._materializer, dtype=self._dtype
            )

    @classmethod
    def _build_proxy(cls, parent, column_name, materializer, dtype=None):
        """
        Construct a lazy proxy.

        Parameters
        ----------
        parent : object
            Source object to extract categories on demand.
        column_name : str
            Column name of the categorical column in the source object.
        materializer : callable(parent, column_name) -> pandas.CategoricalDtype
            A function to call in order to extract categorical values.
        dtype : dtype, optional
            The categories dtype.

        Returns
        -------
        LazyProxyCategoricalDtype
        """
        result = cls()
        result._parent = parent
        result._column_name = column_name
        result._materializer = materializer
        result._dtype = dtype
        return result

    def _get_dtype(self):
        """
        Get the categories dtype.

        Returns
        -------
        dtype

        Notes
        -----
        If the dtype was not provided beforehand, it is taken from ``self.categories``
        and cached.
        """
        if self._dtype is None:
            self._dtype = self.categories.dtype
        return self._dtype

    def __reduce__(self):
        """
        Serialize an object of this class.

        Returns
        -------
        tuple

        Notes
        -----
        This object is serialized into a ``pandas.CategoricalDtype`` as an actual proxy can't be
        properly serialized because of the references it stores for its potentially distributed parent.
        """
        return (pandas.CategoricalDtype, (self.categories, self.ordered))

    @property
    def _categories(self):
        """
        Get materialized categorical values.

        Returns
        -------
        pandas.Index
        """
        if not self._is_materialized:
            self._materialize_categories()
        return self._categories_val

    @_categories.setter
    def _categories(self, categories):
        """
        Set new categorical values.

        Parameters
        ----------
        categories : list-like
        """
        self._categories_val = categories
        self._parent = None  # The parent is not required any more
        self._materializer = None
        # Reset the cached dtype; `._get_dtype()` recomputes it from the categories.
        self._dtype = None

    @property
    def _is_materialized(self) -> bool:
        """
        Check whether categorical values were already materialized.

        Returns
        -------
        bool
        """
        return self._categories_val is not None

    def _materialize_categories(self):
        """Materialize actual categorical values."""
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=self._parent is None,
            extra_log="attempted to materialize categories with parent being 'None'",
        )
        categoricals = self._materializer(self._parent, self._column_name)
        # The setter below also drops the parent/materializer references.
        self._categories = categoricals.categories
        self._ordered = categoricals.ordered


def get_categories_dtype(
    cdt: Union[LazyProxyCategoricalDtype, pandas.CategoricalDtype],
) -> DtypeObj:
    """
    Get the categories dtype.

    Parameters
    ----------
    cdt : LazyProxyCategoricalDtype or pandas.CategoricalDtype

    Returns
    -------
    dtype
    """
    return (
        cdt._get_dtype()
        if isinstance(cdt, LazyProxyCategoricalDtype)
        else cdt.categories.dtype
    )


def extract_dtype(value) -> DtypeObj | pandas.Series:
    """
    Extract dtype(s) from the passed `value`.

    The value is first interpreted as a dtype specification; if that fails with
    ``TypeError`` or ``ValueError``, it is wrapped into a ``pandas.Series`` and the
    dtype of that series is returned.

    Parameters
    ----------
    value : object

    Returns
    -------
    DtypeObj or pandas.Series of DtypeObj
    """
    try:
        dtype = pandas.api.types.pandas_dtype(value)
    except (TypeError, ValueError):
        dtype = pandas.Series(value).dtype

    return dtype
class ModinIndex:
    """
    A class that hides the various implementations of the index needed for optimization.

    Parameters
    ----------
    value : sequence, PandasDataframe or callable() -> (pandas.Index, list of ints), optional
        If a sequence passed this will be considered as the index values.
        If a ``PandasDataframe`` passed then it will be used to lazily extract indices
        when required, note that the `axis` parameter must be passed in this case.
        If a callable passed then it's expected to return a pandas Index and a list of
        partition lengths along the index axis.
        If ``None`` was passed, the index will be considered incomplete and will raise
        a ``RuntimeError`` on an attempt of materialization. To complete the index object
        you have to use ``.maybe_specify_new_frame_ref()`` method.
    axis : int, optional
        Specifies an axis the object represents, serves as an optional hint. This parameter
        must be passed in case value is a ``PandasDataframe`` or ``None``.
    dtypes : pandas.Series, optional
        Materialized dtypes of index levels.
    """

    def __init__(self, value=None, axis=None, dtypes: Optional[pandas.Series] = None):
        # importing here to avoid circular import issue
        from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe

        self._is_default_callable = False
        self._axis = axis
        self._dtypes = dtypes

        if callable(value):
            self._value = value
        elif isinstance(value, PandasDataframe):
            assert axis is not None
            self._value = self._get_default_callable(value, axis)
            self._is_default_callable = True
        elif value is None:
            assert axis is not None
            self._value = value
        else:
            self._value = ensure_index(value)

        self._lengths_cache = None
        # index/lengths ID's for faster comparison between other ModinIndex objects,
        # these should be propagated to the copies of the index
        self._index_id = uuid.uuid4()
        self._lengths_id = uuid.uuid4()

    def maybe_get_dtypes(self) -> Optional[pandas.Series]:
        """
        Get index dtypes if available.

        Returns
        -------
        pandas.Series or None
            The stored dtypes if present; otherwise dtypes derived from the index
            if it is materialized (the result is cached); ``None`` otherwise.
        """
        if self._dtypes is not None:
            return self._dtypes
        if self.is_materialized:
            self._dtypes = (
                self._value.dtypes
                if isinstance(self._value, pandas.MultiIndex)
                else pandas.Series([self._value.dtype], index=[self._value.name])
            )
            return self._dtypes
        return None

    @staticmethod
    def _get_default_callable(dataframe_obj, axis):
        """
        Build a callable extracting index labels and partitions lengths for the specified axis.

        Parameters
        ----------
        dataframe_obj : PandasDataframe
        axis : int
            0 - extract indices, 1 - extract columns.

        Returns
        -------
        callable() -> tuple(pandas.Index, list[ints])
        """
        return lambda: dataframe_obj._compute_axis_labels_and_lengths(axis)

    def maybe_specify_new_frame_ref(self, value, axis) -> "ModinIndex":
        """
        Set a new reference for a frame used to lazily extract index labels if it's needed.

        The method sets a new reference only if the indices are not yet materialized and
        if a PandasDataframe was originally passed to construct this index (so the ModinIndex
        object holds a reference to it). The reason the reference should be updated is that
        we don't want to hold in memory those frames that are already not needed. Once the
        reference is updated, the old frame will be garbage collected if there are no
        more references to it.

        Parameters
        ----------
        value : PandasDataframe
            New dataframe to reference.
        axis : int
            Axis to extract labels from.

        Returns
        -------
        ModinIndex
            New ModinIndex with the reference updated, or `self` if no update is needed.
        """
        # Nothing to update when the value is set and is either not a callable
        # or a callable that was not built from a frame reference.
        if self._value is not None and (
            not callable(self._value) or not self._is_default_callable
        ):
            return self

        new_index = self.copy(copy_lengths=True)
        new_index._axis = axis
        new_index._value = self._get_default_callable(value, new_index._axis)
        # if the '._value' was 'None' initially, then the '_is_default_callable' flag was
        # also being set to 'False', since now the '._value' is a default callable,
        # so we want to ensure that the flag is set to 'True'
        new_index._is_default_callable = True
        return new_index

    @property
    def is_materialized(self) -> bool:
        """
        Check if the internal representation is materialized.

        Returns
        -------
        bool
        """
        return self.is_materialized_index(self)

    @classmethod
    def is_materialized_index(cls, index) -> bool:
        """
        Check if the passed object represents a materialized index.

        Parameters
        ----------
        index : object
            An object to check.

        Returns
        -------
        bool
        """
        # importing here to avoid circular import issue
        from modin.pandas.indexing import is_range_like

        if isinstance(index, cls):
            index = index._value
        return is_list_like(index) or is_range_like(index) or isinstance(index, slice)

    def get(self, return_lengths=False) -> pandas.Index:
        """
        Get the materialized internal representation.

        Parameters
        ----------
        return_lengths : bool, default: False
            In some cases, during the index calculation, it's possible to get
            the lengths of the partitions. This flag allows this data to be used
            for optimization.

        Returns
        -------
        pandas.Index
            If `return_lengths` is True, a tuple of the index and the cached
            partition lengths (possibly ``None``) is returned instead.

        Raises
        ------
        RuntimeError
            If the index is incomplete (``._value`` is ``None``).
        NotImplementedError
            If the stored value is of an unsupported type.
        """
        if not self.is_materialized:
            if callable(self._value):
                index, self._lengths_cache = self._value()
                self._value = ensure_index(index)
            elif self._value is None:
                raise RuntimeError(
                    "It's not allowed to call '.materialize()' before '._value' is specified."
                )
            else:
                raise NotImplementedError(type(self._value))
        if return_lengths:
            return self._value, self._lengths_cache
        else:
            return self._value

    def equals(self, other: "ModinIndex") -> bool:
        """
        Check equality of the index values.

        Parameters
        ----------
        other : ModinIndex

        Returns
        -------
        bool
            The result of the comparison.

        Notes
        -----
        Unless both objects share the same index ID, both are materialized.
        """
        if self._index_id == other._index_id:
            return True

        if not self.is_materialized:
            self.get()

        if not other.is_materialized:
            other.get()

        return self._value.equals(other._value)

    def compare_partition_lengths_if_possible(self, other: "ModinIndex"):
        """
        Compare the partition lengths cache for the index being stored if possible.

        The ``ModinIndex`` object may sometimes store the information about partition
        lengths along the axis the index belongs to. If both `self` and `other` have
        this information or it can be inferred from them, the method returns
        a boolean - the result of the comparison, otherwise it returns ``None``
        as an indication that the comparison cannot be made.

        Parameters
        ----------
        other : ModinIndex

        Returns
        -------
        bool or None
            The result of the comparison if both `self` and `other` contain
            the lengths data, ``None`` otherwise.
        """
        if self._lengths_id == other._lengths_id:
            return True

        can_extract_lengths_from_self = self._lengths_cache is not None or callable(
            self._value
        )
        can_extract_lengths_from_other = other._lengths_cache is not None or callable(
            other._value
        )
        if can_extract_lengths_from_self and can_extract_lengths_from_other:
            return self.get(return_lengths=True)[1] == other.get(return_lengths=True)[1]
        return None

    def __len__(self):
        """
        Redirect the 'len' request to the internal representation.

        Returns
        -------
        int

        Notes
        -----
        Executing this function materializes the data.
        """
        if not self.is_materialized:
            self.get()
        return len(self._value)

    def __reduce__(self):
        """
        Serialize an object of this class.

        Returns
        -------
        tuple
            The class, the positional constructor arguments and a state dictionary
            with the remaining attributes.

        Notes
        -----
        The default implementation generates a recursion error. In short:
        during the construction of the object, `__getattr__` function is called, which
        is not intended to be used in situations where the object is not initialized.
        """
        return (
            self.__class__,
            (self._value, self._axis),
            {
                "_lengths_cache": self._lengths_cache,
                "_index_id": self._index_id,
                "_lengths_id": self._lengths_id,
                "_is_default_callable": self._is_default_callable,
                # Keep the dtypes hint as well (as `.copy()` does), otherwise it is
                # silently dropped for a not yet materialized index.
                "_dtypes": self._dtypes,
            },
        )

    def __getitem__(self, key):
        """
        Get an index value at the position of `key`.

        Parameters
        ----------
        key : int

        Returns
        -------
        label

        Notes
        -----
        Executing this function materializes the data.
        """
        if not self.is_materialized:
            self.get()
        return self._value[key]

    def __getattr__(self, name):
        """
        Redirect access to non-existent attributes to the internal representation.

        This is necessary so that objects of this class in most cases mimic the behavior
        of the ``pandas.Index``. The main limitations of the current approach are type
        checking and the use of this object where pandas indexes are supposed to be used.

        Parameters
        ----------
        name : str
            Attribute name.

        Returns
        -------
        object
            Attribute.

        Notes
        -----
        Executing this function materializes the data.
        """
        if not self.is_materialized:
            self.get()
        return self._value.__getattribute__(name)

    def copy(self, copy_lengths=False) -> "ModinIndex":
        """
        Copy an object without materializing the internal representation.

        Parameters
        ----------
        copy_lengths : bool, default: False
            Whether to copy the stored partition lengths to the new index object.

        Returns
        -------
        ModinIndex
        """
        idx_cache = self._value
        # `None` and callables are passed through as-is, concrete values are duplicated
        if idx_cache is not None and not callable(idx_cache):
            idx_cache = idx_cache.copy()
        result = ModinIndex(idx_cache, axis=self._axis, dtypes=self._dtypes)
        # the index ID is always shared with the copy, the lengths data only on request
        result._index_id = self._index_id
        result._is_default_callable = self._is_default_callable
        if copy_lengths:
            result._lengths_cache = self._lengths_cache
            result._lengths_id = self._lengths_id
        return result
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module defines base interface for an axis partition of a Modin DataFrame.""" import warnings import numpy as np import pandas from modin.config import MinColumnPartitionSize, MinRowPartitionSize from modin.core.dataframe.base.partitioning.axis_partition import ( BaseDataframeAxisPartition, ) from modin.core.storage_formats.pandas.utils import ( generate_result_of_axis_func_pandas, split_result_of_axis_func_pandas, ) from .partition import PandasDataframePartition class PandasDataframeAxisPartition(BaseDataframeAxisPartition): """ An abstract class is created to simplify and consolidate the code for axis partition that run pandas. Because much of the code is similar, this allows us to reuse this code. Parameters ---------- list_of_partitions : Union[list, PandasDataframePartition] List of ``PandasDataframePartition`` and ``PandasDataframeAxisPartition`` objects, or a single ``PandasDataframePartition``. get_ip : bool, default: False Whether to get node IP addresses to conforming partitions or not. full_axis : bool, default: True Whether or not the axis partition encompasses the whole axis. call_queue : list, optional A list of tuples (callable, args, kwargs) that contains deferred calls. length : the future's type or int, optional Length, or reference to length, of wrapped ``pandas.DataFrame``. width : the future's type or int, optional Width, or reference to width, of wrapped ``pandas.DataFrame``. 
""" def __init__( self, list_of_partitions, get_ip=False, full_axis=True, call_queue=None, length=None, width=None, ): if isinstance(list_of_partitions, PandasDataframePartition): list_of_partitions = [list_of_partitions] self.full_axis = full_axis self.call_queue = call_queue or [] self._length_cache = length self._width_cache = width # Check that all axis partition axes are the same in `list_of_partitions` # We should never have mismatching axis in the current implementation. We add this # defensive assertion to ensure that undefined behavior does not happen. assert ( len( set( obj.axis for obj in list_of_partitions if isinstance(obj, PandasDataframeAxisPartition) ) ) <= 1 ) self._list_of_constituent_partitions = list_of_partitions # Defer computing _list_of_block_partitions because we might need to # drain call queues for that. self._list_of_block_partitions = None @property def list_of_blocks(self): """ Get the list of physical partition objects that compose this partition. Returns ------- list A list of physical partition objects (``ray.ObjectRef``, ``distributed.Future`` e.g.). """ # Defer draining call queue (which is hidden in `partition.list_of_blocks` call) until we get the partitions. # TODO Look into draining call queue at the same time as the task return [ partition.list_of_blocks[0] for partition in self.list_of_block_partitions ] @property def list_of_block_partitions(self) -> list: """ Get the list of block partitions that compose this partition. Returns ------- List A list of ``PandasDataframePartition``. """ if self._list_of_block_partitions is not None: return self._list_of_block_partitions self._list_of_block_partitions = [] # Extract block partitions from the block and axis partitions that # constitute this partition. 
for partition in self._list_of_constituent_partitions: if isinstance(partition, PandasDataframeAxisPartition): if partition.axis == self.axis: # We are building an axis partition out of another # axis partition `partition` that contains its own list # of block partitions, partition.list_of_block_partitions. # `partition` may have its own call queue, which has to be # applied to the entire `partition` before we execute any # further operations on its block parittions. partition.drain_call_queue() self._list_of_block_partitions.extend( partition.list_of_block_partitions ) else: # If this axis partition is made of axis partitions # for the other axes, squeeze such partitions into a single # block so that this partition only holds a one-dimensional # list of blocks. We could change this implementation to # hold a 2-d list of blocks, but that would complicate the # code quite a bit. self._list_of_block_partitions.append( partition.force_materialization().list_of_block_partitions[0] ) else: self._list_of_block_partitions.append(partition) return self._list_of_block_partitions @classmethod def _get_drain_func(cls): # noqa: GL08 return PandasDataframeAxisPartition.drain def drain_call_queue(self, num_splits=None): """ Execute all operations stored in this partition's call queue. Parameters ---------- num_splits : int, default: None The number of times to split the result object. 
""" if len(self.call_queue) == 0: # this implicitly calls `drain_call_queue` for block partitions, # which might have deferred call queues _ = self.list_of_blocks return call_queue = self.call_queue try: # Clearing the queue before calling `.apply()` so it won't try to drain it repeatedly self.call_queue = [] drained = self.apply( self._get_drain_func(), num_splits=num_splits, call_queue=call_queue ) except Exception: # Restoring the call queue in case of an exception as it most likely wasn't drained self.call_queue = call_queue raise if not isinstance(drained, list): drained = [drained] self._list_of_block_partitions = drained def force_materialization(self, get_ip=False): """ Materialize partitions into a single partition. Parameters ---------- get_ip : bool, default: False Whether to get node ip address to a single partition or not. Returns ------- PandasDataframeAxisPartition An axis partition containing only a single materialized partition. """ materialized = super().force_materialization(get_ip=get_ip) self._list_of_block_partitions = materialized.list_of_block_partitions return materialized def apply( self, func, *args, num_splits=None, other_axis_partition=None, maintain_partitioning=True, lengths=None, manual_partition=False, **kwargs, ): """ Apply a function to this axis partition along full axis. Parameters ---------- func : callable The function to apply. *args : iterable Positional arguments to pass to `func`. num_splits : int, default: None The number of times to split the result object. other_axis_partition : PandasDataframeAxisPartition, default: None Another `PandasDataframeAxisPartition` object to be applied to func. This is for operations that are between two data sets. maintain_partitioning : bool, default: True Whether to keep the partitioning in the same orientation as it was previously or not. This is important because we may be operating on an individual AxisPartition and not touching the rest. 
In this case, we have to return the partitioning to its previous orientation (the lengths will remain the same). This is ignored between two axis partitions. lengths : iterable, default: None The list of lengths to shuffle the object. manual_partition : bool, default: False If True, partition the result with `lengths`. **kwargs : dict Additional keywords arguments to be passed in `func`. Returns ------- list A list of `PandasDataframePartition` objects. """ if not self.full_axis: # If this is not a full axis partition, it already contains a subset of # the full axis, so we shouldn't split the result further. num_splits = 1 if len(self.call_queue) > 0: self.drain_call_queue() if num_splits is None: num_splits = len(self.list_of_blocks) if other_axis_partition is not None: if not isinstance(other_axis_partition, list): other_axis_partition = [other_axis_partition] # (other_shape[i-1], other_shape[i]) will indicate slice # to restore i-1 axis partition other_shape = np.cumsum( [0] + [len(o.list_of_blocks) for o in other_axis_partition] ) return self._wrap_partitions( self.deploy_func_between_two_axis_partitions( self.axis, func, args, kwargs, num_splits, len(self.list_of_blocks), other_shape, *tuple( self.list_of_blocks + [ part for axis_partition in other_axis_partition for part in axis_partition.list_of_blocks ] ), min_block_size=( MinRowPartitionSize.get() if self.axis == 0 else MinColumnPartitionSize.get() ), ) ) result = self._wrap_partitions( self.deploy_axis_func( self.axis, func, args, kwargs, num_splits, maintain_partitioning, *self.list_of_blocks, min_block_size=( MinRowPartitionSize.get() if self.axis == 0 else MinColumnPartitionSize.get() ), lengths=lengths, manual_partition=manual_partition, ) ) if self.full_axis: return result else: # If this is not a full axis partition, just take out the single split in the result. 
return result[0] def split( self, split_func, num_splits, f_args=None, f_kwargs=None, extract_metadata=False ): """ Split axis partition into multiple partitions using the `split_func`. Parameters ---------- split_func : callable(pandas.DataFrame) -> list[pandas.DataFrame] A function that takes partition's content and split it into multiple chunks. num_splits : int The number of splits the `split_func` return. f_args : iterable, optional Positional arguments to pass to the `split_func`. f_kwargs : dict, optional Keyword arguments to pass to the `split_func`. extract_metadata : bool, default: False Whether to return metadata (length, width, ip) of the result. Passing `False` may relax the load on object storage as the remote function would return X times fewer futures (where X is the number of metadata values). Passing `False` makes sense for temporary results where you know for sure that the metadata will never be requested. Returns ------- list List of wrapped remote partition objects. """ f_args = tuple() if f_args is None else f_args f_kwargs = {} if f_kwargs is None else f_kwargs return self._wrap_partitions( self.deploy_splitting_func( self.axis, split_func, f_args, f_kwargs, num_splits, *self.list_of_blocks, extract_metadata=extract_metadata, ), extract_metadata=extract_metadata, ) @classmethod def deploy_splitting_func( cls, axis, split_func, f_args, f_kwargs, num_splits, *partitions, extract_metadata=False, ): """ Deploy a splitting function along a full axis. Parameters ---------- axis : {0, 1} The axis to perform the function along. split_func : callable(pandas.DataFrame) -> list[pandas.DataFrame] The function to perform. f_args : list or tuple Positional arguments to pass to `split_func`. f_kwargs : dict Keyword arguments to pass to `split_func`. num_splits : int The number of splits the `split_func` return. *partitions : iterable All partitions that make up the full axis (row or column). 
extract_metadata : bool, default: False Whether to return metadata (length, width, ip) of the result. Note that `True` value is not supported in `PandasDataframeAxisPartition` class. Returns ------- list A list of pandas DataFrames. """ dataframe = pandas.concat(list(partitions), axis=axis, copy=False) # to reduce peak memory consumption del partitions return split_func(dataframe, *f_args, **f_kwargs) @classmethod def deploy_axis_func( cls, axis, func, f_args, f_kwargs, num_splits, maintain_partitioning, *partitions, min_block_size, lengths=None, manual_partition=False, return_generator=False, ): """ Deploy a function along a full axis. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. num_splits : int The number of splits to return (see `split_result_of_axis_func_pandas`). maintain_partitioning : bool If True, keep the old partitioning if possible. If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). min_block_size : int Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False If True, partition the result with `lengths`. return_generator : bool, default: False Return a generator from the function, set to `True` for Ray backend as Ray remote functions can return Generators. Returns ------- list | Generator A list or generator of pandas DataFrames. 
""" len_partitions = len(partitions) lengths_partitions = [len(part) for part in partitions] widths_partitions = [len(part.columns) for part in partitions] dataframe = pandas.concat(list(partitions), axis=axis, copy=False) # to reduce peak memory consumption del partitions with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) try: result = func(dataframe, *f_args, **f_kwargs) except ValueError as err: if "assignment destination is read-only" in str(err): result = func(dataframe.copy(), *f_args, **f_kwargs) else: raise err # to reduce peak memory consumption del dataframe if num_splits == 1: # If we're not going to split the result, we don't need to specify # split lengths. lengths = None elif manual_partition: # The split function is expecting a list lengths = list(lengths) # We set lengths to None so we don't use the old lengths for the resulting partition # layout. This is done if the number of splits is changing or we are told not to # keep the old partitioning. elif num_splits != len_partitions or not maintain_partitioning: lengths = None else: if axis == 0: lengths = lengths_partitions if sum(lengths) != len(result): lengths = None else: lengths = widths_partitions if sum(lengths) != len(result.columns): lengths = None if return_generator: return generate_result_of_axis_func_pandas( axis, num_splits, result, min_block_size, lengths, ) else: return split_result_of_axis_func_pandas( axis, num_splits, result, min_block_size, lengths ) @classmethod def deploy_func_between_two_axis_partitions( cls, axis, func, f_args, f_kwargs, num_splits, len_of_left, other_shape, *partitions, min_block_size, return_generator=False, ): """ Deploy a function along a full axis between two data sets. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. 
@classmethod
def drain(cls, df: pandas.DataFrame, call_queue: list):
    """
    Run every call stored in `call_queue` against a pandas object, in order (helper function).

    Parameters
    ----------
    df : pandas.DataFrame
        Object handed to the first queued call.
    call_queue : list
        Entries of the form ``(func, args, kwargs)``. Each ``func`` is invoked as
        ``func(current, *args, **kwargs)`` and its return value is fed to the next entry.

    Returns
    -------
    pandas.DataFrame
        Whatever the last queued call returned, or `df` itself if the queue is empty.
    """
    current = df
    for queued_call in call_queue:
        operation, positional, keyword = queued_call
        current = operation(current, *positional, **keyword)
    return current
def add_to_apply_calls(self, func, *args, length=None, width=None, **kwargs):
    """
    Build a new axis partition whose call queue has one more entry.

    Parameters
    ----------
    func : callable or a future type
        Function to append to the call queue.
    *args : iterable
        Positional arguments stored alongside `func`.
    length : A future type or int, optional
        Length, or reference to it, of wrapped ``pandas.DataFrame``.
    width : A future type or int, optional
        Width, or reference to it, of wrapped ``pandas.DataFrame``.
    **kwargs : dict
        Keyword arguments stored alongside `func`.

    Returns
    -------
    PandasDataframeAxisPartition
        A new object of the same type as `self`; the queue of `self` is not modified.
    """
    blocks = self.list_of_block_partitions
    spans_full_axis = self.full_axis
    # list concatenation creates a fresh queue, leaving the current one untouched
    extended_queue = self.call_queue + [[func, args, kwargs]]
    partition_type = type(self)
    return partition_type(
        blocks,
        full_axis=spans_full_axis,
        call_queue=extended_queue,
        length=length,
        width=width,
    )
def get(self):
    """
    Materialize and return the object wrapped by this partition.

    Returns
    -------
    object
        The object that was wrapped by this partition.

    Notes
    -----
    This is the counterpart of the classmethod `put`: after
    `x = PandasDataframePartition.put(1)`, `x.get()` should always return 1.
    """
    logger = get_logger()
    if self._is_debug(logger):
        logger.debug(f"ENTER::Partition.get::{self._identity}")
    # run the queued calls first, then fetch the data through the execution wrapper
    self.drain_call_queue()
    materialized = self.execution_wrapper.materialize(self._data)
    if self._is_debug(logger):
        logger.debug(f"EXIT::Partition.get::{self._identity}")
    return materialized
def mask(self, row_labels, col_labels):
    """
    Lazily select the given rows and columns of the wrapped object.

    Parameters
    ----------
    row_labels : list-like, slice or label
        The row labels for the rows to extract.
    col_labels : list-like, slice or label
        The column labels for the columns to extract.

    Returns
    -------
    PandasDataframePartition
        A copy of this partition when both selectors grab as many elements as the
        cached axis sizes, otherwise a new partition with the selection queued.
    """

    def covers_whole_axis(selector, known_size):
        """Tell whether `selector` grabs `known_size` elements."""
        size_is_known = isinstance(known_size, int)
        if isinstance(selector, slice):
            if selector == slice(None):
                return True
            return (
                size_is_known
                and compute_sliced_len(selector, known_size) == known_size
            )
        if not hasattr(selector, "__len__"):
            return False
        return size_is_known and len(selector) == known_size

    def resized_cache(selector, old_size):
        """Derive the axis size after applying `selector`, or None if it can't be derived."""
        if isinstance(selector, slice):
            if isinstance(old_size, int):
                return compute_sliced_len(selector, old_size)
            return None
        return len(selector)

    # scalars are wrapped so that the selection below is always list-like or a slice
    if is_scalar(row_labels):
        row_labels = [row_labels]
    if is_scalar(col_labels):
        col_labels = [col_labels]

    rows_untouched = covers_whole_axis(row_labels, self._length_cache)
    if rows_untouched and covers_whole_axis(col_labels, self._width_cache):
        return copy(self)

    masked = self.add_to_apply_calls(self._iloc_func, row_labels, col_labels)
    masked._length_cache = resized_cache(row_labels, self._length_cache)
    masked._width_cache = resized_cache(col_labels, self._width_cache)
    return masked
def length(self, materialize=True):
    """
    Get the length of the object wrapped by this partition.

    Parameters
    ----------
    materialize : bool, default: True
        Whether to forcibly materialize the result into an integer. If ``False``
        was specified, may return a future of the result if it hasn't been
        materialized yet. This implementation does not read the flag: it always
        calls ``get`` on the applied result.

    Returns
    -------
    int or its Future
        The length of the object.
    """
    cached = self._length_cache
    if cached is None:
        # compute once through `apply` and remember the value for later calls
        extractor = self._length_extraction_fn()
        cached = self.apply(extractor).get()
        self._length_cache = cached
    return cached
@classmethod
def empty(cls):
    """
    Create a new partition that wraps an empty pandas DataFrame.

    Returns
    -------
    PandasDataframePartition
        New `PandasDataframePartition` object.
    """
    # `put` accepts only the object to store; passing extra positional
    # arguments (as was done before) raises ``TypeError``.
    return cls.put(pandas.DataFrame())
def wait_computations_if_benchmark_mode(func):
    """
    Make sure `func` has finished its computations when benchmark mode is enabled.

    Parameters
    ----------
    func : callable
        A function that should be performed in synchronous mode.

    Returns
    -------
    callable
        Wrapper that calls `func` and, if benchmark mode is on, waits for the
        returned partitions before handing the result back.

    Notes
    -----
    `func` should return NumPy array with partitions (optionally as the first
    element of a tuple).
    """

    @wraps(func)
    def wait(cls, *args, **kwargs):
        """Wait for computation results."""
        outcome = func(cls, *args, **kwargs)
        if not BenchmarkMode.get():
            return outcome

        partitions = outcome[0] if isinstance(outcome, tuple) else outcome
        # Calling partition.wait() one partition at a time would start each
        # deferred computation and block on it before starting the next one.
        # Kick off all the deferred computations first so they can run
        # asynchronously, and only then wait on the results.
        cls.finalize(partitions)
        # The partition manager invokes the relevant .wait() method under the
        # hood, which should wait in parallel for all computations to finish.
        cls.wait_partitions(partitions.flatten())
        return outcome

    return wait
if input_list is None: return None filtered_list = [] filtered_idx = [] for idx, item in enumerate(input_list): if cls._execution_wrapper.is_future(item): filtered_idx.append(idx) filtered_list.append(item) filtered_list = cls._execution_wrapper.materialize(filtered_list) result = input_list.copy() for idx, item in zip(filtered_idx, filtered_list): result[idx] = item return result @classmethod def preprocess_func(cls, map_func): """ Preprocess a function to be applied to `PandasDataframePartition` objects. Parameters ---------- map_func : callable The function to be preprocessed. Returns ------- callable The preprocessed version of the `map_func` provided. Notes ----- Preprocessing does not require any specific format, only that the `PandasDataframePartition.apply` method will recognize it (for the subclass being used). If your `PandasDataframePartition` objects assume that a function provided is serialized or wrapped or in some other format, this is the place to add that logic. It is possible that this can also just return `map_func` if the `apply` method of the `PandasDataframePartition` object you are using does not require any modification to a given function. """ if cls._execution_wrapper.is_future(map_func): return map_func # Has already been preprocessed old_value = PersistentPickle.get() # When performing a function with Modin objects, it is more profitable to # do the conversion to pandas once on the main process than several times # on worker processes. 
@classmethod
def create_partition_from_metadata(
    cls, dtypes: Optional[pandas.Series] = None, **metadata
):
    """
    Wrap a dataframe built from `metadata` into a 1x1 NumPy array of partitions.

    Parameters
    ----------
    dtypes : pandas.Series, optional
        Column dtypes. They are applied with `astype` after the pandas DataFrame
        is constructed, since pandas doesn't allow passing a list of dtypes
        directly in the constructor.
    **metadata : dict
        Keyword arguments forwarded to the ``pandas.DataFrame`` constructor.

    Returns
    -------
    np.ndarray
        A NumPy 2D array of a single partition which contains the data.
    """
    frame = pandas.DataFrame(**metadata)
    frame = frame if dtypes is None else frame.astype(dtypes)
    partition = cls._partition_class.put(frame)
    return np.array([[partition]])
@classmethod
def axis_partition(cls, partitions, axis, full_axis: bool = True):
    """
    Logically partition along given axis (columns or rows).

    Parameters
    ----------
    partitions : list-like
        List of partitions to be combined.
    axis : {0, 1}
        0 for column partitions, 1 for row partitions.
    full_axis : bool, default: True
        Whether or not this partition contains the entire column axis.
        Only honoured for column partitions (``axis == 0``).

    Returns
    -------
    list
        A list of `BaseDataframeAxisPartition` objects.

    Raises
    ------
    NotImplementedError
        If `full_axis` is False while row partitions are requested.
    """
    make_column_partitions = axis == 0
    if not full_axis and not make_column_partitions:
        raise NotImplementedError(
            (
                "Row partitions must contain the entire axis. We don't "
                + "support virtual partitioning for row partitions yet."
            )
        )
    if make_column_partitions:
        # forward `full_axis`; previously it was dropped here, so a request for
        # non-full-axis column partitions silently produced full-axis ones
        return cls.column_partitions(partitions, full_axis=full_axis)
    return cls.row_partitions(partitions)
@classmethod
@wait_computations_if_benchmark_mode
def broadcast_apply_select_indices(
    cls,
    axis,
    apply_func,
    left,
    right,
    left_indices,
    right_indices,
    keep_remaining=False,
):
    """
    Broadcast the `right` partitions to `left` and apply `apply_func` to selected indices.

    Parameters
    ----------
    axis : {0, 1}
        Axis to apply and broadcast over.
    apply_func : callable
        Function to apply.
    left : NumPy 2D array
        Left partitions.
    right : NumPy 2D array
        Right partitions.
    left_indices : list-like
        Indices to apply function to.
    right_indices : dictionary of indices of right partitions
        Indices that you want to bring at specified left partition, for example
        dict {key: {key1: [0, 1], key2: [5]}} means that in left[key] you want to
        broadcast [right[key1], right[key2]] partitions and internal indices
        for `right` must be [[0, 1], [5]].
    keep_remaining : bool, default: False
        Whether or not to keep the other partitions. Some operations may want to
        drop the remaining partitions and keep only the results.

    Returns
    -------
    NumPy array
        An array of partition objects.

    Notes
    -----
    Your internal function must take these kwargs:
    [`internal_indices`, `other`, `internal_other_indices`] to work correctly!
    """
    # for axis == 0 both grids are transposed so the loop below can walk them uniformly
    if axis:
        partitions_for_apply = left
    else:
        partitions_for_apply = left.T
        right = right.T

    # execute the queued calls of every `right` partition before broadcasting them
    for right_row in right:
        for right_part in right_row:
            right_part.drain_call_queue()

    def get_partitions(index):
        """Grab required partitions and indices from `right` and `right_indices`."""
        wanted = right_indices[index]
        return {
            "other": np.array([right[key] for key in wanted.keys()]),
            "internal_other_indices": list(wanted.values()),
        }

    collected = []
    for position in range(len(partitions_for_apply)):
        selected = position in left_indices
        if not (selected or keep_remaining):
            continue
        if selected:
            collected.append(
                cls._apply_func_to_list_of_partitions_broadcast(
                    apply_func,
                    partitions_for_apply[position],
                    internal_indices=left_indices[position],
                    **get_partitions(position),
                )
            )
        else:
            # untouched partitions are passed through as they are
            collected.append(partitions_for_apply[position])

    new_partitions = np.array(collected)
    return new_partitions if axis else new_partitions.T
@classmethod
@wait_computations_if_benchmark_mode
def broadcast_axis_partitions(
    cls,
    axis,
    apply_func,
    left,
    right,
    keep_partitioning=False,
    num_splits=None,
    apply_indices=None,
    broadcast_all=True,
    enumerate_partitions=False,
    lengths=None,
    apply_func_args=None,
    **kwargs,
):
    """
    Broadcast the `right` partitions to `left` and apply `apply_func` along full `axis`.

    Parameters
    ----------
    axis : {0, 1}
        Axis to apply and broadcast over.
    apply_func : callable
        Function to apply.
    left : NumPy 2D array
        Left partitions.
    right : NumPy 2D array
        Right partitions.
    keep_partitioning : boolean, default: False
        The flag to keep partition boundaries for Modin Frame if possible.
        Setting it to True disables shuffling data from one partition to another
        in case the resulting number of splits is equal to the initial number of splits.
    num_splits : int, optional
        The number of partitions to split the result into across the `axis`. If None,
        then the number of splits will be inferred automatically. If `num_splits` is
        None and `keep_partitioning=True` then the number of splits is preserved.
    apply_indices : list of ints, default: None
        Indices of `axis ^ 1` to apply function over.
    broadcast_all : bool, default: True
        Whether or not to pass all right axis partitions to each of the left axis partitions.
    enumerate_partitions : bool, default: False
        Whether or not to pass partition index into `apply_func`.
        Note that `apply_func` must be able to accept `partition_idx` kwarg.
    lengths : list of ints, default: None
        The list of lengths to shuffle the object. Note:
            1. Passing `lengths` omits the `num_splits` parameter as the number of splits
            will now be inferred from the number of integers present in `lengths`.
            2. When passing lengths you must explicitly specify `keep_partitioning=False`.
    apply_func_args : list-like, optional
        Positional arguments to pass to the `func`.
    **kwargs : dict
        Additional options that could be used by different engines.

    Returns
    -------
    NumPy array
        An array of partition objects.
    """
    ErrorMessage.catch_bugs_and_request_email(
        failure_condition=keep_partitioning and lengths is not None,
        extra_log=f"`keep_partitioning` must be set to `False` when passing `lengths`. Got: {keep_partitioning=} | {lengths=}",
    )

    # The result is split back up after the operation anyway, so the number
    # of splits is settled here as well as it can be known at this point.
    if num_splits is None and keep_partitioning:
        num_splits = len(left) if axis == 0 else len(left.T)
    elif lengths:
        num_splits = len(lengths)
    elif num_splits is None:
        num_splits = NPartitions.get()
    else:
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=not isinstance(num_splits, int),
            extra_log=f"Expected `num_splits` to be an integer, got: {type(num_splits)} | {num_splits=}",
        )

    preprocessed_map_func = cls.preprocess_func(apply_func)
    left_partitions = cls.axis_partition(left, axis)
    if right is None:
        right_partitions = None
    else:
        right_partitions = cls.axis_partition(right, axis)

    # When mapping across the entire axis the partitioning is not necessarily
    # maintained, because it may have to be lined up with another
    # BlockPartitions object. That also gives an opportunity to load-balance
    # the data.
    kw = {
        "num_splits": num_splits,
        "maintain_partitioning": keep_partitioning,
    }
    if lengths:
        kw.update(lengths=lengths, manual_partition=True)

    if apply_indices is None:
        apply_indices = np.arange(len(left_partitions))

    applied = []
    for idx, i in enumerate(apply_indices):
        left_part = left_partitions[i]
        other = right_partitions if broadcast_all else right_partitions[i]
        index_kwarg = {"partition_idx": idx} if enumerate_partitions else {}
        # the keyword groups stay separate `**` expansions so that a key
        # supplied twice still raises instead of being silently overridden
        applied.append(
            left_part.apply(
                preprocessed_map_func,
                *(apply_func_args if apply_func_args else []),
                other_axis_partition=other,
                **kw,
                **index_kwarg,
                **kwargs,
            )
        )
    result_blocks = np.array(applied)

    # Column-wise results come back laid out like rows, so the 2D array is
    # transposed for axis == 0 to restore the expected orientation.
    return result_blocks if axis else result_blocks.T
@classmethod
@wait_computations_if_benchmark_mode
def map_partitions(
    cls,
    partitions,
    map_func,
    func_args=None,
    func_kwargs=None,
):
    """
    Apply `map_func` to `partitions` using different approaches to achieve the best performance.

    Without dynamic partitioning the function is applied block by block. With
    dynamic partitioning blocks are combined along an axis first, so that the
    number of remote calls gets closer to the number of CPUs.

    Parameters
    ----------
    partitions : NumPy 2D array
        Partitions housing the data of Modin Frame.
    map_func : callable
        Function to apply.
    func_args : iterable, optional
        Positional arguments for the 'map_func'.
    func_kwargs : dict, optional
        Keyword arguments for the 'map_func'.

    Returns
    -------
    NumPy array
        An array of partitions
    """
    # An array with no column partitions has nothing to combine, and it would
    # make the `column_splits` computation below divide by zero, so it takes
    # the block-wise path as well (which yields an equally empty array).
    if not DynamicPartitioning.get() or partitions.shape[1] == 0:
        # block-wise map
        return cls.base_map_partitions(partitions, map_func, func_args, func_kwargs)

    # axis-wise map
    cpu_count = CpuCount.get()
    # we choose an axis for a combination of partitions
    # whose size is closer to the number of CPUs
    if abs(partitions.shape[0] - cpu_count) < abs(partitions.shape[1] - cpu_count):
        axis = 1
    else:
        axis = 0

    column_splits = cpu_count // partitions.shape[1]

    if axis == 0 and column_splits > 1:
        # splitting by parts of columnar partitions
        return cls.map_partitions_joined_by_column(
            partitions, column_splits, map_func, func_args, func_kwargs
        )

    # splitting by full axis partitions
    return cls.map_axis_partitions(
        axis,
        partitions,
        lambda df: map_func(
            df,
            *(func_args if func_args is not None else ()),
            **(func_kwargs if func_kwargs is not None else {}),
        ),
        keep_partitioning=True,
    )
@classmethod
def map_axis_partitions(
    cls,
    axis,
    partitions,
    map_func,
    keep_partitioning=False,
    num_splits=None,
    lengths=None,
    enumerate_partitions=False,
    **kwargs,
):
    """
    Apply `map_func` to every partition in `partitions` along given `axis`.

    Parameters
    ----------
    axis : {0, 1}
        Axis to perform the map across (0 - index, 1 - columns).
    partitions : NumPy 2D array
        Partitions of Modin Frame.
    map_func : callable
        Function to apply.
    keep_partitioning : boolean, default: False
        The flag to keep partition boundaries for Modin Frame if possible.
        Setting it to True disables shuffling data from one partition to another in case the resulting
        number of splits is equal to the initial number of splits.
    num_splits : int, optional
        The number of partitions to split the result into across the `axis`. If None, then the number
        of splits will be inferred automatically. If `num_splits` is None and `keep_partitioning=True`
        then the number of splits is preserved.
    lengths : list of ints, default: None
        The list of lengths to shuffle the object. Note:
            1. Passing `lengths` omits the `num_splits` parameter as the number of splits
            will now be inferred from the number of integers present in `lengths`.
            2. When passing lengths you must explicitly specify `keep_partitioning=False`.
    enumerate_partitions : bool, default: False
        Whether or not to pass partition index into `map_func`.
        Note that `map_func` must be able to accept `partition_idx` kwarg.
    **kwargs : dict
        Additional options that could be used by different engines.

    Returns
    -------
    NumPy array
        An array of new partitions for Modin Frame.

    Notes
    -----
    This method should be used in the case when `map_func` relies on
    some global information about the axis.
    """
    # A map is expressed as a broadcast that has no right operand.
    forwarded = {
        "axis": axis,
        "left": partitions,
        "apply_func": map_func,
        "keep_partitioning": keep_partitioning,
        "num_splits": num_splits,
        "right": None,
        "lengths": lengths,
        "enumerate_partitions": enumerate_partitions,
    }
    return cls.broadcast_axis_partitions(**forwarded, **kwargs)
@classmethod
def concat(cls, axis, left_parts, right_parts):
    """
    Concatenate the blocks of partitions with another set of blocks.

    Parameters
    ----------
    axis : int
        The axis to concatenate to.
    left_parts : np.ndarray
        NumPy array of partitions to concatenate with.
    right_parts : np.ndarray or list
        NumPy array of partitions to be concatenated.

    Returns
    -------
    np.ndarray
        A new NumPy array with concatenated partitions.
    list[int] or None
        Row lengths if possible to compute it.

    Notes
    -----
    Assumes that the blocks are already the same shape on the
    dimension being concatenated. A ValueError will be thrown if this
    condition is not met.
    """
    # TODO: Possible change is `isinstance(right_parts, list)`
    if type(right_parts) is list:
        # `np.array` with partitions of empty ModinFrame has a shape (0,)
        # but `np.concatenate` can concatenate arrays only if its shapes at
        # specified axis are equals, so empty frames are dropped up front.
        pieces = [arr for arr in right_parts if arr.size != 0]
        if left_parts.size != 0:
            pieces.insert(0, left_parts)
        joined = np.concatenate(pieces, axis=axis) if pieces else left_parts
    else:
        joined = np.append(left_parts, right_parts, axis=axis)

    if axis == 0:
        # Stacking along rows may produce too many row partitions.
        return cls.rebalance_partitions(joined)
    return joined, None
@classmethod
@wait_computations_if_benchmark_mode
def from_pandas(cls, df, return_dims=False):
    """
    Return the partitions from pandas.DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas.DataFrame.
    return_dims : bool, default: False
        If it's True, return as (np.ndarray, backend, row_lengths, col_widths),
        else (np.ndarray, backend).

    Returns
    -------
    (np.ndarray, backend) or (np.ndarray, backend, row_lengths, col_widths)
        A NumPy array with partitions (with dimensions or not).

    Raises
    ------
    ImportError
        If the progress bar is enabled but ``tqdm`` cannot be imported.
    """
    num_splits = NPartitions.get()
    min_row_block_size = MinRowPartitionSize.get()
    min_column_block_size = MinColumnPartitionSize.get()
    row_chunksize = compute_chunksize(df.shape[0], num_splits, min_row_block_size)
    col_chunksize = compute_chunksize(
        df.shape[1], num_splits, min_column_block_size
    )
    n_rows = len(df)
    n_cols = len(df.columns)

    # The progress-bar setting is read once: `pbar` being set is the only thing
    # that decides whether the bar gets updated and closed below.
    pbar = None
    if ProgressBar.get():
        bar_format = (
            "{l_bar}{bar}{r_bar}"
            if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True"
            else "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
        )
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                from tqdm.autonotebook import tqdm as tqdm_notebook
            except ImportError as err:
                raise ImportError(
                    "Please pip install tqdm to use the progress bar"
                ) from err
        # Count the chunks exactly the way `split_pandas_df_into_partitions`
        # iterates over them; rounding the ratio would under-count whenever the
        # last chunk is less than half full.
        row_parts = len(range(0, n_rows, row_chunksize))
        col_parts = (
            1
            if col_chunksize >= n_cols
            else len(range(0, n_cols, col_chunksize))
        )
        pbar = tqdm_notebook(
            total=max(1, row_parts * col_parts),
            desc="Distributing Dataframe",
            bar_format=bar_format,
        )

    def update_bar(f):
        """Advance the progress bar (when there is one) and pass `f` through."""
        if pbar is not None:
            pbar.update(1)
        return f

    try:
        parts = cls.split_pandas_df_into_partitions(
            df, row_chunksize, col_chunksize, update_bar
        )
    finally:
        # Close the bar even when splitting fails, so it is never left open.
        if pbar is not None:
            pbar.close()
    backend = get_pandas_backend(df.dtypes)

    if not return_dims:
        return parts, backend

    # Every chunk is full-sized except possibly the last one on each axis.
    row_lengths = [
        min(row_chunksize, n_rows - i) for i in range(0, n_rows, row_chunksize)
    ]
    col_widths = [
        min(col_chunksize, n_cols - i) for i in range(0, n_cols, col_chunksize)
    ]
    return parts, backend, row_lengths, col_widths
@classmethod
def get_objects_from_partitions(cls, partitions):
    """
    Get the objects wrapped by `partitions` (in parallel if supported).

    The passed container is never modified: partitions that need to be
    materialized first are collected into a local list.

    Parameters
    ----------
    partitions : np.ndarray
        NumPy array with ``PandasDataframePartition``-s.

    Returns
    -------
    list
        The objects wrapped by `partitions`.
    """
    if not hasattr(cls, "_execution_wrapper"):
        return [partition.get() for partition in partitions]

    # more efficient parallel implementation
    materialized = [
        part.force_materialization()
        if hasattr(part, "force_materialization")
        else part
        for part in partitions
    ]
    # Read `list_of_blocks` once per partition and reuse it for both the
    # sanity check and the materialization request.
    block_lists = [part.list_of_blocks for part in materialized]
    assert all(
        len(blocks) == 1 for blocks in block_lists
    ), "Implementation assumes that each partition contains a single block."
    return cls._execution_wrapper.materialize(
        [blocks[0] for blocks in block_lists]
    )
@classmethod
def _apply_func_to_list_of_partitions_broadcast(
    cls, func, partitions, other, **kwargs
):
    """
    Apply a function to a list of remote partitions.

    `other` partitions will be broadcasted to `partitions`
    and `func` will be applied.

    Parameters
    ----------
    func : callable
        The func to apply.
    partitions : np.ndarray
        The partitions to which the `func` will apply.
    other : np.ndarray
        The partitions to be broadcasted to `partitions`.
    **kwargs : dict
        Keyword arguments for PandasDataframePartition.apply function.

    Returns
    -------
    list
        A list of PandasDataframePartition objects.
    """
    remote_func = cls.preprocess_func(func)
    results = []
    # The i-th partition is paired with the i-th row of the transposed `other`.
    for target, paired in zip(partitions, other.T):
        fetched = [part.get() for part in paired]
        results.append(target.apply(remote_func, other=fetched, **kwargs))
    return results
@classmethod
def combine(cls, partitions, new_index=None, new_columns=None):
    """
    Convert a NumPy 2D array of partitions to a NumPy 2D array of a single partition.

    Parameters
    ----------
    partitions : np.ndarray
        The partitions which have to be converted to a single partition.
    new_index : pandas.Index, optional
        Index for propagation into internal partitions.
        Optimization allowing to do this in one remote kernel.
    new_columns : pandas.Index, optional
        Columns for propagation into internal partitions.
        Optimization allowing to do this in one remote kernel.

    Returns
    -------
    np.ndarray
        A NumPy 2D array of a single partition.
    """
    # Nothing to merge and no labels to propagate: hand the input back as is.
    if partitions.size <= 1 and new_index is None and new_columns is None:
        return partitions

    def to_pandas_remote(df, partition_shape, *dfs):
        """Copy of ``cls.to_pandas()`` method adapted for a remote function."""
        return create_pandas_df_from_partitions(
            (df, *dfs),
            partition_shape,
            called_from_remote=True,
            new_index=new_index,
            new_columns=new_columns,
        )

    remote_func = cls.preprocess_func(to_pandas_remote)
    original_shape = partitions.shape
    flat_parts = partitions.flatten()
    for position in range(len(flat_parts)):
        candidate = flat_parts[position]
        if hasattr(candidate, "force_materialization"):
            flat_parts[position] = candidate.force_materialization()

    # Every block except the first is passed as an extra argument of the call.
    trailing_refs = [part.list_of_blocks[0] for part in flat_parts[1:]]
    # NOTE(review): the call is issued on the first partition of the *input*
    # array, not on its (possibly materialized) copy in `flat_parts` - confirm
    # this is intended.
    merged = partitions.flat[0].apply(remote_func, original_shape, *trailing_refs)
    return np.array([merged]).reshape(1, -1)
""" if partitions.size == 0: return np.array([[]]) # Handling dictionaries has to be done differently, but we still want # to figure out the partitions that need to be applied to, so we will # store the dictionary in a separate variable and assign `indices` to # the keys to handle it the same as we normally would. if isinstance(func, dict): dict_func = func else: dict_func = None if not axis: partitions_for_apply = partitions.T else: partitions_for_apply = partitions # We may have a command to perform different functions on different # columns at the same time. We attempt to handle this as efficiently as # possible here. Functions that use this in the dictionary format must # accept a keyword argument `func_dict`. if dict_func is not None: if not keep_remaining: result = np.array( [ cls._apply_func_to_list_of_partitions( func, partitions_for_apply[o_idx], func_dict={ i_idx: dict_func[i_idx] for i_idx in list_to_apply if i_idx >= 0 }, ) for o_idx, list_to_apply in indices.items() ] ) else: result = np.array( [ ( partitions_for_apply[i] if i not in indices else cls._apply_func_to_list_of_partitions( func, partitions_for_apply[i], func_dict={ idx: dict_func[idx] for idx in indices[i] if idx >= 0 }, ) ) for i in range(len(partitions_for_apply)) ] ) else: if not keep_remaining: # We are passing internal indices in here. In order for func to # actually be able to use this information, it must be able to take in # the internal indices. This might mean an iloc in the case of Pandas # or some other way to index into the internal representation. result = np.array( [ cls._apply_func_to_list_of_partitions( func, partitions_for_apply[idx], internal_indices=list_to_apply, ) for idx, list_to_apply in indices.items() ] ) else: # The difference here is that we modify a subset and return the # remaining (non-updated) blocks in their original position. 
@classmethod
@wait_computations_if_benchmark_mode
def apply_func_to_select_indices_along_full_axis(
    cls, axis, partitions, func, indices, keep_remaining=False
):
    """
    Apply a function to a select subset of full columns/rows.

    Parameters
    ----------
    axis : {0, 1}
        The axis to apply the function over.
    partitions : np.ndarray
        The partitions to which the `func` will apply.
    func : callable
        The function to apply.
    indices : list-like
        The global indices to apply the func to.
        NOTE(review): the body iterates `indices` to get partition positions
        and subscripts it with those positions (``indices[i]``) to get the
        value passed as `internal_indices`, i.e. it is used like a mapping -
        confirm against callers.
    keep_remaining : bool, default: False
        Whether or not to keep the other partitions.
        Some operations may want to drop the remaining partitions and
        keep only the results.

    Returns
    -------
    np.ndarray
        A NumPy array with partitions.

    Notes
    -----
    This should be used when you need to apply a function that relies
    on some global information for the entire column/row, but only need
    to apply a function to a subset.
    For your func to operate directly on the indices provided,
    it must use `internal_indices` as a keyword argument.
    """
    # Empty input: return an empty 2D array without doing any work.
    if partitions.size == 0:
        return np.array([[]])
    # Handling dictionaries has to be done differently, but we still want
    # to figure out the partitions that need to be applied to, so we will
    # store the dictionary in a separate variable and assign `indices` to
    # the keys to handle it the same as we normally would.
    if isinstance(func, dict):
        dict_func = func
    else:
        dict_func = None
    # NOTE(review): `func` is preprocessed even when it is a dict - confirm
    # that the dict code path below is still exercised.
    preprocessed_func = cls.preprocess_func(func)
    # Since we might be keeping the remaining blocks that are not modified,
    # we have to also keep the block_partitions object in the correct
    # direction (transpose for columns).
    if not keep_remaining:
        # Keep only the lines of blocks named by `indices`; the transposes put
        # the selected axis first for the lookup and restore the layout after.
        selected_partitions = partitions.T if not axis else partitions
        selected_partitions = np.array([selected_partitions[i] for i in indices])
        selected_partitions = (
            selected_partitions.T if not axis else selected_partitions
        )
    else:
        selected_partitions = partitions
    if not axis:
        partitions_for_apply = cls.column_partitions(selected_partitions)
        partitions_for_remaining = partitions.T
    else:
        partitions_for_apply = cls.row_partitions(selected_partitions)
        partitions_for_remaining = partitions
    # We may have a command to perform different functions on different
    # columns at the same time. We attempt to handle this as efficiently as
    # possible here. Functions that use this in the dictionary format must
    # accept a keyword argument `func_dict`.
    if dict_func is not None:
        if not keep_remaining:
            result = np.array(
                [
                    part.apply(
                        preprocessed_func,
                        func_dict={idx: dict_func[idx] for idx in indices[i]},
                    )
                    for i, part in zip(indices, partitions_for_apply)
                ]
            )
        else:
            # NOTE(review): unlike the other three branches, this one goes
            # through `_apply_func_to_list_of_partitions` (which preprocesses
            # its function argument again and iterates over its second
            # argument) and bounds the loop by `len(partitions_for_apply)`
            # instead of `len(partitions_for_remaining)` - confirm intended.
            result = np.array(
                [
                    (
                        partitions_for_remaining[i]
                        if i not in indices
                        else cls._apply_func_to_list_of_partitions(
                            preprocessed_func,
                            partitions_for_apply[i],
                            func_dict={idx: dict_func[idx] for idx in indices[i]},
                        )
                    )
                    for i in range(len(partitions_for_apply))
                ]
            )
    else:
        if not keep_remaining:
            # See notes in `apply_func_to_select_indices`
            result = np.array(
                [
                    part.apply(preprocessed_func, internal_indices=indices[i])
                    for i, part in zip(indices, partitions_for_apply)
                ]
            )
        else:
            # See notes in `apply_func_to_select_indices`
            # Positions not present in `indices` keep their original blocks.
            result = np.array(
                [
                    (
                        partitions_for_remaining[i]
                        if i not in indices
                        else partitions_for_apply[i].apply(
                            preprocessed_func, internal_indices=indices[i]
                        )
                    )
                    for i in range(len(partitions_for_remaining))
                ]
            )
    # For axis 0 the result was built per column, so transpose it back.
    return result.T if not axis else result
along both axes. Parameters ---------- partitions : np.ndarray The partitions to which the `func` will apply. func : callable The function to apply. row_partitions_list : iterable of tuples Iterable of tuples, containing 2 values: 1. Integer row partition index. 2. Internal row indexer of this partition. col_partitions_list : iterable of tuples Iterable of tuples, containing 2 values: 1. Integer column partition index. 2. Internal column indexer of this partition. item_to_distribute : np.ndarray or scalar, default: no_default The item to split up so it can be applied over both axes. row_lengths : list of ints, optional Lengths of partitions for every row. If not specified this information is extracted from partitions itself. col_widths : list of ints, optional Widths of partitions for every column. If not specified this information is extracted from partitions itself. Returns ------- np.ndarray A NumPy array with partitions. Notes ----- For your func to operate directly on the indices provided, it must use `row_internal_indices`, `col_internal_indices` as keyword arguments. 
@classmethod
@wait_computations_if_benchmark_mode
def n_ary_operation(cls, left, func, right: list):
    r"""
    Apply an n-ary operation to multiple ``PandasDataframe`` objects.

    This method assumes that all the partitions of the dataframes in left
    and right have the same dimensions. For each position i, j in each
    dataframe's partitions, the result has a partition at (i, j) whose data
    is func(left_partitions[i,j], \*each_right_partitions[i,j]).

    Parameters
    ----------
    left : np.ndarray
        The partitions of left ``PandasDataframe``.
    func : callable
        The function to apply.
    right : list of np.ndarray
        The list of partitions of other ``PandasDataframe``.

    Returns
    -------
    np.ndarray
        A NumPy array with new partitions.
    """
    func = cls.preprocess_func(func)

    def get_right_block(right_partitions, row_idx, col_idx):
        """Return the single block behind the right partition at (row_idx, col_idx)."""
        partition = right_partitions[row_idx][col_idx]
        # NOTE:
        # Currently we do one remote call per right virtual partition to
        # materialize the partitions' blocks, then another remote call to do
        # the n_ary operation. we could get better performance if we
        # assembled the other partition within the remote `apply` call, by
        # passing the partition in as `other_axis_partition`. However,
        # passing `other_axis_partition` requires some extra care that would
        # complicate the code quite a bit:
        # - block partitions don't know how to deal with `other_axis_partition`
        # - the right axis partition's axis could be different from the axis
        #   of the corresponding left partition
        # - there can be multiple other_axis_partition because this is an n-ary
        #   operation and n can be > 2.
        # So for now just do the materialization in a separate remote step.
        if len(partition.list_of_blocks) > 1:
            # Keep the partition that `force_materialization` hands back, as
            # the other call sites in this class do; dropping it would leave
            # the multi-block partition in place for the check below.
            partition = partition.force_materialization()
        blocks = partition.list_of_blocks
        assert len(blocks) == 1
        return blocks[0]

    return np.array(
        [
            [
                part.apply(
                    func,
                    *(
                        get_right_block(right_partitions, row_idx, col_idx)
                        for right_partitions in right
                    ),
                )
                for col_idx, part in enumerate(left_row)
            ]
            for row_idx, left_row in enumerate(left)
        ]
    )
@classmethod
def rebalance_partitions(cls, partitions):
    """
    Rebalance a 2-d array of partitions if we are using ``PandasOnRay`` or ``PandasOnDask`` executions.

    For all other executions, the partitions are returned unchanged.
    NOTE(review): no engine check is visible in this method itself; it
    rebalances whenever the number of row partitions exceeds the threshold
    below - presumably other executions override it, confirm.

    Rebalance the partitions by building a new array
    of partitions out of the original ones so that:

    - If all partitions have a length, each new partition has roughly the same number of rows.
    - Otherwise, each new partition spans roughly the same number of old partitions.

    Parameters
    ----------
    partitions : np.ndarray
        The 2-d array of partitions to rebalance.

    Returns
    -------
    np.ndarray
        A NumPy array with the same; or new, rebalanced, partitions, depending on the execution
        engine and storage format.
    list[int] or None
        Row lengths if possible to compute it. ``None`` is returned when the input is
        handed back unchanged or when some partition length is not cached.
    """
    # We rebalance when the ratio of the number of existing partitions to
    # the ideal number of partitions is larger than this threshold. The
    # threshold is a heuristic that may need to be tuned for performance.
    max_excess_of_num_partitions = 1.5
    num_existing_partitions = partitions.shape[0]
    ideal_num_new_partitions = NPartitions.get()
    # Few enough row partitions already: nothing to do.
    if (
        num_existing_partitions
        <= ideal_num_new_partitions * max_excess_of_num_partitions
    ):
        return partitions, None
    # If any partition has an unknown length, give each axis partition
    # roughly the same number of row partitions. We use `_length_cache` here
    # to avoid materializing any unmaterialized lengths.
    if any(
        partition._length_cache is None for row in partitions for partition in row
    ):
        # We need each partition to go into an axis partition, but the
        # number of axis partitions may not evenly divide the number of
        # partitions.
        chunk_size = compute_chunksize(
            num_existing_partitions, ideal_num_new_partitions, min_block_size=1
        )
        # Group every `chunk_size` consecutive rows of blocks into column
        # partitions built with `full_axis=False`.
        new_partitions = np.array(
            [
                cls.column_partitions(
                    partitions[i : i + chunk_size],
                    full_axis=False,
                )
                for i in range(
                    0,
                    num_existing_partitions,
                    chunk_size,
                )
            ]
        )
        return new_partitions, None

    # If we know the number of rows in every partition, then we should try
    # instead to give each new partition roughly the same number of rows.
    new_partitions = []
    # `start` is the index of the first existing partition that we want to
    # put into the current new partition.
    start = 0
    # Row counts are read from the first column of blocks only.
    total_rows = sum(part.length() for part in partitions[:, 0])
    ideal_partition_size = compute_chunksize(
        total_rows, ideal_num_new_partitions, min_block_size=1
    )
    for _ in range(ideal_num_new_partitions):
        # We might pick up old partitions too quickly and exhaust all of them.
        if start >= len(partitions):
            break
        # `stop` is the index of the last existing partition so far that we
        # want to put into the current new partition.
        stop = start
        partition_size = partitions[start][0].length()
        # Add existing partitions into the current new partition until the
        # number of rows in the new partition hits `ideal_partition_size`.
        while stop < len(partitions) and partition_size < ideal_partition_size:
            stop += 1
            if stop < len(partitions):
                partition_size += partitions[stop][0].length()
        # If the new partition is larger than we want, split the last
        # current partition that it contains into two partitions, where
        # the first partition has just enough rows to make the current
        # new partition have length `ideal_partition_size`, and the second
        # partition has the remainder.
        if partition_size > ideal_partition_size * max_excess_of_num_partitions:
            prev_length = sum(row[0].length() for row in partitions[start:stop])
            new_last_partition_size = ideal_partition_size - prev_length
            # Insert the remainder as a new row of blocks right after `stop`.
            # `np.insert` returns a new array, so `partitions` is rebound and
            # the caller's array is not modified by the insertion.
            partitions = np.insert(
                partitions,
                stop + 1,
                [
                    obj.mask(slice(new_last_partition_size, None), slice(None))
                    for obj in partitions[stop]
                ],
                0,
            )
            # TODO: explicit `_length_cache` computing may be avoided after #4903 is merged
            for obj in partitions[stop + 1]:
                obj._length_cache = partition_size - (
                    prev_length + new_last_partition_size
                )
            # Shrink the row at `stop` to the head part that fits.
            partitions[stop, :] = [
                obj.mask(slice(None, new_last_partition_size), slice(None))
                for obj in partitions[stop]
            ]
            # TODO: explicit `_length_cache` computing may be avoided after #4903 is merged
            for obj in partitions[stop]:
                obj._length_cache = new_last_partition_size

        # The new virtual partitions are not `full_axis`, even if they
        # happen to span all rows in the dataframe, because they are
        # meant to be the final partitions of the dataframe. They've
        # already been split up correctly along axis 0, but using the
        # default full_axis=True would cause partition.apply() to split
        # its result along axis 0.
        new_partitions.append(
            cls.column_partitions(partitions[start : stop + 1], full_axis=False)
        )
        start = stop + 1
    new_partitions = np.array(new_partitions)
    # Lengths of the new row partitions, read from the first column.
    lengths = [part.length() for part in new_partitions[:, 0]]
    return new_partitions, lengths
final_shuffle_func : Callable(pandas.DataFrame) -> pandas.DataFrame Function that shuffles the data within each new partition. right_partitions : np.ndarray, optional Partitions to broadcast to `self` partitions. If specified, the method builds range-partitioning for `right_partitions` basing on bins calculated for `partitions`, then performs broadcasting. Returns ------- np.ndarray A list of row-partitions that have been shuffled. """ # Mask the partition that contains the column that will be sampled. masked_partitions = partitions[:, index] # Sample each partition sample_func = cls.preprocess_func(shuffle_functions.sample_fn) if masked_partitions.ndim == 1: samples = [partition.apply(sample_func) for partition in masked_partitions] else: samples = [ cls._row_partition_class(row_part, full_axis=False).apply(sample_func) for row_part in masked_partitions ] # Get each sample to pass in to the pivot function samples = cls.get_objects_from_partitions(samples) num_bins = shuffle_functions.pivot_fn(samples) # Convert our list of block partitions to row partitions. We need to create full-axis # row partitions since we need to send the whole partition to the split step as otherwise # we wouldn't know how to split the block partitions that don't contain the shuffling key. row_partitions = cls.row_partitions(partitions) if num_bins > 1: # Gather together all of the sub-partitions split_row_partitions = np.array( [ partition.split( shuffle_functions.split_fn, num_splits=num_bins, # The partition's metadata will never be accessed for the split partitions, # thus no need to compute it. extract_metadata=False, ) for partition in row_partitions ] ).T if right_partitions is None: # We need to convert every partition that came from the splits into a column partition. 
return np.array( [ [ cls._column_partitions_class( row_partition, full_axis=False ).apply(final_shuffle_func) ] for row_partition in split_row_partitions ] ) right_row_parts = cls.row_partitions(right_partitions) right_split_row_partitions = np.array( [ partition.split( shuffle_functions.split_fn, num_splits=num_bins, extract_metadata=False, ) for partition in right_row_parts ] ).T return np.array( [ cls._column_partitions_class(row_partition, full_axis=False).apply( final_shuffle_func, other_axis_partition=cls._column_partitions_class( right_row_partitions ), ) for right_row_partitions, row_partition in zip( right_split_row_partitions, split_row_partitions ) ] ) else: # If there are not pivots we can simply apply the function row-wise if right_partitions is None: return np.array( [row_part.apply(final_shuffle_func) for row_part in row_partitions] ) right_row_parts = cls.row_partitions(right_partitions) return np.array( [ row_part.apply( final_shuffle_func, other_axis_partition=right_row_part ) for right_row_part, row_part in zip(right_row_parts, row_partitions) ] ) ================================================ FILE: modin/core/dataframe/pandas/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Collection of utility functions for the PandasDataFrame.""" import pandas from pandas.api.types import union_categoricals from modin.error_message import ErrorMessage def concatenate(dfs, copy=True): """ Concatenate pandas DataFrames with saving 'category' dtype. All dataframes' columns must be equal to each other. Parameters ---------- dfs : list List of pandas DataFrames to concatenate. copy : bool, default: True Make explicit copy when creating dataframe. Returns ------- pandas.DataFrame A pandas DataFrame. """ for df in dfs: assert df.columns.equals(dfs[0].columns) for i in dfs[0].columns.get_indexer_for(dfs[0].select_dtypes("category").columns): columns = [df.iloc[:, i] for df in dfs] all_categorical_parts_are_empty = None has_non_categorical_parts = False for col in columns: if isinstance(col.dtype, pandas.CategoricalDtype): if all_categorical_parts_are_empty is None: all_categorical_parts_are_empty = len(col) == 0 continue all_categorical_parts_are_empty &= len(col) == 0 else: has_non_categorical_parts = True # 'union_categoricals' raises an error if some of the passed values don't have categorical dtype, # if it happens, we only want to continue when all parts with categorical dtypes are actually empty. 
# This can happen if there were an aggregation that discards categorical dtypes and that aggregation # doesn't properly do so for empty partitions if has_non_categorical_parts and all_categorical_parts_are_empty: continue union = union_categoricals(columns) for df in dfs: df.isetitem( i, pandas.Categorical(df.iloc[:, i], categories=union.categories) ) # `ValueError: buffer source array is read-only` if copy==False if len(dfs) == 1 and copy: # concat doesn't make a copy if len(dfs) == 1, # so do it explicitly return dfs[0].copy() return pandas.concat(dfs, copy=copy) def create_pandas_df_from_partitions( partition_data, partition_shape, called_from_remote=False, new_index=None, new_columns=None, ): """ Convert partition data of multiple dataframes to a single dataframe. Parameters ---------- partition_data : list List of pandas DataFrames or list of Object references holding pandas DataFrames. partition_shape : int or tuple Shape of the partitions NumPy array. called_from_remote : bool, default: False Flag used to check if explicit copy should be done in concat. new_index : pandas.Index, optional Index for propagation into internal partitions. Optimization allowing to do this in one remote kernel. new_columns : pandas.Index, optional Columns for propagation into internal partitions. Optimization allowing to do this in one remote kernel. Returns ------- pandas.DataFrame A pandas DataFrame. """ if all( isinstance(obj, (pandas.DataFrame, pandas.Series)) for obj in partition_data ): height, width, *_ = tuple(partition_shape) + (0,) # restore 2d array objs = iter(partition_data) partition_data = [[next(objs) for _ in range(width)] for __ in range(height)] else: # Partitions do not always contain pandas objects. # This implementation comes from the fact that calling `partition.get` # function is not always equivalent to `partition.to_pandas`. 
partition_data = [[obj.to_pandas() for obj in part] for part in partition_data] if all(isinstance(part, pandas.Series) for row in partition_data for part in row): axis = 0 elif all( isinstance(part, pandas.DataFrame) for row in partition_data for part in row ): axis = 1 else: ErrorMessage.catch_bugs_and_request_email(True) def is_part_empty(part): return part.empty and ( not isinstance(part, pandas.DataFrame) or (len(part.columns) == 0) ) df_rows = [ pandas.concat([part for part in row], axis=axis, copy=False) for row in partition_data if not all(is_part_empty(part) for part in row) ] # to reduce peak memory consumption del partition_data if len(df_rows) == 0: res = pandas.DataFrame() else: res = concatenate(df_rows, copy=not called_from_remote) if new_index is not None: res.index = new_index if new_columns is not None: res.columns = new_columns return res ================================================ FILE: modin/core/execution/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Modin's functionality related to execution engines supported.""" ================================================ FILE: modin/core/execution/dask/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Dask execution engine.""" ================================================ FILE: modin/core/execution/dask/common/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Common utilities for Dask execution engine.""" from .engine_wrapper import DaskWrapper from .utils import initialize_dask __all__ = [ "initialize_dask", "DaskWrapper", ] ================================================ FILE: modin/core/execution/dask/common/engine_wrapper.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses class responsible for execution of remote operations.""" from collections import UserDict import pandas from dask.distributed import wait from distributed import Future from distributed.client import default_client from distributed.worker import get_worker def get_dask_client(): """ Get the Dask client, reusing the worker's client if execution is on a Dask worker. Returns ------- distributed.Client The Dask client. """ try: client = default_client() except ValueError: # We ought to be in a worker process worker = get_worker() client = worker.client return client def _deploy_dask_func(func, *args, return_pandas_df=None, **kwargs): # pragma: no cover """ Wrap `func` to ease calling it remotely. Parameters ---------- func : callable A local function that we want to call remotely. *args : iterable Positional arguments to pass to `func` when calling remotely. 
return_pandas_df : bool, optional Whether to convert the result of `func` to a pandas DataFrame or not. **kwargs : dict Keyword arguments to pass to `func` when calling remotely. Returns ------- distributed.Future or list Dask identifier of the result being put into distributed memory. """ result = func(*args, **kwargs) if return_pandas_df and not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(result) return result class DaskWrapper: """The class responsible for execution of remote operations.""" @classmethod def deploy( cls, func, f_args=None, f_kwargs=None, return_pandas_df=None, num_returns=1, pure=True, ): """ Deploy a function in a worker process. Parameters ---------- func : callable or distributed.Future Function to be deployed in a worker process. f_args : list or tuple, optional Positional arguments to pass to ``func``. f_kwargs : dict, optional Keyword arguments to pass to ``func``. return_pandas_df : bool, optional Whether to convert the result of `func` to a pandas DataFrame or not. num_returns : int, default: 1 The number of returned objects. pure : bool, default: True Whether or not `func` is pure. See `Client.submit` for details. Returns ------- list The result of ``func`` split into parts in accordance with ``num_returns``. """ client = get_dask_client() args = [] if f_args is None else f_args kwargs = {} if f_kwargs is None else f_kwargs if callable(func): remote_task_future = client.submit(func, *args, pure=pure, **kwargs) else: # for the case where type(func) is distributed.Future remote_task_future = client.submit( _deploy_dask_func, func, *args, pure=pure, return_pandas_df=return_pandas_df, **kwargs, ) if num_returns != 1: return [ client.submit(lambda tup, i: tup[i], remote_task_future, i) for i in range(num_returns) ] return remote_task_future @classmethod def is_future(cls, item): """ Check if the item is a Future. Parameters ---------- item : distributed.Future or object Future or object to check. 
Returns ------- boolean If the value is a future. """ return isinstance(item, Future) @classmethod def materialize(cls, future): """ Materialize data matching `future` object. Parameters ---------- future : distributed.Future or list Future object of list of future objects whereby data needs to be materialized. Returns ------- Any An object(s) from the distributed memory. """ client = get_dask_client() return client.gather(future) @classmethod def put(cls, data, **kwargs): """ Put data into distributed memory. Parameters ---------- data : list, dict, or object Data to scatter out to workers. Output type matches input type. **kwargs : dict Additional keyword arguments to be passed in `Client.scatter`. Returns ------- List, dict, iterator, or queue of futures matching the type of input. """ if isinstance(data, dict): # there is a bug that looks similar to https://github.com/dask/distributed/issues/3965; # to avoid this we could change behaviour for serialization: # # vs # {'sep': , \ # 'delimiter': ... data = UserDict(data) client = get_dask_client() return client.scatter(data, **kwargs) @classmethod def wait(cls, obj_ids, num_returns=None): """ Wait on the objects without materializing them (blocking operation). Parameters ---------- obj_ids : list, scalar num_returns : int, optional """ if not isinstance(obj_ids, list): obj_ids = [obj_ids] if num_returns is None: num_returns = len(obj_ids) if num_returns == len(obj_ids): wait(obj_ids, return_when="ALL_COMPLETED") else: # Dask doesn't natively support `num_returns` as int. 
# `wait` function doesn't always return only one finished future, # so a simple loop is not enough here done, not_done = wait(obj_ids, return_when="FIRST_COMPLETED") while len(done) < num_returns and (i := 0 < num_returns): extra_done, not_done = wait(not_done, return_when="FIRST_COMPLETED") done.update(extra_done) i += 1 ================================================ FILE: modin/core/execution/dask/common/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses utility function to initialize Dask environment.""" import os from modin.config import ( CIAWSAccessKeyID, CIAWSSecretAccessKey, CpuCount, DaskThreadsPerWorker, GithubCI, Memory, NPartitions, ) from modin.core.execution.utils import set_env def initialize_dask(): """Initialize Dask environment.""" from distributed.client import default_client from distributed.worker import get_worker try: # Check if running within a Dask worker process get_worker() # If the above line does not raise an error, we are in a worker process # and should not create a new client return except ValueError: # Not in a Dask worker, proceed to check for or create a client pass try: client = default_client() def _disable_warnings(): import warnings warnings.simplefilter("ignore", category=FutureWarning) client.run(_disable_warnings) except ValueError: from distributed import Client num_cpus = CpuCount.get() threads_per_worker = DaskThreadsPerWorker.get() memory_limit = Memory.get() worker_memory_limit = memory_limit // num_cpus if memory_limit else "auto" # when the client is initialized, environment variables are inherited with set_env(PYTHONWARNINGS="ignore::FutureWarning"): client = Client( n_workers=num_cpus, threads_per_worker=threads_per_worker, memory_limit=worker_memory_limit, ) if GithubCI.get(): # set these keys to run tests that write to the mock s3 service. 
this seems # to be the way to pass environment variables to the workers: # https://jacobtomlinson.dev/posts/2021/bio-for-2021/ access_key = CIAWSAccessKeyID.get() aws_secret = CIAWSSecretAccessKey.get() client.run( lambda: os.environ.update( { "AWS_ACCESS_KEY_ID": access_key, "AWS_SECRET_ACCESS_KEY": aws_secret, } ) ) num_cpus = len(client.ncores()) NPartitions._put(num_cpus) CpuCount._put(num_cpus) ================================================ FILE: modin/core/execution/dask/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Dask execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Dask execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe class optimized for pandas on Dask execution.""" from .dataframe import PandasOnDaskDataframe __all__ = ["PandasOnDaskDataframe"] ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
class PandasOnDaskDataframe(PandasDataframe):
    """
    ``PandasDataframe`` flavour whose partitions are managed through Dask.

    Parameters
    ----------
    partitions : np.ndarray
        A 2D NumPy array of partitions.
    index : sequence
        The index for the dataframe. Converted to a pandas.Index.
    columns : sequence
        The columns object for the dataframe. Converted to a pandas.Index.
    row_lengths : list, optional
        The length of each partition in the rows. The "height" of
        each of the block partitions. Is computed if not provided.
    column_widths : list, optional
        The width of each partition in the columns. The "width" of
        each of the block partitions. Is computed if not provided.
    dtypes : pandas.Series, optional
        The data types for the dataframe columns.
    pandas_backend : {"pyarrow", None}, optional
        Backend used by pandas. None - means default NumPy backend.
    """

    _partition_mgr_cls = PandasOnDaskDataframePartitionManager

    @classmethod
    def reconnect(cls, address, attributes):  # noqa: GL08
        # Counterpart of `__reduce__`: rebuilds the frame when it is unpickled.
        # The main goal is to make sure the current process has a default client,
        # connecting to the scheduler `address` recorded at pickling time if not.
        from distributed import default_client

        try:
            default_client()
        except ValueError:
            from distributed import Client

            # setup `default_client` for worker process
            _ = Client(address)
        # Skip `__init__` and restore the saved instance state verbatim.
        restored = cls.__new__(cls)
        vars(restored).update(attributes)
        return restored

    def __reduce__(self):  # noqa: GL08
        # Ship the scheduler address together with the instance state so that
        # `reconnect` can set up a client on the receiving side.
        from distributed import default_client

        scheduler_address = default_client().scheduler_info()["address"]
        return (self.reconnect, (scheduler_address, self.__dict__))

    @property
    @_inherit_docstrings(PandasDataframe.engine)
    def engine(self) -> str:
        return "Dask"
"""Base IO classes optimized for pandas on Dask execution.""" from .io import PandasOnDaskIO __all__ = [ "PandasOnDaskIO", ] ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/io/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class PandasOnDaskIO(BaseIO):
    """Dask-backed implementation of the ``BaseIO`` interface."""

    frame_cls = PandasOnDaskDataframe
    frame_partition_cls = PandasOnDaskDataframePartition
    query_compiler_cls = PandasQueryCompiler
    # Namespace handed to every reader/writer class that is assembled below.
    build_args = {
        "frame_cls": PandasOnDaskDataframe,
        "frame_partition_cls": PandasOnDaskDataframePartition,
        "query_compiler_cls": PandasQueryCompiler,
        "base_io": BaseIO,
    }

    def __make_read(*classes, build_args=build_args):
        # Assemble an anonymous class out of the Dask wrapper and the given
        # parser/dispatcher classes, then expose only its ``read`` attribute.
        bases = (DaskWrapper,) + classes
        return type("", bases, build_args).read

    def __make_write(*classes, build_args=build_args):
        # Same composition as ``__make_read`` but exposing ``write``.
        bases = (DaskWrapper,) + classes
        return type("", bases, build_args).write

    read_csv = __make_read(PandasCSVParser, CSVDispatcher)
    read_fwf = __make_read(PandasFWFParser, FWFDispatcher)
    read_json = __make_read(PandasJSONParser, JSONDispatcher)
    read_parquet = __make_read(PandasParquetParser, ParquetDispatcher)
    to_parquet = __make_write(ParquetDispatcher)
    # Blocked on pandas-dev/pandas#12236. It is faster to default to pandas.
    # read_hdf = __make_read(PandasHDFParser, HDFReader)
    read_feather = __make_read(PandasFeatherParser, FeatherDispatcher)
    read_sql = __make_read(PandasSQLParser, SQLDispatcher)
    to_sql = __make_write(SQLDispatcher)
    read_excel = __make_read(PandasExcelParser, ExcelDispatcher)

    # experimental methods that don't exist in pandas
    read_csv_glob = __make_read(
        ExperimentalPandasCSVGlobParser, ExperimentalCSVGlobDispatcher
    )
    read_parquet_glob = __make_read(
        ExperimentalPandasParquetParser, ExperimentalGlobDispatcher
    )
    to_parquet_glob = __make_write(
        ExperimentalGlobDispatcher,
        build_args=dict(build_args, base_write=BaseIO.to_parquet),
    )
    read_json_glob = __make_read(
        ExperimentalPandasJsonParser, ExperimentalGlobDispatcher
    )
    to_json_glob = __make_write(
        ExperimentalGlobDispatcher,
        build_args=dict(build_args, base_write=BaseIO.to_json),
    )
    read_xml_glob = __make_read(ExperimentalPandasXmlParser, ExperimentalGlobDispatcher)
    to_xml_glob = __make_write(
        ExperimentalGlobDispatcher,
        build_args=dict(build_args, base_write=BaseIO.to_xml),
    )
    read_pickle_glob = __make_read(
        ExperimentalPandasPickleParser, ExperimentalGlobDispatcher
    )
    to_pickle_glob = __make_write(
        ExperimentalGlobDispatcher,
        build_args=dict(build_args, base_write=BaseIO.to_pickle),
    )
    read_custom_text = __make_read(
        ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher
    )
    read_sql_distributed = __make_read(
        ExperimentalSQLDispatcher, build_args=dict(build_args, base_read=read_sql)
    )

    # The factory helpers are only needed while the class body executes.
    del __make_read
    del __make_write

    @classmethod
    def from_dask(cls, dask_obj):
        """
        Build a Modin `query_compiler` out of a Dask DataFrame.

        Parameters
        ----------
        dask_obj : dask.dataframe.DataFrame
            The Dask DataFrame to convert from.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler containing data from the Dask DataFrame.
        """
        client = default_client()
        # One delayed object per Dask partition, submitted to the cluster.
        futures = client.compute(dask_obj.to_delayed())
        return from_partitions(futures, axis=0)._query_compiler

    @classmethod
    def to_dask(cls, modin_obj):
        """
        Turn a Modin DataFrame/Series into its Dask counterpart.

        Parameters
        ----------
        modin_obj : modin.pandas.DataFrame, modin.pandas.Series
            The Modin DataFrame/Series to convert.

        Returns
        -------
        dask.dataframe.DataFrame or dask.dataframe.Series
            Converted object with type depending on input.
        """
        from dask.dataframe import from_delayed

        partitions = unwrap_partitions(modin_obj, axis=0)
        if not isinstance(modin_obj, Series):
            return from_delayed(partitions)

        # Each unwrapped partition is indexed like a single-column frame here;
        # it has to become a pandas Series before Dask can build a Series.
        client = default_client()

        def df_to_series(df):
            first_label = df.columns[0]
            series = df[first_label]
            if first_label == MODIN_UNNAMED_SERIES_LABEL:
                series.name = None
            return series

        series_futures = []
        for part in partitions:
            series_futures.append(client.submit(df_to_series, part))
        return from_delayed(series_futures)

    @classmethod
    def from_map(cls, func, iterable, *args, **kwargs):
        """
        Build a Modin `query_compiler` by mapping a function over an iterable.

        The result is split into row partitions only: every element of
        `iterable` produces exactly one row partition.

        Parameters
        ----------
        func : callable
            Function to map across the iterable object.
        iterable : Iterable
            An iterable object.
        *args : tuple
            Positional arguments to pass in `func`.
        **kwargs : dict
            Keyword arguments to pass in `func`.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler containing data returned by map function.
        """
        func = cls.frame_cls._partition_mgr_cls.preprocess_func(func)
        rows = []
        for obj in iterable:
            future = DaskWrapper.deploy(
                func,
                f_args=(obj, *args),
                f_kwargs=kwargs,
                return_pandas_df=True,
            )
            # A single block per row, so the grid has exactly one column.
            rows.append([cls.frame_partition_cls(future)])
        partitions = np.array(rows)
        return cls.query_compiler_cls(cls.frame_cls(partitions))
"""Base Modin Dataframe classes related to its partitioning and optimized for pandas on Dask execution.""" from .partition import PandasOnDaskDataframePartition from .partition_manager import PandasOnDaskDataframePartitionManager from .virtual_partition import ( PandasOnDaskDataframeColumnPartition, PandasOnDaskDataframeRowPartition, PandasOnDaskDataframeVirtualPartition, ) __all__ = [ "PandasOnDaskDataframePartition", "PandasOnDaskDataframePartitionManager", "PandasOnDaskDataframeVirtualPartition", "PandasOnDaskDataframeColumnPartition", "PandasOnDaskDataframeRowPartition", ] ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class PandasOnDaskDataframePartition(PandasDataframePartition):
    """
    The class implements the interface in ``PandasDataframePartition``.

    Parameters
    ----------
    data : distributed.Future
        A reference to pandas DataFrame that need to be wrapped with this class.
    length : distributed.Future or int, optional
        Length or reference to it of wrapped pandas DataFrame.
    width : distributed.Future or int, optional
        Width or reference to it of wrapped pandas DataFrame.
    ip : distributed.Future or str, optional
        Node IP address or reference to it that holds wrapped pandas DataFrame.
    call_queue : list, optional
        Call queue that needs to be executed on wrapped pandas DataFrame.
    """

    execution_wrapper = DaskWrapper

    def __init__(self, data, length=None, width=None, ip=None, call_queue=None):
        super().__init__()
        assert isinstance(data, Future)
        self._data = data
        if call_queue is None:
            call_queue = []
        self.call_queue = call_queue
        self._length_cache = length
        self._width_cache = width
        self._ip_cache = ip

        log = get_logger()
        self._is_debug(log) and log.debug(
            "Partition ID: {}, Height: {}, Width: {}, Node IP: {}".format(
                self._identity,
                str(self._length_cache),
                str(self._width_cache),
                str(self._ip_cache),
            )
        )

    def _deploy_call_queue(self, call_queue, log):
        """
        Submit a non-empty call queue for execution on the wrapped object.

        Parameters
        ----------
        call_queue : list
            Non-empty list of ``[func, args, kwargs]`` entries.
        log : logging.Logger
            Logger used for the debug trace.

        Returns
        -------
        list
            The two results requested from ``execution_wrapper.deploy``
            (``num_returns=2``): the resulting data and the worker IP.
        """
        if len(call_queue) > 1:
            self._is_debug(log) and log.debug(
                f"SUBMIT::_apply_list_of_funcs::{self._identity}"
            )
            return self.execution_wrapper.deploy(
                func=apply_list_of_funcs,
                f_args=(call_queue, self._data),
                num_returns=2,
                pure=False,
            )
        # We handle `len(call_queue) == 1` in a different way because
        # this improves performance a bit.
        func, f_args, f_kwargs = call_queue[0]
        self._is_debug(log) and log.debug(f"SUBMIT::_apply_func::{self._identity}")
        return self.execution_wrapper.deploy(
            func=apply_func,
            f_args=(self._data, func, *f_args),
            f_kwargs=f_kwargs,
            num_returns=2,
            pure=False,
        )

    def apply(self, func, *args, **kwargs):
        """
        Apply a function to the object wrapped by this partition.

        Parameters
        ----------
        func : callable or distributed.Future
            A function to apply.
        *args : iterable
            Additional positional arguments to be passed in `func`.
        **kwargs : dict
            Additional keyword arguments to be passed in `func`.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.

        Notes
        -----
        The keyword arguments are sent as a dictionary.
        """
        log = get_logger()
        self._is_debug(log) and log.debug(f"ENTER::Partition.apply::{self._identity}")
        # Build a new list so that the queue stored on ``self`` is not mutated.
        call_queue = self.call_queue + [[func, args, kwargs]]
        futures = self._deploy_call_queue(call_queue, log)
        self._is_debug(log) and log.debug(f"EXIT::Partition.apply::{self._identity}")
        return self.__constructor__(futures[0], ip=futures[1])

    def drain_call_queue(self):
        """Execute all operations stored in the call queue on the object wrapped by this partition."""
        log = get_logger()
        self._is_debug(log) and log.debug(
            f"ENTER::Partition.drain_call_queue::{self._identity}"
        )
        if not self.call_queue:
            # Nothing to submit; still close the ENTER record so that the
            # debug trace stays balanced.
            self._is_debug(log) and log.debug(
                f"EXIT::Partition.drain_call_queue::{self._identity}"
            )
            return
        futures = self._deploy_call_queue(self.call_queue, log)
        self._data = futures[0]
        self._ip_cache = futures[1]
        self._is_debug(log) and log.debug(
            f"EXIT::Partition.drain_call_queue::{self._identity}"
        )
        self.call_queue = []

    def wait(self):
        """Wait completing computations on the object wrapped by the partition."""
        self.drain_call_queue()
        self.execution_wrapper.wait(self._data)

    def mask(self, row_labels, col_labels):
        """
        Lazily create a mask that extracts the indices provided.

        Parameters
        ----------
        row_labels : list-like, slice or label
            The row labels for the rows to extract.
        col_labels : list-like, slice or label
            The column labels for the columns to extract.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.
        """
        log = get_logger()
        self._is_debug(log) and log.debug(f"ENTER::Partition.mask::{self._identity}")
        new_obj = super().mask(row_labels, col_labels)
        # When the cached dimension is still a future and the mask is a slice,
        # the new dimension is derived remotely instead of materializing it.
        if isinstance(row_labels, slice) and isinstance(self._length_cache, Future):
            if row_labels == slice(None):
                # fast path - full axis take
                new_obj._length_cache = self._length_cache
            else:
                new_obj._length_cache = self.execution_wrapper.deploy(
                    func=compute_sliced_len, f_args=(row_labels, self._length_cache)
                )
        if isinstance(col_labels, slice) and isinstance(self._width_cache, Future):
            if col_labels == slice(None):
                # fast path - full axis take
                new_obj._width_cache = self._width_cache
            else:
                new_obj._width_cache = self.execution_wrapper.deploy(
                    func=compute_sliced_len, f_args=(col_labels, self._width_cache)
                )
        self._is_debug(log) and log.debug(f"EXIT::Partition.mask::{self._identity}")
        return new_obj

    def __copy__(self):
        """
        Create a copy of this partition.

        Returns
        -------
        PandasOnDaskDataframePartition
            A copy of this partition.
        """
        return self.__constructor__(
            self._data,
            length=self._length_cache,
            width=self._width_cache,
            ip=self._ip_cache,
            call_queue=self.call_queue,
        )

    @classmethod
    def put(cls, obj):
        """
        Put an object into distributed memory and wrap it with partition object.

        Parameters
        ----------
        obj : any
            An object to be put. Its ``index`` and ``columns`` are used to
            fill the length and width caches.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.
        """
        return cls(
            cls.execution_wrapper.put(obj, hash=False),
            len(obj.index),
            len(obj.columns),
        )

    @classmethod
    def preprocess_func(cls, func):
        """
        Preprocess a function before an ``apply`` call.

        Parameters
        ----------
        func : callable
            The function to preprocess.

        Returns
        -------
        callable
            An object that can be accepted by ``apply``.
        """
        return cls.execution_wrapper.put(func, hash=False, broadcast=True)

    def length(self, materialize=True):
        """
        Get the length of the object wrapped by this partition.

        Parameters
        ----------
        materialize : bool, default: True
            Whether to forcibly materialize the result into an integer. If ``False``
            was specified, may return a future of the result if it hasn't been
            materialized yet.

        Returns
        -------
        int or distributed.Future
            The length of the object.
        """
        if self._length_cache is None:
            self._length_cache = self.apply(len)._data
        if isinstance(self._length_cache, Future) and materialize:
            self._length_cache = self.execution_wrapper.materialize(self._length_cache)
        return self._length_cache

    def width(self, materialize=True):
        """
        Get the width of the object wrapped by the partition.

        Parameters
        ----------
        materialize : bool, default: True
            Whether to forcibly materialize the result into an integer. If ``False``
            was specified, may return a future of the result if it hasn't been
            materialized yet.

        Returns
        -------
        int or distributed.Future
            The width of the object.
        """
        if self._width_cache is None:
            self._width_cache = self.apply(lambda df: len(df.columns))._data
        if isinstance(self._width_cache, Future) and materialize:
            self._width_cache = self.execution_wrapper.materialize(self._width_cache)
        return self._width_cache

    def ip(self, materialize=True):
        """
        Get the node IP address of the object wrapped by this partition.

        Parameters
        ----------
        materialize : bool, default: True
            Whether to forcibly materialize the result into a string. If ``False``
            was specified, may return a future of the result if it hasn't been
            materialized yet.

        Returns
        -------
        str or distributed.Future
            IP address of the node that holds the data.
        """
        if self._ip_cache is None:
            # The applied function's result is discarded; only the IP that
            # ``apply`` attaches to the new partition is kept.
            self._ip_cache = self.apply(lambda df: pandas.DataFrame([]))._ip_cache
        if materialize and isinstance(self._ip_cache, Future):
            self._ip_cache = self.execution_wrapper.materialize(self._ip_cache)
        return self._ip_cache


def apply_func(partition, func, *args, **kwargs):
    """
    Execute a function on the partition in a worker process.

    Parameters
    ----------
    partition : pandas.DataFrame
        A pandas DataFrame the function needs to be executed on.
    func : callable
        The function to perform.
    *args : list
        Positional arguments to pass to ``func``.
    **kwargs : dict
        Keyword arguments to pass to ``func``.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    str
        The node IP address of the worker process.

    Notes
    -----
    Directly passing a call queue entry (i.e. a list of [func, args, kwargs]) instead of
    destructuring it causes a performance penalty.
    """
    result = func(partition, *args, **kwargs)
    return result, get_ip()


def apply_list_of_funcs(call_queue, partition):
    """
    Execute all operations stored in the call queue on the partition in a worker process.

    Parameters
    ----------
    call_queue : list
        A call queue of ``[func, args, kwargs]`` triples that needs to be executed on the partition.
    partition : pandas.DataFrame
        A pandas DataFrame the call queue needs to be executed on.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    str
        The node IP address of the worker process.
    """
    # Each call receives the result of the previous one.
    for func, f_args, f_kwargs in call_queue:
        partition = func(partition, *f_args, **f_kwargs)
    return partition, get_ip()
_partition_class = PandasOnDaskDataframePartition _column_partitions_class = PandasOnDaskDataframeColumnPartition _row_partition_class = PandasOnDaskDataframeRowPartition _execution_wrapper = DaskWrapper @classmethod def wait_partitions(cls, partitions): """ Wait on the objects wrapped by `partitions` in parallel, without materializing them. This method will block until all computations in the list have completed. Parameters ---------- partitions : np.ndarray NumPy array with ``PandasDataframePartition``-s. """ cls._execution_wrapper.wait( [block for partition in partitions for block in partition.list_of_blocks] ) ================================================ FILE: modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class PandasOnDaskDataframeVirtualPartition(PandasDataframeAxisPartition):
    """
    Dask flavour of ``PandasDataframeAxisPartition``.

    Parameters
    ----------
    list_of_partitions : Union[list, PandasOnDaskDataframePartition]
        List of ``PandasOnDaskDataframePartition`` and
        ``PandasOnDaskDataframeVirtualPartition`` objects, or a single
        ``PandasOnDaskDataframePartition``.
    get_ip : bool, default: False
        Whether to get node IP addresses of conforming partitions or not.
    full_axis : bool, default: True
        Whether or not the virtual partition encompasses the whole axis.
    call_queue : list, optional
        A list of tuples (callable, args, kwargs) that contains deferred calls.
    length : distributed.Future or int, optional
        Length, or reference to length, of wrapped ``pandas.DataFrame``.
    width : distributed.Future or int, optional
        Width, or reference to width, of wrapped ``pandas.DataFrame``.
    """

    axis = None
    _PARTITIONS_METADATA_LEN = 3  # (length, width, ip)
    partition_type = PandasOnDaskDataframePartition

    @property
    def list_of_ips(self):
        """
        Collect the IPs of the nodes holding the blocks of this partition.

        Returns
        -------
        List
            A list of IPs as ``distributed.Future`` or str.
        """
        ips = []
        for block in self.list_of_block_partitions:
            # Flush pending calls first, then ask for the IP without forcing
            # it to be materialized.
            block.drain_call_queue()
            ips.append(block.ip(materialize=False))
        return ips

    @classmethod
    @_inherit_docstrings(PandasDataframeAxisPartition.deploy_splitting_func)
    def deploy_splitting_func(
        cls,
        axis,
        func,
        f_args,
        f_kwargs,
        num_splits,
        *partitions,
        extract_metadata=False,
    ):
        # With metadata every split is followed by its (length, width, ip).
        if extract_metadata:
            expected_returns = num_splits * (1 + cls._PARTITIONS_METADATA_LEN)
        else:
            expected_returns = num_splits
        remote_args = (
            PandasDataframeAxisPartition.deploy_splitting_func,
            axis,
            func,
            f_args,
            f_kwargs,
            num_splits,
            *partitions,
        )
        return DaskWrapper.deploy(
            func=_deploy_dask_func,
            f_args=remote_args,
            f_kwargs={"extract_metadata": extract_metadata},
            num_returns=expected_returns,
            pure=False,
        )

    @classmethod
    def deploy_axis_func(
        cls,
        axis,
        func,
        f_args,
        f_kwargs,
        num_splits,
        maintain_partitioning,
        *partitions,
        min_block_size,
        lengths=None,
        manual_partition=False,
    ):
        """
        Run a function over a full axis on a Dask worker.

        Parameters
        ----------
        axis : {0, 1}
            The axis to perform the function along.
        func : callable
            The function to perform.
        f_args : list or tuple
            Positional arguments to pass to ``func``.
        f_kwargs : dict
            Keyword arguments to pass to ``func``.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        maintain_partitioning : bool
            If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        *partitions : iterable
            All partitions that make up the full axis (row or column).
        min_block_size : int
            Minimum number of rows/columns in a single split.
        lengths : iterable, default: None
            The list of lengths to shuffle the partition into.
        manual_partition : bool, default: False
            If True, partition the result with `lengths`.

        Returns
        -------
        list
            A list of distributed.Future.
        """
        # A non-empty `lengths` decides how many splits come back.
        if lengths:
            result_num_splits = len(lengths)
        else:
            result_num_splits = num_splits
        remote_args = (
            PandasDataframeAxisPartition.deploy_axis_func,
            axis,
            func,
            f_args,
            f_kwargs,
            num_splits,
            maintain_partitioning,
            *partitions,
        )
        remote_kwargs = {
            "min_block_size": min_block_size,
            "lengths": lengths,
            "manual_partition": manual_partition,
        }
        return DaskWrapper.deploy(
            func=_deploy_dask_func,
            f_args=remote_args,
            f_kwargs=remote_kwargs,
            num_returns=result_num_splits * (1 + cls._PARTITIONS_METADATA_LEN),
            pure=False,
        )

    @classmethod
    def deploy_func_between_two_axis_partitions(
        cls,
        axis,
        func,
        f_args,
        f_kwargs,
        num_splits,
        len_of_left,
        other_shape,
        *partitions,
        min_block_size,
    ):
        """
        Run a function over a full axis spanning two data sets on a Dask worker.

        Parameters
        ----------
        axis : {0, 1}
            The axis to perform the function along.
        func : callable
            The function to perform.
        f_args : list or tuple
            Positional arguments to pass to ``func``.
        f_kwargs : dict
            Keyword arguments to pass to ``func``.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        len_of_left : int
            The number of values in `partitions` that belong to the left data set.
        other_shape : np.ndarray
            The shape of right frame in terms of partitions, i.e.
            (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition.
        *partitions : iterable
            All partitions that make up the full axis (row or column) for both data sets.
        min_block_size : int
            Minimum number of rows/columns in a single split.

        Returns
        -------
        list
            A list of distributed.Future.
        """
        remote_args = (
            PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions,
            axis,
            func,
            f_args,
            f_kwargs,
            num_splits,
            len_of_left,
            other_shape,
            *partitions,
        )
        return DaskWrapper.deploy(
            func=_deploy_dask_func,
            f_args=remote_args,
            f_kwargs={"min_block_size": min_block_size},
            num_returns=num_splits * (1 + cls._PARTITIONS_METADATA_LEN),
            pure=False,
        )

    def wait(self):
        """Block until the computations behind this partition have finished."""
        self.drain_call_queue()
        DaskWrapper.wait(self.list_of_blocks)


# The two concrete variants differ only in their ``axis`` value.
@_inherit_docstrings(PandasOnDaskDataframeVirtualPartition)
class PandasOnDaskDataframeColumnPartition(PandasOnDaskDataframeVirtualPartition):
    axis = 0


@_inherit_docstrings(PandasOnDaskDataframeVirtualPartition)
class PandasOnDaskDataframeRowPartition(PandasOnDaskDataframeVirtualPartition):
    axis = 1


def _deploy_dask_func(
    deployer,
    axis,
    f_to_deploy,
    f_args,
    f_kwargs,
    *args,
    extract_metadata=True,
    **kwargs,
):
    """
    Execute a function on an axis partition in a worker process.

    In this module it is submitted with one of the
    ``PandasDataframeAxisPartition.deploy_*`` methods (``deploy_axis_func``,
    ``deploy_func_between_two_axis_partitions`` or ``deploy_splitting_func``)
    as `deployer`.

    Parameters
    ----------
    deployer : callable
        A ``PandasDataframeAxisPartition.deploy_*`` method that will call `f_to_deploy`.
    axis : {0, 1}
        The axis to perform the function along.
    f_to_deploy : callable or distributed.Future
        The function to deploy.
    f_args : list or tuple
        Positional arguments to pass to ``f_to_deploy``.
    f_kwargs : dict
        Keyword arguments to pass to ``f_to_deploy``.
    *args : list
        Extra positional arguments forwarded to `deployer`.
    extract_metadata : bool, default: True
        Whether to return metadata (length, width, ip) of the result. Passing `False` may relax
        the load on object storage as the remote function would return 4 times fewer futures.
        Passing `False` makes sense for temporary results where you know for sure that the
        metadata will never be requested.
    **kwargs : dict
        Extra keyword arguments forwarded to `deployer`.

    Returns
    -------
    list
        The result of `deployer` and, when requested, metadata for it.
    """
    result = deployer(axis, f_to_deploy, f_args, f_kwargs, *args, **kwargs)
    if not extract_metadata:
        return result

    ip = get_ip()
    if isinstance(result, pandas.DataFrame):
        return result, len(result), len(result.columns), ip

    only_frames = all(isinstance(r, pandas.DataFrame) for r in result)
    flattened = []
    for r in result:
        if only_frames:
            flattened.extend([r, len(r), len(r.columns), ip])
        else:
            # Dimensions are reported as unknown as soon as any piece is not
            # a DataFrame.
            flattened.extend([r, None, None, ip])
    return flattened
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Factories responsible for dispatching to specific execution.""" ================================================ FILE: modin/core/execution/dispatching/factories/dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Contain IO dispatcher class. Dispatcher routes the work to execution-specific functions. 
class FactoryNotFoundError(AttributeError):
    """
    ``FactoryNotFoundError`` exception class.

    Raise when no matching factory could be found.
    """


class StubIoEngine(object):
    """
    IO-Engine that does nothing more than raise NotImplementedError when any method is called.

    Parameters
    ----------
    factory_name : str
        Factory name, which will be reflected in error messages.

    Notes
    -----
    Used for testing purposes.
    """

    def __init__(self, factory_name=""):
        self.factory_name = factory_name or "Unknown"

    def __getattr__(self, name):
        """
        Return a function that raises `NotImplementedError` for the `name` method.

        Parameters
        ----------
        name : str
            Method name to indicate in `NotImplementedError`.

        Returns
        -------
        callable

        Raises
        ------
        AttributeError
            If `name` is a dunder name (``__x__``).
        """
        # Dunder lookups come from protocol probes such as ``hasattr(obj,
        # "__setstate__")`` in ``copy``/``pickle``. Answering them with a stub
        # would make the object claim hooks that then blow up when called.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)

        def stub(*args, **kw):
            raise NotImplementedError(
                f"Method {self.factory_name}.{name} is not implemented"
            )

        return stub
""" __factory: factories.BaseFactory = None @classmethod def get_factory(cls) -> factories.BaseFactory: """Get current factory.""" if cls.__factory is None: from modin.pandas import _initialize_engine Engine.subscribe( lambda engine_parameter: _initialize_engine(engine_parameter.get()) ) Backend.subscribe(cls._update_factory) return_value = cls.__factory return return_value @classmethod def _get_prepared_factory_for_backend(cls, backend) -> factories.BaseFactory: """ Get factory for the specified backend. Parameters ---------- backend : str Backend name. Returns ------- factories.BaseFactory Factory for the specified backend. """ execution = Backend.get_execution_for_backend(backend) from modin.pandas import _initialize_engine _initialize_engine(execution.engine) factory_name = f"{execution.storage_format}On{execution.engine}Factory" experimental_factory_name = "Experimental" + factory_name try: factory = getattr(factories, factory_name, None) or getattr( factories, experimental_factory_name ) except AttributeError: if not IsExperimental.get(): # allow missing factories in experimental mode only msg = ( "Cannot find neither factory {} nor experimental factory {}. " + "Potential reason might be incorrect environment variable value for " + f"{StorageFormat.varname} or {Engine.varname}" ) raise FactoryNotFoundError( msg.format(factory_name, experimental_factory_name) ) factory = StubFactory.set_failing_name(factory_name) else: try: factory.prepare() except ModuleNotFoundError as err: raise ModuleNotFoundError( f"Make sure all required packages are installed: {str(err)}" ) from err return factory @classmethod def _update_factory(cls, *args): """ Update and prepare factory with a new one specified via Modin config. Parameters ---------- *args : iterable This parameters serves the compatibility purpose. Does not affect the result. 
""" cls.__factory = cls._get_prepared_factory_for_backend(Backend.get()) @classmethod def from_pandas( cls, df, backend: Union[str, NoDefault] = no_default ) -> BaseQueryCompiler: """ Create a Modin query compiler from a pandas DataFrame. Parameters ---------- df : pandas.DataFrame The pandas DataFrame to convert. backend : str or NoDefault, default: NoDefault The backend to use for the resulting query compiler. If NoDefault, use the current global default ``Backend`` from the Modin config. Returns ------- BaseQueryCompiler A Modin query compiler that wraps the input pandas DataFrame. """ return ( cls.get_factory() if backend is no_default else cls._get_prepared_factory_for_backend(backend) )._from_pandas(df) @classmethod @_inherit_docstrings(factories.BaseFactory._from_arrow) def from_arrow(cls, at): return cls.get_factory()._from_arrow(at) @classmethod @_inherit_docstrings(factories.BaseFactory._from_non_pandas) def from_non_pandas(cls, *args, **kwargs): return cls.get_factory()._from_non_pandas(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._from_interchange_dataframe) def from_interchange_dataframe(cls, *args, **kwargs): return cls.get_factory()._from_interchange_dataframe(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._from_ray) def from_ray(cls, ray_obj): return cls.get_factory()._from_ray(ray_obj) @classmethod @_inherit_docstrings(factories.BaseFactory._from_dask) def from_dask(cls, dask_obj): return cls.get_factory()._from_dask(dask_obj) @classmethod @_inherit_docstrings(factories.BaseFactory._from_map) def from_map(cls, func, iterable, *args, **kwargs): return cls.get_factory()._from_map(func, iterable, *args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_parquet) def read_parquet(cls, **kwargs): return cls.get_factory()._read_parquet(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_csv) def read_csv(cls, **kwargs): return 
cls.get_factory()._read_csv(**kwargs) @classmethod @_inherit_docstrings(factories.PandasOnRayFactory._read_csv_glob) def read_csv_glob(cls, **kwargs): return cls.get_factory()._read_csv_glob(**kwargs) @classmethod @_inherit_docstrings(factories.PandasOnRayFactory._read_pickle_glob) def read_pickle_glob(cls, **kwargs): return cls.get_factory()._read_pickle_glob(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_json) def read_json(cls, **kwargs): return cls.get_factory()._read_json(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_gbq) def read_gbq(cls, **kwargs): return cls.get_factory()._read_gbq(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_html) def read_html(cls, **kwargs): return cls.get_factory()._read_html(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_clipboard) def read_clipboard(cls, **kwargs): return cls.get_factory()._read_clipboard(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_excel) def read_excel(cls, **kwargs): return cls.get_factory()._read_excel(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_hdf) def read_hdf(cls, **kwargs): return cls.get_factory()._read_hdf(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_feather) def read_feather(cls, **kwargs): return cls.get_factory()._read_feather(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_stata) def read_stata(cls, **kwargs): return cls.get_factory()._read_stata(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sas) def read_sas(cls, **kwargs): # pragma: no cover return cls.get_factory()._read_sas(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_pickle) def read_pickle(cls, **kwargs): return cls.get_factory()._read_pickle(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sql) def read_sql(cls, **kwargs): return cls.get_factory()._read_sql(**kwargs) 
# NOTE: the wrappers below deliberately carry no docstrings of their own;
# each one receives its documentation through ``_inherit_docstrings``.
# Every wrapper resolves the factory via ``cls.get_factory()`` and forwards
# its arguments unchanged to the factory method with the matching name.


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._read_sql_distributed)
def read_sql_distributed(cls, **kwargs):
    factory = cls.get_factory()
    return factory._read_sql_distributed(**kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._read_fwf)
def read_fwf(cls, **kwargs):
    factory = cls.get_factory()
    return factory._read_fwf(**kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._read_sql_table)
def read_sql_table(cls, **kwargs):
    factory = cls.get_factory()
    return factory._read_sql_table(**kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._read_sql_query)
def read_sql_query(cls, **kwargs):
    factory = cls.get_factory()
    return factory._read_sql_query(**kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._read_spss)
def read_spss(cls, **kwargs):
    factory = cls.get_factory()
    return factory._read_spss(**kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._to_sql)
def to_sql(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._to_sql(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.BaseFactory._to_pickle)
def to_pickle(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._to_pickle(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._to_pickle_glob)
def to_pickle_glob(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._to_pickle_glob(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
def read_parquet_glob(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._read_parquet_glob(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
def to_parquet_glob(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._to_parquet_glob(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._read_json_glob)
def read_json_glob(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._read_json_glob(*args, **kwargs)


@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._to_json_glob)
def to_json_glob(cls, *args, **kwargs):
    factory = cls.get_factory()
    return factory._to_json_glob(*args, **kwargs)
@classmethod @_inherit_docstrings(factories.PandasOnRayFactory._read_xml_glob) def read_xml_glob(cls, *args, **kwargs): return cls.get_factory()._read_xml_glob(*args, **kwargs) @classmethod @_inherit_docstrings(factories.PandasOnRayFactory._to_xml_glob) def to_xml_glob(cls, *args, **kwargs): return cls.get_factory()._to_xml_glob(*args, **kwargs) @classmethod @_inherit_docstrings(factories.PandasOnRayFactory._read_custom_text) def read_custom_text(cls, **kwargs): return cls.get_factory()._read_custom_text(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_csv) def to_csv(cls, *args, **kwargs): return cls.get_factory()._to_csv(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_json) def to_json(cls, *args, **kwargs): return cls.get_factory()._to_json(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_json) def to_json_series(cls, *args, **kwargs): return cls.get_factory()._to_json_series(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_xml) def to_xml(cls, *args, **kwargs): return cls.get_factory()._to_xml(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_parquet) def to_parquet(cls, *args, **kwargs): return cls.get_factory()._to_parquet(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_ray) def to_ray(cls, modin_obj): return cls.get_factory()._to_ray(modin_obj) @classmethod @_inherit_docstrings(factories.BaseFactory._to_dask) def to_dask(cls, modin_obj): return cls.get_factory()._to_dask(modin_obj) ================================================ FILE: modin/core/execution/dispatching/factories/factories.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains Factories for all of the supported Modin executions. Factory is a bridge between calls of IO function from high-level API and its actual implementation in the execution, bound to that factory. Each execution is represented with a Factory class. """ import re import typing import warnings import pandas from pandas.util._decorators import doc from modin.core.io import BaseIO from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler from modin.utils import get_current_execution _doc_abstract_factory_class = """ Abstract {role} factory which allows to override the IO module easily. This class is responsible for dispatching calls of IO-functions to its actual execution-specific implementations. Attributes ---------- io_cls : BaseIO IO module class of the underlying execution. The place to dispatch calls to. """ _doc_factory_class = """ Factory of {execution_name} execution. This class is responsible for dispatching calls of IO-functions to its actual execution-specific implementations. Attributes ---------- io_cls : {execution_name}IO IO module class of the underlying execution. The place to dispatch calls to. """ _doc_factory_prepare_method = """ Initialize Factory. Fills in `.io_cls` class attribute with {io_module_name} lazily. """ _doc_io_method_raw_template = """ Build query compiler from {source}. 
Parameters ---------- {params} Returns ------- QueryCompiler Query compiler of the selected storage format. """ _doc_io_method_template = ( _doc_io_method_raw_template + """ See Also -------- modin.pandas.{method} """ ) _doc_io_method_all_params = """*args : args Arguments to pass to the QueryCompiler builder method. **kwargs : kwargs Arguments to pass to the QueryCompiler builder method.""" _doc_io_method_kwargs_params = """**kwargs : kwargs Arguments to pass to the QueryCompiler builder method.""" types_dictionary = {"pandas": {"category": pandas.CategoricalDtype}} supported_executions = ( "PandasOnRay", "PandasOnUnidist", "PandasOnDask", ) class FactoryInfo(typing.NamedTuple): """ Structure that stores information about factory. Parameters ---------- engine : str Name of underlying execution engine. partition : str Name of the partition format. experimental : bool Whether underlying engine is experimental-only. """ engine: str partition: str experimental: bool class NotRealFactory(Exception): """ ``NotRealFactory`` exception class. Raise when no matching factory could be found. """ pass @doc(_doc_abstract_factory_class, role="") class BaseFactory(object): io_cls: typing.Type[BaseIO] = None # The module where the I/O functionality exists. @classmethod def get_info(cls) -> FactoryInfo: """ Get information about current factory. Notes ----- It parses factory name, so it must be conformant with how ``FactoryDispatcher`` class constructs factory names. 
""" try: experimental, partition, engine = re.match( r"^(Experimental)?(.*)On(.*)Factory$", cls.__name__ ).groups() except AttributeError: raise NotRealFactory() return FactoryInfo( engine=engine, partition=partition, experimental=bool(experimental) ) @classmethod @doc( _doc_factory_prepare_method, io_module_name="an underlying execution's IO-module", ) def prepare(cls): raise NotImplementedError("Subclasses of BaseFactory must implement prepare") @classmethod @doc( _doc_io_method_template, source="pandas DataFrame", params="df : pandas.DataFrame", method="io.from_pandas", ) def _from_pandas(cls, df): return cls.io_cls.from_pandas(df) @classmethod @doc( _doc_io_method_template, source="Arrow Table", params="at : pyarrow.Table", method="io.from_arrow", ) def _from_arrow(cls, at): return cls.io_cls.from_arrow(at) @classmethod @doc( _doc_io_method_template, source="a non-pandas object (dict, list, np.array etc...)", params=_doc_io_method_all_params, method="io.from_non_pandas", ) def _from_non_pandas(cls, *args, **kwargs): return cls.io_cls.from_non_pandas(*args, **kwargs) @classmethod @doc( _doc_io_method_template, source="a DataFrame object supporting exchange protocol `__dataframe__()`", params=_doc_io_method_all_params, method="io.from_interchange_dataframe", ) def _from_interchange_dataframe(cls, *args, **kwargs): return cls.io_cls.from_interchange_dataframe(*args, **kwargs) @classmethod @doc( _doc_io_method_template, source="a Ray Dataset", params="ray_obj : ray.data.Dataset", method="modin.core.execution.ray.implementations.pandas_on_ray.io.PandasOnRayIO.from_ray", ) def _from_ray(cls, ray_obj): return cls.io_cls.from_ray(ray_obj) @classmethod @doc( _doc_io_method_template, source="a Dask DataFrame", params="dask_obj : dask.dataframe.DataFrame", method="modin.core.execution.dask.implementations.pandas_on_dask.io.PandasOnDaskIO.from_dask", ) def _from_dask(cls, dask_obj): return cls.io_cls.from_dask(dask_obj) @classmethod def _from_map(cls, func, iterable, 
*args, **kwargs): """ Create a Modin `query_compiler` from a map function. This method will construct a Modin `query_compiler` split by row partitions. The number of row partitions matches the number of elements in the iterable object. Parameters ---------- func : callable Function to map across the iterable object. iterable : Iterable An iterable object. *args : tuple Positional arguments to pass in `func`. **kwargs : dict Keyword arguments to pass in `func`. Returns ------- BaseQueryCompiler QueryCompiler containing data returned by map function. """ return cls.io_cls.from_map(func, iterable, *args, **kwargs) @classmethod @doc( _doc_io_method_template, source="a Parquet file", params=_doc_io_method_kwargs_params, method="read_parquet", ) def _read_parquet(cls, **kwargs): return cls.io_cls.read_parquet(**kwargs) @classmethod @doc( _doc_io_method_template, source="a CSV file", params=_doc_io_method_kwargs_params, method="read_csv", ) def _read_csv(cls, **kwargs): return cls.io_cls.read_csv(**kwargs) @classmethod @doc( _doc_io_method_template, source="a JSON file", params=_doc_io_method_kwargs_params, method="read_json", ) def _read_json(cls, **kwargs): return cls.io_cls.read_json(**kwargs) @classmethod @doc( _doc_io_method_template, source="a Google BigQuery", params=_doc_io_method_kwargs_params, method="read_gbq", ) def _read_gbq(cls, **kwargs): return cls.io_cls.read_gbq(**kwargs) @classmethod @doc( _doc_io_method_template, source="an HTML document", params=_doc_io_method_kwargs_params, method="read_html", ) def _read_html(cls, **kwargs): return cls.io_cls.read_html(**kwargs) @classmethod @doc( _doc_io_method_template, source="clipboard", params=_doc_io_method_kwargs_params, method="read_clipboard", ) def _read_clipboard(cls, **kwargs): # pragma: no cover return cls.io_cls.read_clipboard(**kwargs) @classmethod @doc( _doc_io_method_template, source="an Excel file", params=_doc_io_method_kwargs_params, method="read_excel", ) def _read_excel(cls, **kwargs): return 
cls.io_cls.read_excel(**kwargs) @classmethod @doc( _doc_io_method_template, source="an HDFStore", params=_doc_io_method_kwargs_params, method="read_hdf", ) def _read_hdf(cls, **kwargs): return cls.io_cls.read_hdf(**kwargs) @classmethod @doc( _doc_io_method_template, source="a feather-format object", params=_doc_io_method_kwargs_params, method="read_feather", ) def _read_feather(cls, **kwargs): return cls.io_cls.read_feather(**kwargs) @classmethod @doc( _doc_io_method_template, source="a Stata file", params=_doc_io_method_kwargs_params, method="read_stata", ) def _read_stata(cls, **kwargs): return cls.io_cls.read_stata(**kwargs) @classmethod @doc( _doc_io_method_template, source="a SAS file", params=_doc_io_method_kwargs_params, method="read_sas", ) def _read_sas(cls, **kwargs): # pragma: no cover return cls.io_cls.read_sas(**kwargs) @classmethod @doc( _doc_io_method_template, source="a pickled Modin or pandas DataFrame", params=_doc_io_method_kwargs_params, method="read_pickle", ) def _read_pickle(cls, **kwargs): return cls.io_cls.read_pickle(**kwargs) @classmethod @doc( _doc_io_method_template, source="a SQL query or database table", params=_doc_io_method_kwargs_params, method="read_sql", ) def _read_sql(cls, **kwargs): return cls.io_cls.read_sql(**kwargs) @classmethod @doc( _doc_io_method_template, source="a table of fixed-width formatted lines", params=_doc_io_method_kwargs_params, method="read_fwf", ) def _read_fwf(cls, **kwargs): return cls.io_cls.read_fwf(**kwargs) @classmethod @doc( _doc_io_method_template, source="a SQL database table", params=_doc_io_method_kwargs_params, method="read_sql_table", ) def _read_sql_table(cls, **kwargs): return cls.io_cls.read_sql_table(**kwargs) @classmethod @doc( _doc_io_method_template, source="a SQL query", params=_doc_io_method_kwargs_params, method="read_sql_query", ) def _read_sql_query(cls, **kwargs): return cls.io_cls.read_sql_query(**kwargs) @classmethod @doc( _doc_io_method_template, source="an SPSS file", 
params=_doc_io_method_kwargs_params, method="read_spss", ) def _read_spss(cls, **kwargs): return cls.io_cls.read_spss(**kwargs) @classmethod def _to_sql(cls, *args, **kwargs): """ Write query compiler content to a SQL database. Parameters ---------- *args : args Arguments to the writer method. **kwargs : kwargs Arguments to the writer method. """ return cls.io_cls.to_sql(*args, **kwargs) @classmethod def _to_pickle(cls, *args, **kwargs): """ Pickle query compiler object. Parameters ---------- *args : args Arguments to the writer method. **kwargs : kwargs Arguments to the writer method. """ return cls.io_cls.to_pickle(*args, **kwargs) @classmethod def _to_csv(cls, *args, **kwargs): """ Write query compiler content to a CSV file. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ return cls.io_cls.to_csv(*args, **kwargs) @classmethod def _to_json(cls, *args, **kwargs): """ Write query compiler content to a JSON file. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ return cls.io_cls.to_json(*args, **kwargs) @classmethod def _to_json_series(cls, *args, **kwargs): """ Write query compiler content of a Series to a JSON file. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ return cls.io_cls.to_json_series(*args, **kwargs) @classmethod def _to_xml(cls, *args, **kwargs): """ Write query compiler content to a XML file. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ return cls.io_cls.to_xml(*args, **kwargs) @classmethod def _to_parquet(cls, *args, **kwargs): """ Write query compiler content to a parquet file. Parameters ---------- *args : args Arguments to pass to the writer method. 
**kwargs : kwargs Arguments to pass to the writer method. """ return cls.io_cls.to_parquet(*args, **kwargs) @classmethod def _to_ray(cls, modin_obj): """ Write query compiler content to a Ray Dataset. Parameters ---------- modin_obj : modin.pandas.DataFrame, modin.pandas.Series The Modin DataFrame/Series to write. Returns ------- ray.data.Dataset A Ray Dataset object. Notes ----- Modin DataFrame/Series can only be converted to a Ray Dataset if Modin uses a Ray engine. """ return cls.io_cls.to_ray(modin_obj) @classmethod def _to_dask(cls, modin_obj): """ Write query compiler content to a Dask DataFrame/Series. Parameters ---------- modin_obj : modin.pandas.DataFrame, modin.pandas.Series The Modin DataFrame/Series to write. Returns ------- dask.dataframe.DataFrame or dask.dataframe.Series A Dask DataFrame/Series object. Notes ----- Modin DataFrame/Series can only be converted to a Dask DataFrame/Series if Modin uses a Dask engine. """ return cls.io_cls.to_dask(modin_obj) # experimental methods that don't exist in pandas @classmethod @doc( _doc_io_method_raw_template, source="CSV files", params=_doc_io_method_kwargs_params, ) def _read_csv_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_csv_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.read_csv_glob(**kwargs) @classmethod @doc( _doc_io_method_raw_template, source="Pickle files", params=_doc_io_method_kwargs_params, ) def _read_pickle_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_pickle_glob()` is not implemented for {current_execution} execution." 
) return cls.io_cls.read_pickle_glob(**kwargs) @classmethod @doc( _doc_io_method_raw_template, source="SQL files", params=_doc_io_method_kwargs_params, ) def _read_sql_distributed(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: extra_parameters = ( "partition_column", "lower_bound", "upper_bound", "max_sessions", ) if any( param in kwargs and kwargs[param] is not None for param in extra_parameters ): warnings.warn( f"Distributed read_sql() was only implemented for {', '.join(supported_executions)} executions." ) for param in extra_parameters: del kwargs[param] return cls.io_cls.read_sql(**kwargs) return cls.io_cls.read_sql_distributed(**kwargs) @classmethod @doc( _doc_io_method_raw_template, source="Custom text files", params=_doc_io_method_kwargs_params, ) def _read_custom_text(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_custom_text()` is not implemented for {current_execution} execution." ) return cls.io_cls.read_custom_text(**kwargs) @classmethod def _to_pickle_glob(cls, *args, **kwargs): """ Distributed pickle query compiler object. Parameters ---------- *args : args Arguments to the writer method. **kwargs : kwargs Arguments to the writer method. """ # TODO(https://github.com/modin-project/modin/issues/7429): Use # frame-level execution instead of the global, default execution. current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_to_pickle_glob()` is not implemented for {current_execution} execution." 
) return cls.io_cls.to_pickle_glob(*args, **kwargs) @classmethod @doc( _doc_io_method_raw_template, source="Parquet files", params=_doc_io_method_kwargs_params, ) def _read_parquet_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_parquet_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.read_parquet_glob(**kwargs) @classmethod def _to_parquet_glob(cls, *args, **kwargs): """ Write query compiler content to several parquet files. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_to_parquet_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.to_parquet_glob(*args, **kwargs) @classmethod @doc( _doc_io_method_raw_template, source="Json files", params=_doc_io_method_kwargs_params, ) def _read_json_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_json_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.read_json_glob(**kwargs) @classmethod def _to_json_glob(cls, *args, **kwargs): """ Write query compiler content to several json files. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_to_json_glob()` is not implemented for {current_execution} execution." 
) return cls.io_cls.to_json_glob(*args, **kwargs) @classmethod @doc( _doc_io_method_raw_template, source="XML files", params=_doc_io_method_kwargs_params, ) def _read_xml_glob(cls, **kwargs): current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_read_xml_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.read_xml_glob(**kwargs) @classmethod def _to_xml_glob(cls, *args, **kwargs): """ Write query compiler content to several XML files. Parameters ---------- *args : args Arguments to pass to the writer method. **kwargs : kwargs Arguments to pass to the writer method. """ current_execution = get_current_execution() if current_execution not in supported_executions: raise NotImplementedError( f"`_to_xml_glob()` is not implemented for {current_execution} execution." ) return cls.io_cls.to_xml_glob(*args, **kwargs) @doc(_doc_factory_class, execution_name="PandasOnRay") class PandasOnRayFactory(BaseFactory): @classmethod @doc(_doc_factory_prepare_method, io_module_name="``PandasOnRayIO``") def prepare(cls): from modin.core.execution.ray.implementations.pandas_on_ray.io import ( PandasOnRayIO, ) cls.io_cls = PandasOnRayIO @doc(_doc_factory_class, execution_name="PandasOnPython") class PandasOnPythonFactory(BaseFactory): @classmethod @doc(_doc_factory_prepare_method, io_module_name="``PandasOnPythonIO``") def prepare(cls): from modin.core.execution.python.implementations.pandas_on_python.io import ( PandasOnPythonIO, ) cls.io_cls = PandasOnPythonIO @doc(_doc_factory_class, execution_name="PandasOnDask") class PandasOnDaskFactory(BaseFactory): @classmethod @doc(_doc_factory_prepare_method, io_module_name="``PandasOnDaskIO``") def prepare(cls): from modin.core.execution.dask.implementations.pandas_on_dask.io import ( PandasOnDaskIO, ) cls.io_cls = PandasOnDaskIO @doc(_doc_factory_class, execution_name="PandasOnUnidist") class PandasOnUnidistFactory(BaseFactory): @classmethod 
@doc(_doc_factory_prepare_method, io_module_name="``PandasOnUnidistIO``") def prepare(cls): from modin.core.execution.unidist.implementations.pandas_on_unidist.io import ( PandasOnUnidistIO, ) cls.io_cls = PandasOnUnidistIO class NativeIO(BaseIO): """ I/O class for native pandas execution. This class inherits the default function implementations from the ``BaseIO`` parent class. """ _should_warn_on_default_to_pandas: bool = False query_compiler_cls = NativeQueryCompiler @doc(_doc_factory_class, execution_name="NativeOnNative") class NativeOnNativeFactory(BaseFactory): @classmethod @doc(_doc_factory_prepare_method, io_module_name="`NativeIO`") def prepare(cls): cls.io_cls = NativeIO ================================================ FILE: modin/core/execution/modin_aqp.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ The module for working with displaying progress bars for Modin execution engines. Modin Automatic Query Progress (AQP). """ import inspect import os import threading import time import warnings from modin.config import Engine, ProgressBar progress_bars = {} bar_lock = threading.Lock() def call_progress_bar(result_parts, line_no): """ Attach a progress bar to given `result_parts`. 
The progress bar is expected to be shown in a Jupyter Notebook cell.

    Parameters
    ----------
    result_parts : list of list of object refs (futures)
        Objects which are being computed for which progress is requested.
    line_no : int
        Line number in the call stack which we're displaying progress for.
    """
    # Warnings raised while importing tqdm's notebook-aware frontend are ignored.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            from tqdm.autonotebook import tqdm as tqdm_notebook
        except ImportError as err:
            raise ImportError(
                "Please pip install tqdm to use the progress bar"
            ) from err
    from IPython import get_ipython

    try:
        cell_no = get_ipython().execution_count
    # This happens if we are not in ipython or jupyter.
    # No progress bar is supported in that case.
    except AttributeError:
        return
    # One bar per (cell, line) pair; later calls for the same pair extend it.
    pbar_id = f"{cell_no}-{line_no}"
    futures = [
        block
        for row in result_parts
        for partition in row
        for block in partition.list_of_blocks
    ]
    bar_format = (
        "{l_bar}{bar}{r_bar}"
        if os.environ.get("DEBUG_PROGRESS_BAR") == "True"
        else "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
    )
    # Use the lock as a context manager so it is always released, even when
    # updating or creating the bar raises; a manual acquire()/release() pair
    # would leave the lock held forever in that case.
    with bar_lock:
        if pbar_id in progress_bars:
            bar = progress_bars[pbar_id]
            if hasattr(bar, "container"):
                # The child exposing ``max`` is either the first or the second one.
                index = 0 if hasattr(bar.container.children[0], "max") else 1
                bar.container.children[index].max = bar.container.children[
                    index
                ].max + len(futures)
            bar.total = bar.total + len(futures)
            bar.refresh()
        else:
            progress_bars[pbar_id] = tqdm_notebook(
                total=len(futures),
                desc="Estimated completion of line " + str(line_no),
                bar_format=bar_format,
            )

    threading.Thread(target=_show_time_updates, args=(progress_bars[pbar_id],)).start()

    # TODO(https://github.com/modin-project/modin/issues/7429): Use
    # frame-level engine config.
modin_engine = Engine.get() engine_wrapper = None if modin_engine == "Ray": from modin.core.execution.ray.common.engine_wrapper import RayWrapper engine_wrapper = RayWrapper elif modin_engine == "Unidist": from modin.core.execution.unidist.common.engine_wrapper import UnidistWrapper engine_wrapper = UnidistWrapper else: raise NotImplementedError( f"ProgressBar feature is not supported for {modin_engine} engine." ) for i in range(1, len(futures) + 1): engine_wrapper.wait(futures, num_returns=i) progress_bars[pbar_id].update(1) progress_bars[pbar_id].refresh() if progress_bars[pbar_id].n == progress_bars[pbar_id].total: progress_bars[pbar_id].close() def display_time_updates(bar): """ Start displaying the progress `bar` in a notebook. Parameters ---------- bar : tqdm.tqdm The progress bar wrapper to display in a notebook cell. """ threading.Thread(target=_show_time_updates, args=(bar,)).start() def _show_time_updates(p_bar): """ Refresh displayed progress bar `p_bar` periodically until it is complete. Parameters ---------- p_bar : tqdm.tqdm The progress bar wrapper being displayed to refresh. """ while p_bar.total > p_bar.n: time.sleep(1) if p_bar.total > p_bar.n: p_bar.refresh() def progress_bar_wrapper(f): """ Wrap computation function inside a progress bar. Spawns another thread which displays a progress bar showing estimated completion time. Parameters ---------- f : callable The name of the function to be wrapped. Returns ------- callable Decorated version of `f` which reports progress. 
""" from functools import wraps @wraps(f) def magic(*args, **kwargs): result_parts = f(*args, **kwargs) if ProgressBar.get(): current_frame = inspect.currentframe() function_name = None while function_name != "": ( filename, line_number, function_name, lines, index, ) = inspect.getframeinfo(current_frame) current_frame = current_frame.f_back t = threading.Thread( target=call_progress_bar, args=(result_parts, line_number), ) t.start() # We need to know whether or not we are in a jupyter notebook from IPython import get_ipython try: ipy_str = str(type(get_ipython())) if "zmqshell" not in ipy_str: t.join() except Exception: pass return result_parts return magic ================================================ FILE: modin/core/execution/python/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Python execution engine.""" ================================================ FILE: modin/core/execution/python/common/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Python execution engine.""" from .engine_wrapper import PythonWrapper __all__ = ["PythonWrapper"] ================================================ FILE: modin/core/execution/python/common/engine_wrapper.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Python execution engine.""" class PythonWrapper: """Python engine wrapper serving for the compatibility purpose with other engines.""" @classmethod def deploy(cls, func, f_args=None, f_kwargs=None, num_returns=1): """ Run the passed function. 
Parameters ---------- func : callable f_args : sequence, optional Positional arguments to pass to the `func`. f_kwargs : dict, optional Keyword arguments to pass to the `func`. num_returns : int, default: 1 Number of return values from the `func`. Returns ------- object Returns the result of the `func`. """ args = [] if f_args is None else f_args kwargs = {} if f_kwargs is None else f_kwargs return func(*args, **kwargs) @classmethod def is_future(cls, item): """ Check if the item is a Future. Parameters ---------- item : object Returns ------- boolean Always return false. """ return False @classmethod def materialize(cls, obj_id): """ Get the data from the data storage. The method only serves for the compatibility purpose, what it actually does is just return the passed value as is. Parameters ---------- obj_id : object Returns ------- object The passed `obj_id` itself. """ return obj_id @classmethod def put(cls, data, **kwargs): """ Put data into the data storage. The method only serves for the compatibility purpose, what it actually does is just return the passed value as is. Parameters ---------- data : object **kwargs : dict Returns ------- object The passed `data` itself. """ return data ================================================ FILE: modin/core/execution/python/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Python execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/core/execution/python/implementations/pandas_on_python/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Python execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/core/execution/python/implementations/pandas_on_python/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe class optimized for pandas on Python execution.""" ================================================ FILE: modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains class ``PandasOnPythonDataframe``. ``PandasOnPythonDataframe`` is dataframe class with pandas storage format and Python engine. 
""" from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.utils import _inherit_docstrings from ..partitioning.partition_manager import PandasOnPythonDataframePartitionManager class PandasOnPythonDataframe(PandasDataframe): """ Class for dataframes with pandas storage format and Python engine. ``PandasOnPythonDataframe`` doesn't implement any specific interfaces, all functionality is inherited from the ``PandasDataframe`` class. Parameters ---------- partitions : np.ndarray A 2D NumPy array of partitions. index : sequence The index for the dataframe. Converted to a ``pandas.Index``. columns : sequence The columns object for the dataframe. Converted to a ``pandas.Index``. row_lengths : list, optional The length of each partition in the rows. The "height" of each of the block partitions. Is computed if not provided. column_widths : list, optional The width of each partition in the columns. The "width" of each of the block partitions. Is computed if not provided. dtypes : pandas.Series, optional The data types for the dataframe columns. pandas_backend : {"pyarrow", None}, optional Backend used by pandas. None - means default NumPy backend. """ _partition_mgr_cls = PandasOnPythonDataframePartitionManager @property @_inherit_docstrings(PandasDataframe.engine) def engine(self) -> str: return "Python" ================================================ FILE: modin/core/execution/python/implementations/pandas_on_python/io/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
class PandasOnPythonIO(BaseIO):
    """
    Class for storing IO functions operating on pandas storage format and Python engine.

    Inherits default function implementations from ``BaseIO`` parent class and
    only binds the dataframe and query compiler classes listed below.
    """

    # Dataframe class associated with this IO class.
    frame_cls = PandasOnPythonDataframe
    # Query compiler class associated with this IO class.
    query_compiler_cls = PandasQueryCompiler
class PandasOnPythonDataframePartition(PandasDataframePartition):
    """
    Partition class with interface for pandas storage format and Python engine.

    Class holds the data and metadata for a single partition and implements
    methods of parent abstract class ``PandasDataframePartition``.

    Parameters
    ----------
    data : pandas.DataFrame
        ``pandas.DataFrame`` that should be wrapped with this class.
        A copy is stored when the object provides a ``copy`` method.
    length : int, optional
        Length of `data` (number of rows in the input dataframe).
    width : int, optional
        Width of `data` (number of columns in the input dataframe).
    call_queue : list, optional
        Call queue of the partition (list with entities that should be called
        before partition materialization).

    Notes
    -----
    Objects of this class are treated as immutable by partition manager
    subclasses. There is no logic for updating in-place.
    """

    execution_wrapper = PythonWrapper

    def __init__(self, data, length=None, width=None, call_queue=None):
        super().__init__()
        # Store a private copy so the partition does not share state
        # with the object owned by the caller.
        if hasattr(data, "copy"):
            data = data.copy()
        self._data = data
        if call_queue is None:
            call_queue = []
        self.call_queue = call_queue
        self._length_cache = length
        self._width_cache = width

    def get(self):
        """
        Flush the `call_queue` and return copy of the data.

        Returns
        -------
        pandas.DataFrame
            Copy of DataFrame that was wrapped by this partition.

        Notes
        -----
        Since this object is a simple wrapper, just return the copy of data.
        """
        self.drain_call_queue()
        return self._data.copy() if hasattr(self._data, "copy") else self._data

    def apply(self, func, *args, **kwargs):
        """
        Apply a function to the object wrapped by this partition.

        The pending `call_queue` is executed first and its result replaces
        the wrapped data; `func` is then run on a copy of that data.

        Parameters
        ----------
        func : callable
            Function to apply.
        *args : iterable
            Additional positional arguments to be passed in `func`.
        **kwargs : dict
            Additional keyword arguments to be passed in `func`.

        Returns
        -------
        PandasOnPythonDataframePartition
            New ``PandasOnPythonDataframePartition`` object.
        """

        def call_queue_closure(data, call_queue):
            """
            Apply callables from `call_queue` on copy of the `data` and return the result.

            Parameters
            ----------
            data : pandas.DataFrame or pandas.Series
                Data to use for computations.
            call_queue : array-like
                Array with callables and their args/kwargs to be applied to the `data`.

            Returns
            -------
            pandas.DataFrame or pandas.Series
            """
            result = data.copy()
            for func, f_args, f_kwargs in call_queue:
                try:
                    result = func(result, *f_args, **f_kwargs)
                except Exception:
                    # Drop the queue so the failing call is not replayed on
                    # the next access, then re-raise with the original traceback.
                    self.call_queue = []
                    raise
            return result

        self._data = call_queue_closure(self._data, self.call_queue)
        self.call_queue = []
        with warnings.catch_warnings():
            # FutureWarnings emitted while running `func` are silenced.
            warnings.filterwarnings("ignore", category=FutureWarning)
            return self.__constructor__(func(self._data.copy(), *args, **kwargs))

    def drain_call_queue(self):
        """Execute all operations stored in the call queue on the object wrapped by this partition."""
        # `len()` is used on purpose: the queue is documented as array-like,
        # for which truthiness may be ambiguous.
        if len(self.call_queue) == 0:
            return
        # `apply` flushes the queue as a side effect; the identity result is discarded.
        self.apply(lambda x: x)

    def wait(self):
        """
        Wait for completion of computations on the object wrapped by the partition.

        Internally will be done by flushing the call queue.
        """
        self.drain_call_queue()

    @classmethod
    def put(cls, obj):
        """
        Create partition containing `obj`.

        Parameters
        ----------
        obj : pandas.DataFrame
            DataFrame to be put into the new partition.

        Returns
        -------
        PandasOnPythonDataframePartition
            New ``PandasOnPythonDataframePartition`` object.

        Notes
        -----
        `obj` is not copied here because ``__init__`` already stores a copy;
        copying in both places would duplicate the data twice.
        """
        return cls(obj, len(obj.index), len(obj.columns))

    @classmethod
    def preprocess_func(cls, func):
        """
        Preprocess a function before an ``apply`` call.

        Parameters
        ----------
        func : callable
            Function to preprocess.

        Returns
        -------
        callable
            An object that can be accepted by ``apply``.

        Notes
        -----
        No special preprocessing action is required, so unmodified
        `func` will be returned.
        """
        return func
class PandasOnPythonDataframePartitionManager(PandasDataframePartitionManager):
    """
    Class for managing partitions with pandas storage format and Python engine.

    Inherits all functionality from ``PandasDataframePartitionManager`` base class;
    only the partition classes and the execution wrapper are bound here.
    """

    # Class of a single (block) partition.
    _partition_class = PandasOnPythonDataframePartition
    # Class of a column-wise axis partition.
    _column_partitions_class = PandasOnPythonDataframeColumnPartition
    # Class of a row-wise axis partition.
    _row_partition_class = PandasOnPythonDataframeRowPartition
    # Engine wrapper bound to this manager.
    _execution_wrapper = PythonWrapper
class PandasOnPythonDataframeAxisPartition(PandasDataframeAxisPartition):
    """
    Class defines axis partition interface with pandas storage format and Python engine.

    Inherits functionality from ``PandasDataframeAxisPartition`` class.

    Parameters
    ----------
    list_of_partitions : Union[list, PandasOnPythonDataframePartition]
        List of ``PandasOnPythonDataframePartition`` and
        ``PandasOnPythonDataframeAxisPartition`` objects, or a single
        ``PandasOnPythonDataframePartition``.
    get_ip : bool, default: False
        Whether to get node IP addresses to conforming partitions or not.
    full_axis : bool, default: True
        Whether or not the virtual partition encompasses the whole axis.
    call_queue : list, optional
        A list of tuples (callable, args, kwargs) that contains deferred calls.
    length : int, optional
        Length, or reference to length, of wrapped ``pandas.DataFrame``.
    width : int, optional
        Width, or reference to width, of wrapped ``pandas.DataFrame``.
    """

    # Class of the block partitions this axis partition is built from.
    partition_type = PandasOnPythonDataframePartition


# The docstring is inherited from the axis partition class via the decorator.
@_inherit_docstrings(PandasOnPythonDataframeAxisPartition)
class PandasOnPythonDataframeColumnPartition(PandasOnPythonDataframeAxisPartition):
    # Column-wise variant of the axis partition.
    axis = 0


# The docstring is inherited from the axis partition class via the decorator.
@_inherit_docstrings(PandasOnPythonDataframeAxisPartition)
class PandasOnPythonDataframeRowPartition(PandasOnPythonDataframeAxisPartition):
    # Row-wise variant of the axis partition.
    axis = 1
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Common utilities for Ray execution engine.""" from .engine_wrapper import MaterializationHook, RayWrapper, SignalActor from .utils import initialize_ray __all__ = [ "initialize_ray", "RayWrapper", "MaterializationHook", "SignalActor", ] ================================================ FILE: modin/core/execution/ray/common/deferred_execution.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module with classes and utilities for deferred remote execution in Ray workers.""" from enum import Enum from itertools import islice from typing import ( Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Union, ) import pandas import ray from ray._private.services import get_node_ip_address from modin.config import RayTaskCustomResources from modin.core.execution.ray.common import MaterializationHook, RayWrapper from modin.logging import get_logger ObjectRefType = Union[ray.ObjectRef, None] ObjectRefOrListType = Union[ObjectRefType, List[ObjectRefType]] ListOrTuple = (list, tuple) class DeferredExecution: """ Deferred execution task. This class represents a single node in the execution tree. The input is either an object reference or another node on which this node depends. The output is calculated by the specified Callable. If the input is a DeferredExecution node, it is executed first and the execution output is used as the input for this one. All the executions are performed in a single batch (i.e. using a single remote call) and the results are saved in all the nodes that have multiple subscribers. Parameters ---------- data : ObjectRefType or DeferredExecution The execution input. func : callable or ObjectRefType A function to be executed. args : list or tuple Additional positional arguments to be passed in `func`. kwargs : dict Additional keyword arguments to be passed in `func`. num_returns : int, optional The number of the return values. Attributes ---------- data : ObjectRefType or DeferredExecution The execution input. func : callable or ObjectRefType A function to be executed. args : list or tuple Additional positional arguments to be passed in `func`. kwargs : dict Additional keyword arguments to be passed in `func`. num_returns : int The number of the return values. flat_args : bool True means that there are no lists or DeferredExecution objects in `args`. 
In this case, no arguments processing is performed and `args` is passed to the remote method as is. flat_kwargs : bool The same as `flat_args` but for the `kwargs` values. """ def __init__( self, data: Union[ ObjectRefType, "DeferredExecution", List[Union[ObjectRefType, "DeferredExecution"]], ], func: Union[Callable, ObjectRefType], args: Union[List[Any], Tuple[Any]], kwargs: Dict[str, Any], num_returns=1, ): if isinstance(data, DeferredExecution): data.subscribe() self.data = data self.func = func self.args = args self.kwargs = kwargs self.num_returns = num_returns self.flat_args = self._flat_args(args) self.flat_kwargs = self._flat_args(kwargs.values()) self.subscribers = 0 @classmethod def _flat_args(cls, args: Iterable): """ Check if the arguments list is flat and subscribe to all `DeferredExecution` objects. Parameters ---------- args : Iterable Returns ------- bool """ flat = True for arg in args: if isinstance(arg, DeferredExecution): flat = False arg.subscribe() elif isinstance(arg, ListOrTuple): flat = False cls._flat_args(arg) return flat def exec( self, ) -> Tuple[ObjectRefOrListType, Union["MetaList", List], Union[int, List[int]]]: """ Execute this task, if required. Returns ------- tuple The execution result, MetaList, containing the length, width and the worker's ip address (the last value in the list) and the values offset in the list. I.e. length = meta_list[offset], width = meta_list[offset + 1], ip = meta_list[-1]. """ if self.has_result: return self.data, self.meta, self.meta_offset if ( not isinstance(self.data, DeferredExecution) and self.flat_args and self.flat_kwargs and self.num_returns == 1 ): result, length, width, ip = remote_exec_func.options( resources=RayTaskCustomResources.get() ).remote(self.func, self.data, *self.args, **self.kwargs) meta = MetaList([length, width, ip]) self._set_result(result, meta, 0) return result, meta, 0 # If there are no subscribers, we still need the result here. We don't need to decrement # it back. 
After the execution, the result is saved and the counter has no effect. self.subscribers += 2 consumers, output = self._deconstruct() # The last result is the MetaList, so adding +1 here. num_returns = sum(c.num_returns for c in consumers) + 1 results = self._remote_exec_chain(num_returns, *output) meta = MetaList(results.pop()) meta_offset = 0 results = iter(results) for de in consumers: if de.num_returns == 1: de._set_result(next(results), meta, meta_offset) meta_offset += 2 else: res = list(islice(results, num_returns)) offsets = list(range(0, 2 * num_returns, 2)) de._set_result(res, meta, offsets) meta_offset += 2 * num_returns return self.data, self.meta, self.meta_offset @property def has_result(self): """ Return true if this task has already been executed and the result is set. Returns ------- bool """ return not hasattr(self, "func") def subscribe(self): """ Increment the `subscribers` counter. Subscriber is any instance that could trigger the execution of this task. In case of a multiple subscribers, the execution could be triggerred multiple times. To prevent the multiple executions, the execution result is returned from the worker and saved in this instance. Subsequent calls to `execute()` return the previously saved result. """ self.subscribers += 1 def unsubscribe(self): """Decrement the `subscribers` counter.""" self.subscribers -= 1 assert self.subscribers >= 0 def _deconstruct(self) -> Tuple[List["DeferredExecution"], List[Any]]: """ Convert the specified execution tree to a flat list. This is required for the automatic Ray object references materialization before passing the list to a Ray worker. The format of the list is the following: sequence< >... If before is >= 0, then the next n objects are the function arguments. If it is -1, it means that the method arguments contain list and/or DeferredExecution (chain) objects. In this case the next values are read one by one until `_Tag.END` is encountered. 
If the value is `_Tag.LIST`, then the next sequence of values up to `_Tag.END` is converted to list. If the value is `_Tag.CHAIN`, then the next sequence of values up to `_Tag.END` has exactly the same format, as described here. If the value is `_Tag.REF`, then the next value is a reference id, i.e. the actual value should be retrieved by this id from the previously saved objects. The could also be `_Tag.REF` or `_Tag.LIST`. If before is >=0, then the next 2*n values are the argument names and values in the following format - [name1, value1, name2, value2...]. If it's -1, then the next values are converted to list in the same way as and the argument names are the next len() values. is an integer reference id. If it's not 0, then there is another chain referring to the execution result of this method and, thus, it must be saved so that other chains could retrieve the object by the id. field contains either the `num_returns` value or 0. If it's 0, the execution result is not returned, but is just passed to the next task in the chain. If it's 1, the result is returned as is. Otherwise, it's expected that the result is iterable and the specified number of values is returned from the iterator. The values lengths and widths are added to the meta list. Returns ------- tuple of list * The first list is the result consumers. If a DeferredExecution has multiple subscribers, the execution result should be returned and saved in order to avoid duplicate executions. These DeferredExecution tasks are added to this list and, after the execution, the results are passed to the ``_set_result()`` method of each task. * The second is a flat list of arguments that could be passed to the remote executor. """ stack = [] result_consumers = [] output = [] # Using stack and generators to avoid the ``RecursionError``s. 
stack.append(self._deconstruct_chain(self, output, stack, result_consumers)) while stack: try: gen = stack.pop() next_gen = next(gen) stack.append(gen) stack.append(next_gen) except StopIteration: pass return result_consumers, output @classmethod def _deconstruct_chain( cls, de: "DeferredExecution", output: List, stack: List, result_consumers: List["DeferredExecution"], ): """ Deconstruct the specified DeferredExecution chain. Parameters ---------- de : DeferredExecution The chain to be deconstructed. output : list Put the arguments to this list. stack : list Used to eliminate recursive calls, that may lead to the RecursionError. result_consumers : list of DeferredExecution The result consumers. Yields ------ Generator The ``_deconstruct_list()`` generator. """ out_append = output.append out_extend = output.extend while True: de.unsubscribe() if (out_pos := getattr(de, "out_pos", None)) and not de.has_result: out_append(_Tag.REF) out_append(out_pos) output[out_pos] = out_pos if de.subscribers == 0: # We may have subscribed to the same node multiple times. # It could happen, for example, if it's passed to the args # multiple times, or it's one of the parent nodes and also # passed to the args. In this case, there are no multiple # subscribers, and we don't need to return the result. 
output[out_pos + 1] = 0 result_consumers.remove(de) break elif not isinstance(data := de.data, DeferredExecution): if isinstance(data, ListOrTuple): yield cls._deconstruct_list( data, output, stack, result_consumers, out_append ) else: out_append(data) if not de.has_result: stack.append(de) break else: stack.append(de) de = data while stack and isinstance(stack[-1], DeferredExecution): de: DeferredExecution = stack.pop() args = de.args kwargs = de.kwargs out_append(de.func) if de.flat_args: out_append(len(args)) out_extend(args) else: out_append(-1) yield cls._deconstruct_list( args, output, stack, result_consumers, out_append ) if de.flat_kwargs: out_append(len(kwargs)) for item in kwargs.items(): out_extend(item) else: out_append(-1) yield cls._deconstruct_list( kwargs.values(), output, stack, result_consumers, out_append ) out_extend(kwargs) out_append(0) # Placeholder for ref id if de.subscribers > 0: # Ref id. This is the index in the output list. de.out_pos = len(output) - 1 result_consumers.append(de) out_append(de.num_returns) # Return result for this node else: out_append(0) # Do not return result for this node @classmethod def _deconstruct_list( cls, lst: Iterable, output: List, stack: List, result_consumers: List["DeferredExecution"], out_append: Callable, ): """ Deconstruct the specified list. Parameters ---------- lst : list output : list stack : list result_consumers : list out_append : Callable The reference to the ``list.append()`` method. Yields ------ Generator Either ``_deconstruct_list()`` or ``_deconstruct_chain()`` generator. 
""" for obj in lst: if isinstance(obj, DeferredExecution): if out_pos := getattr(obj, "out_pos", None): obj.unsubscribe() if obj.has_result: out_append(obj.data) else: out_append(_Tag.REF) out_append(out_pos) output[out_pos] = out_pos if obj.subscribers == 0: output[out_pos + 1] = 0 result_consumers.remove(obj) else: out_append(_Tag.CHAIN) yield cls._deconstruct_chain(obj, output, stack, result_consumers) out_append(_Tag.END) elif isinstance(obj, ListOrTuple): out_append(_Tag.LIST) yield cls._deconstruct_list( obj, output, stack, result_consumers, out_append ) else: out_append(obj) out_append(_Tag.END) @staticmethod def _remote_exec_chain(num_returns: int, *args: Tuple) -> List[Any]: """ Execute the deconstructed chain in a worker process. Parameters ---------- num_returns : int The number of return values. *args : tuple A deconstructed chain to be executed. Returns ------- list The execution results. The last element of this list is the ``MetaList``. """ # Prefer _remote_exec_single_chain(). It has fewer arguments and # does not require the num_returns to be specified in options. if num_returns == 2: return _remote_exec_single_chain.options( resources=RayTaskCustomResources.get() ).remote(*args) else: return _remote_exec_multi_chain.options( num_returns=num_returns, resources=RayTaskCustomResources.get() ).remote(num_returns, *args) def _set_result( self, result: ObjectRefOrListType, meta: "MetaList", meta_offset: Union[int, List[int]], ): """ Set the execution result. Parameters ---------- result : ObjectRefOrListType meta : MetaList meta_offset : int or list of int """ del self.func, self.args, self.kwargs, self.flat_args, self.flat_kwargs self.data = result self.meta = meta self.meta_offset = meta_offset def __reduce__(self): """Not serializable.""" raise NotImplementedError("DeferredExecution is not serializable!") class MetaList: """ Meta information, containing the result lengths and the worker address. 
class MetaListHook(MaterializationHook):
    """
    Lazily resolve a single item of a ``MetaList``.

    Instances are created by ``MetaList.__getitem__()`` while the underlying
    list is not materialized yet.

    Parameters
    ----------
    meta : MetaList
        Non-materialized list to get the value from.
    idx : int
        The value index in the list.
    """

    def __init__(self, meta: MetaList, idx: int):
        self.idx = idx
        self.meta = meta

    def pre_materialize(self):
        """
        Get the item at ``self.idx`` or the stored object if it is not a list.

        Returns
        -------
        object
        """
        stored = self.meta._obj
        if isinstance(stored, list):
            # Already materialized - the value is available right away.
            return stored[self.idx]
        return stored

    def post_materialize(self, materialized):
        """
        Store the materialized list in ``self.meta`` and pick the item at ``self.idx``.

        Parameters
        ----------
        materialized : list

        Returns
        -------
        object
        """
        meta = self.meta
        meta._obj = materialized
        return materialized[self.idx]
@classmethod
def construct(cls, num_returns: int, args: Tuple):  # pragma: no cover
    """
    Construct and execute the specified chain.

    The generators produced by ``construct_chain()`` are driven with an
    explicit stack instead of recursion. Every non-generator value they
    produce is yielded to the caller. After the stack is exhausted, the node
    ip address is appended to the meta list and the meta list is yielded as
    the last value.

    Parameters
    ----------
    num_returns : int
    args : tuple

    Yields
    ------
    Any
        The execution results and the meta list as the last value.
    """
    # The chain is consumed with ``list.pop()``, hence the reversed order.
    chain = [*reversed(args)]
    meta = []
    try:
        pending = [cls.construct_chain(chain, {}, meta, None)]
        while pending:
            try:
                current = pending.pop()
                produced = next(current)
                # The generator is not exhausted yet - resume it later.
                pending.append(current)
                if not isinstance(produced, Generator):
                    yield produced
                else:
                    # A nested generator must be processed first.
                    pending.append(produced)
            except StopIteration:
                pass
    except Exception as err:
        get_logger().error(f"{err}. args={args}, chain={list(reversed(chain))}")
        raise err
    meta.append(get_node_ip_address())
    yield meta
@classmethod
def construct_list(
    cls,
    lst: List,
    chain: List,
    refs: Dict[int, Any],
    meta: List,
):  # pragma: no cover
    """
    Fill `lst` with the items popped from `chain` until the END tag is met.

    Plain values are appended as is. Tagged items are handled as follows:
    CHAIN yields a ``construct_chain()`` generator that receives `lst`,
    LIST appends a new list and yields a ``construct_list()`` generator
    filling it, REF appends the object stored in `refs` under the id that
    is popped next.

    Parameters
    ----------
    lst : list
    chain : list
    refs : dict
    meta : list

    Yields
    ------
    Any
        Either ``construct_chain()`` or ``construct_list()`` generator.
    """
    take = chain.pop
    add = lst.append
    while True:
        obj = take()
        if not isinstance(obj, _Tag):
            add(obj)
            continue
        if obj == _Tag.END:
            break
        if obj == _Tag.CHAIN:
            yield cls.construct_chain(chain, refs, meta, lst)
        elif obj == _Tag.LIST:
            nested = []
            add(nested)
            yield cls.construct_list(nested, chain, refs, meta)
        elif obj is _Tag.REF:
            add(refs[take()])
        else:
            raise ValueError(f"Unexpected tag {obj}")
@ray.remote(num_returns=4)
def remote_exec_func(
    fn: Callable,
    obj: Any,
    *flat_args: Tuple,
    remote_executor=_REMOTE_EXEC,
    **flat_kwargs: Dict,
):  # pragma: no cover
    """
    Apply `fn` to `obj` with the flat arguments in a worker process.

    The object `obj` is passed to the function as the first argument.
    Note: all the arguments must be flat, i.e. no lists, no chains.

    Parameters
    ----------
    fn : Callable
    obj : Any
    *flat_args : list
    remote_executor : _RemoteExecutor, default: _REMOTE_EXEC
        Do not change, it's used to avoid excessive serializations.
    **flat_kwargs : dict

    Returns
    -------
    tuple[Any, int, int, str]
        The execution result, the result length and width, the worker address.
        The length is 0 if the result has no ``__len__`` and the width is 0
        if the result has no ``columns`` attribute.
    """
    result = remote_executor.exec_func(fn, obj, flat_args, flat_kwargs)
    length = len(result) if hasattr(result, "__len__") else 0
    width = len(result.columns) if hasattr(result, "columns") else 0
    ip_address = get_node_ip_address()
    return result, length, width, ip_address
@ray.remote
def _deploy_ray_func(func, *args, return_pandas_df=None, **kwargs):  # pragma: no cover
    """
    Call `func` with the given arguments; used to run `func` remotely.

    Parameters
    ----------
    func : callable
        A local function that we want to call remotely.
    *args : iterable
        Positional arguments to pass to `func` when calling remotely.
    return_pandas_df : bool, optional
        Whether to convert the result of `func` to a pandas DataFrame or not.
    **kwargs : dict
        Keyword arguments to pass to `func` when calling remotely.

    Returns
    -------
    object
        The value returned by `func`. If `return_pandas_df` is truthy and the
        value is not a ``pandas.DataFrame``, it is wrapped into one.
    """
    outcome = func(*args, **kwargs)
    if not return_pandas_df or isinstance(outcome, pandas.DataFrame):
        return outcome
    return pandas.DataFrame(outcome)
""" if isinstance(obj_id, MaterializationHook): obj = obj_id.pre_materialize() return ( obj_id.post_materialize(ray.get(obj)) if isinstance(obj, ray.ObjectRef) else obj ) if not isinstance(obj_id, Sequence): return ray.get(obj_id) if isinstance(obj_id, ray.ObjectRef) else obj_id if all(isinstance(obj, ray.ObjectRef) for obj in obj_id): return ray.get(obj_id) ids = {} result = [] for obj in obj_id: if not isinstance(obj, ObjectRefTypes): result.append(obj) continue if isinstance(obj, MaterializationHook): oid = obj.pre_materialize() if isinstance(oid, ray.ObjectRef): hook = obj obj = oid else: result.append(oid) continue else: hook = None idx = ids.get(obj, None) if idx is None: ids[obj] = idx = len(ids) if hook is None: result.append(obj) else: hook._materialized_idx = idx result.append(hook) if len(ids) == 0: return result materialized = ray.get(list(ids.keys())) for i in range(len(result)): if isinstance((obj := result[i]), ObjectRefTypes): if isinstance(obj, MaterializationHook): result[i] = obj.post_materialize( materialized[obj._materialized_idx] ) else: result[i] = materialized[ids[obj]] return result @classmethod def put(cls, data, **kwargs): """ Store an object in the object store. Parameters ---------- data : object The Python object to be stored. **kwargs : dict Additional keyword arguments. Returns ------- ray.ObjectID Ray object identifier to get the value by. """ if isinstance(data, FunctionType): qname = data.__qualname__ if "" not in qname and "" not in qname: ref = cls._func_cache.get(data, None) if ref is None: if len(cls._func_cache) < 1024: ref = ray.put(data) cls._func_cache[data] = ref else: msg = "To many functions in the RayWrapper cache!" assert "MODIN_GITHUB_CI" not in os.environ, msg ErrorMessage.warn(msg) return ref return ray.put(data, **kwargs) @classmethod def wait(cls, obj_ids, num_returns=None): """ Wait on the objects without materializing them (blocking operation). 
@ray.remote
class SignalActor:  # pragma: no cover
    """
    Help synchronize across tasks and actors on cluster.

    The actor keeps a list of ``asyncio.Event`` objects addressed by index.

    For details see: https://docs.ray.io/en/latest/advanced.html?highlight=signalactor#multi-node-synchronization-using-an-actor

    Parameters
    ----------
    event_count : int
        Number of events required for synchronization.
    """

    def __init__(self, event_count: int):
        events = []
        for _ in range(event_count):
            events.append(asyncio.Event())
        self.events = events

    def send(self, event_idx: int):
        """
        Indicate that event with `event_idx` has occurred.

        Parameters
        ----------
        event_idx : int
        """
        event = self.events[event_idx]
        event.set()

    async def wait(self, event_idx: int):
        """
        Wait until event with `event_idx` has occurred.

        Parameters
        ----------
        event_idx : int
        """
        event = self.events[event_idx]
        await event.wait()

    def is_set(self, event_idx: int) -> bool:
        """
        Check whether event with `event_idx` has occurred or not.

        Parameters
        ----------
        event_idx : int

        Returns
        -------
        bool
        """
        event = self.events[event_idx]
        return event.is_set()
""" raise NotImplementedError() def __reduce__(self): """ Replace this hook with the materialized object on serialization. Returns ------- tuple """ data = RayWrapper.materialize(self) if not isinstance(data, int): raise NotImplementedError("Only integers are currently supported") return int, (data,) ObjectRefTypes = (ray.ObjectRef, MaterializationHook) ================================================ FILE: modin/core/execution/ray/common/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""The module holds utility and initialization routines for Modin on Ray.""" import os import sys import warnings from typing import Optional import psutil import ray from packaging import version from modin.config import ( CIAWSAccessKeyID, CIAWSSecretAccessKey, CpuCount, GithubCI, GpuCount, IsRayCluster, Memory, NPartitions, RayInitCustomResources, RayRedisAddress, RayRedisPassword, ValueSource, ) from modin.core.execution.utils import set_env from modin.error_message import ErrorMessage from .engine_wrapper import ObjectRefTypes, RayWrapper _OBJECT_STORE_TO_SYSTEM_MEMORY_RATIO = 0.6 # This constant should be in sync with the limit in ray, which is private, # not exposed to users, and not documented: # https://github.com/ray-project/ray/blob/4692e8d8023e789120d3f22b41ffb136b50f70ea/python/ray/_private/ray_constants.py#L57-L62 _MAC_OBJECT_STORE_LIMIT_BYTES = 2 * 2**30 _RAY_IGNORE_UNHANDLED_ERRORS_VAR = "RAY_IGNORE_UNHANDLED_ERRORS" ObjectIDType = ObjectRefTypes def initialize_ray( override_is_cluster=False, override_redis_address: str = None, override_redis_password: str = None, ): """ Initialize Ray based on parameters, ``modin.config`` variables and internal defaults. Parameters ---------- override_is_cluster : bool, default: False Whether to override the detection of Modin being run in a cluster and always assume this runs on cluster head node. This also overrides Ray worker detection and always runs the initialization function (runs from main thread only by default). If not specified, ``modin.config.IsRayCluster`` variable is used. override_redis_address : str, optional What Redis address to connect to when running in Ray cluster. If not specified, ``modin.config.RayRedisAddress`` is used. override_redis_password : str, optional What password to use when connecting to Redis. If not specified, ``modin.config.RayRedisPassword`` is used. 
""" # We need these vars to be set for each Ray's worker in order to ensure that # the `pandas` module has been fully imported inside of each process before # any execution begins: # https://github.com/modin-project/modin/pull/4603 env_vars = { "__MODIN_AUTOIMPORT_PANDAS__": "1", "PYTHONWARNINGS": "ignore::FutureWarning", } if GithubCI.get(): # need these to write parquet to the moto service mocking s3. env_vars.update( { "AWS_ACCESS_KEY_ID": CIAWSAccessKeyID.get(), "AWS_SECRET_ACCESS_KEY": CIAWSSecretAccessKey.get(), } ) extra_init_kw = {} is_cluster = override_is_cluster or IsRayCluster.get() if not ray.is_initialized() or override_is_cluster: redis_address = override_redis_address or RayRedisAddress.get() redis_password = ( ( ray.ray_constants.REDIS_DEFAULT_PASSWORD if is_cluster else RayRedisPassword.get() ) if override_redis_password is None and RayRedisPassword.get_value_source() == ValueSource.DEFAULT else override_redis_password or RayRedisPassword.get() ) if is_cluster: extra_init_kw["runtime_env"] = {"env_vars": env_vars} # We only start ray in a cluster setting for the head node. ray.init( address=redis_address or "auto", include_dashboard=False, ignore_reinit_error=True, _redis_password=redis_password, **extra_init_kw, ) else: object_store_memory = _get_object_store_memory() ray_init_kwargs = { "num_cpus": CpuCount.get(), "num_gpus": GpuCount.get(), "include_dashboard": False, "ignore_reinit_error": True, "object_store_memory": object_store_memory, "_redis_password": redis_password, "_memory": object_store_memory, "resources": RayInitCustomResources.get(), **extra_init_kw, } # It should be enough to simply set the required variables for the main process # for Ray to automatically propagate them to each new worker on the same node. # Although Ray doesn't guarantee this behavior it works as expected most of the # time and doesn't enforce us with any overhead that Ray's native `runtime_env` # is usually causing. 
You can visit this gh-issue for more info: # https://github.com/modin-project/modin/issues/5157#issuecomment-1500225150 with set_env(**env_vars): ray.init(**ray_init_kwargs) # Now ray is initialized, check runtime env config - especially useful if we join # an externally pre-configured cluster runtime_env_vars = ray.get_runtime_context().runtime_env.get("env_vars", {}) for varname, varvalue in env_vars.items(): if str(runtime_env_vars.get(varname, "")) != str(varvalue): if is_cluster: ErrorMessage.single_warning( "When using a pre-initialized Ray cluster, please ensure that the runtime env " + f"sets environment variable {varname} to {varvalue}" ) num_cpus = int(ray.cluster_resources()["CPU"]) NPartitions._put(num_cpus) CpuCount._put(num_cpus) # TODO(https://github.com/ray-project/ray/issues/28216): remove this # workaround once Ray gives a better way to suppress task errors. # Ideally we would not set global environment variables. # If user has explicitly set _RAY_IGNORE_UNHANDLED_ERRORS_VAR, don't # don't override its value. if _RAY_IGNORE_UNHANDLED_ERRORS_VAR not in os.environ: os.environ[_RAY_IGNORE_UNHANDLED_ERRORS_VAR] = "1" def _get_object_store_memory() -> Optional[int]: """ Get the object store memory we should start Ray with, in bytes. - If the ``Memory`` config variable is set, return that. - On Linux, take system memory from /dev/shm. On other systems use total virtual memory. - On Mac, never return more than Ray-specified upper limit. Returns ------- Optional[int] The object store memory size in bytes, or None if we should use the Ray default. 
""" object_store_memory = Memory.get() if object_store_memory is not None: return object_store_memory virtual_memory = psutil.virtual_memory().total if sys.platform.startswith("linux"): shm_fd = os.open("/dev/shm", os.O_RDONLY) try: shm_stats = os.fstatvfs(shm_fd) system_memory = shm_stats.f_bsize * shm_stats.f_bavail if system_memory / (virtual_memory / 2) < 0.99: warnings.warn( f"The size of /dev/shm is too small ({system_memory} bytes). The required size " + f"at least half of RAM ({virtual_memory // 2} bytes). Please, delete files in /dev/shm or " + "increase size of /dev/shm with --shm-size in Docker. Also, you can can override the memory " + "size for each Ray worker (in bytes) to the MODIN_MEMORY environment variable." ) finally: os.close(shm_fd) else: system_memory = virtual_memory bytes_per_gb = 1e9 object_store_memory = int( _OBJECT_STORE_TO_SYSTEM_MEMORY_RATIO * system_memory // bytes_per_gb * bytes_per_gb ) if object_store_memory == 0: return None # Newer versions of ray don't allow us to initialize ray with object store # size larger than that _MAC_OBJECT_STORE_LIMIT_BYTES. It seems that # object store > the limit is too slow even on ray 1.0.0. However, limiting # the object store to _MAC_OBJECT_STORE_LIMIT_BYTES only seems to start # helping at ray version 1.3.0. So if ray version is at least 1.3.0, cap # the object store at _MAC_OBJECT_STORE_LIMIT_BYTES. # For background on the ray bug see: # - https://github.com/ray-project/ray/issues/20388 # - https://github.com/modin-project/modin/issues/4872 if sys.platform == "darwin" and version.parse(ray.__version__) >= version.parse( "1.3.0" ): object_store_memory = min(object_store_memory, _MAC_OBJECT_STORE_LIMIT_BYTES) return object_store_memory def deserialize(obj): # pragma: no cover """ Deserialize a Ray object. Parameters ---------- obj : ObjectIDType, iterable of ObjectIDType, or mapping of keys to ObjectIDTypes Object(s) to deserialize. Returns ------- obj The deserialized object. 
""" if isinstance(obj, ObjectIDType): return RayWrapper.materialize(obj) elif isinstance(obj, (tuple, list)): # Ray will error if any elements are not ObjectIDType, but we still want ray to # perform batch deserialization for us -- thus, we must submit only the list elements # that are ObjectIDType, deserialize them, and restore them to their correct list index oid_indices, oids = [], [] for i, ray_id in enumerate(obj): if isinstance(ray_id, ObjectIDType): oid_indices.append(i) oids.append(ray_id) ray_result = RayWrapper.materialize(oids) new_lst = list(obj[:]) for i, deser_item in zip(oid_indices, ray_result): new_lst[i] = deser_item # Check that all objects have been deserialized assert not any([isinstance(o, ObjectIDType) for o in new_lst]) return new_lst elif isinstance(obj, dict) and any( isinstance(val, ObjectIDType) for val in obj.values() ): return dict(zip(obj.keys(), RayWrapper.materialize(list(obj.values())))) else: return obj ================================================ FILE: modin/core/execution/ray/generic/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Generic functionality for Ray execution engine.""" ================================================ FILE: modin/core/execution/ray/generic/io/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Generic IO functionality for Ray execution engine.""" from .io import RayIO __all__ = ["RayIO"] ================================================ FILE: modin/core/execution/ray/generic/io/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
class RayIO(BaseIO):
    """Base class for doing I/O operations over Ray."""

    @classmethod
    def from_ray(cls, ray_obj):
        """
        Create a Modin `query_compiler` from a Ray Dataset.

        This base implementation always raises; subclasses are expected to
        override it.

        Parameters
        ----------
        ray_obj : ray.data.Dataset
            The Ray Dataset to convert from.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler containing data from the Ray Dataset.

        Raises
        ------
        NotImplementedError
            Always, unless the method is overridden in a subclass.
        """
        message = f"Modin dataset can't be created from `ray.data.Dataset` using {cls}."
        raise NotImplementedError(message)

    @classmethod
    def to_ray(cls, modin_obj):
        """
        Convert a Modin DataFrame/Series to a Ray Dataset.

        This base implementation always raises; subclasses are expected to
        override it.

        Parameters
        ----------
        modin_obj : modin.pandas.DataFrame, modin.pandas.Series
            The Modin DataFrame/Series to convert.

        Returns
        -------
        ray.data.Dataset
            Converted object with type depending on input.

        Raises
        ------
        NotImplementedError
            Always, unless the method is overridden in a subclass.
        """
        message = f"`ray.data.Dataset` can't be created from Modin DataFrame/Series using {cls}."
        raise NotImplementedError(message)
You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Generic partitioning functionality for Ray execution engine."""

from .partition_manager import GenericRayDataframePartitionManager

__all__ = [
    "GenericRayDataframePartitionManager",
]


================================================
FILE: modin/core/execution/ray/generic/partitioning/partition_manager.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""The module holds Modin partition manager implemented for Ray."""

import numpy as np

from modin.core.dataframe.pandas.partitioning.partition_manager import (
    PandasDataframePartitionManager,
)
from modin.core.execution.ray.common import RayWrapper


class GenericRayDataframePartitionManager(PandasDataframePartitionManager):
    """The class implements the interface in `PandasDataframePartitionManager`."""

    @classmethod
    def to_numpy(cls, partitions, **kwargs):
        """
        Convert `partitions` into a NumPy array.

        Parameters
        ----------
        partitions : NumPy array
            A 2-D array of partitions to convert to local NumPy array.
        **kwargs : dict
            Keyword arguments to pass to each partition ``.to_numpy()`` call.

        Returns
        -------
        NumPy array

        Notes
        -----
        With a single column of partitions the underlying objects are fetched
        first and converted locally. Otherwise the conversion is applied to
        every partition remotely and the resulting blocks are materialized in
        one call. In both cases `kwargs` reach ``.to_numpy()``.
        """
        if partitions.shape[1] == 1:
            parts = cls.get_objects_from_partitions(partitions.flatten())
            parts = [part.to_numpy(**kwargs) for part in parts]
        else:
            parts = RayWrapper.materialize(
                [
                    # `kwargs` must be forwarded through ``apply``; without it
                    # the lambda would always be invoked with no keyword
                    # arguments and the caller's options would be ignored.
                    obj.apply(
                        lambda df, **to_numpy_kwargs: df.to_numpy(**to_numpy_kwargs),
                        **kwargs,
                    ).list_of_blocks[0]
                    for row in partitions
                    for obj in row
                ]
            )
        rows, cols = partitions.shape
        # regroup the flat list into the rows x cols layout expected by ``np.block``
        parts = [parts[i * cols : (i + 1) * cols] for i in range(rows)]
        return np.block(parts)


================================================
FILE: modin/core/execution/ray/implementations/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Ray execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to Ray execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Base Modin Dataframe class optimized for pandas on Ray execution."""

from .dataframe import PandasOnRayDataframe

__all__ = ["PandasOnRayDataframe"]


================================================
FILE: modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses class that implements ``PandasDataframe`` using Ray."""

from modin.core.dataframe.base.dataframe.utils import Axis
from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
from modin.utils import _inherit_docstrings

from ..partitioning.partition_manager import PandasOnRayDataframePartitionManager


class PandasOnRayDataframe(PandasDataframe):
    """
    Ray-backed implementation of the ``PandasDataframe`` interface.

    Parameters
    ----------
    partitions : np.ndarray
        A 2D NumPy array of partitions.
    index : sequence
        The index for the dataframe. Converted to a ``pandas.Index``.
    columns : sequence
        The columns object for the dataframe. Converted to a ``pandas.Index``.
    row_lengths : list, optional
        The length of each partition in the rows. The "height" of
        each of the block partitions. Is computed if not provided.
    column_widths : list, optional
        The width of each partition in the columns. The "width" of
        each of the block partitions. Is computed if not provided.
    dtypes : pandas.Series, optional
        The data types for the dataframe columns.
    pandas_backend : {"pyarrow", None}, optional
        Backend used by pandas. None - means default NumPy backend.
    """

    _partition_mgr_cls = PandasOnRayDataframePartitionManager

    def _get_lengths(self, parts, axis):
        """
        Collect the dimension of every given partition along `axis`.

        Parameters
        ----------
        parts : list
            List of partitions.
        axis : {0, 1}
            The axis along which to get the lengths (0 - length across rows or, 1 - width across columns).

        Returns
        -------
        list
        """
        # ask each partition for a possibly non-materialized dimension first,
        # then resolve all of them with a single materialization call
        dim_getter = "length" if axis == Axis.ROW_WISE else "width"
        pending = [getattr(part, dim_getter)(False) for part in parts]
        return self._partition_mgr_cls.materialize_futures(pending)

    @property
    @_inherit_docstrings(PandasDataframe.engine)
    def engine(self) -> str:
        return "Ray"


================================================
FILE: modin/core/execution/ray/implementations/pandas_on_ray/io/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base IO classes optimized for pandas on Ray execution.""" from .io import PandasOnRayIO __all__ = ["PandasOnRayIO"] ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/io/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""The module holds the factory which performs I/O using pandas on Ray.""" import io import numpy as np import pandas from pandas.io.common import get_handle, stringify_path from ray.data import from_pandas_refs from modin.config import RayTaskCustomResources from modin.core.execution.ray.common import RayWrapper, SignalActor from modin.core.execution.ray.generic.io import RayIO from modin.core.io import ( CSVDispatcher, ExcelDispatcher, FeatherDispatcher, FWFDispatcher, JSONDispatcher, ParquetDispatcher, SQLDispatcher, ) from modin.core.storage_formats.pandas.parsers import ( PandasCSVParser, PandasExcelParser, PandasFeatherParser, PandasFWFParser, PandasJSONParser, PandasParquetParser, PandasSQLParser, ) from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler from modin.distributed.dataframe.pandas.partitions import ( from_partitions, unwrap_partitions, ) from modin.experimental.core.io import ( ExperimentalCSVGlobDispatcher, ExperimentalCustomTextDispatcher, ExperimentalGlobDispatcher, ExperimentalSQLDispatcher, ) from modin.experimental.core.storage_formats.pandas.parsers import ( ExperimentalCustomTextParser, ExperimentalPandasCSVGlobParser, ExperimentalPandasJsonParser, ExperimentalPandasParquetParser, ExperimentalPandasPickleParser, ExperimentalPandasXmlParser, ) from ..dataframe import PandasOnRayDataframe from ..partitioning import PandasOnRayDataframePartition class PandasOnRayIO(RayIO): """Factory providing methods for performing I/O operations using pandas as storage format on Ray as engine.""" frame_cls = PandasOnRayDataframe frame_partition_cls = PandasOnRayDataframePartition query_compiler_cls = PandasQueryCompiler build_args = dict( frame_partition_cls=PandasOnRayDataframePartition, query_compiler_cls=PandasQueryCompiler, frame_cls=PandasOnRayDataframe, base_io=RayIO, ) def __make_read(*classes, build_args=build_args): # used to reduce code duplication return type("", (RayWrapper, *classes), build_args).read def 
__make_write(*classes, build_args=build_args): # used to reduce code duplication return type("", (RayWrapper, *classes), build_args).write read_csv = __make_read(PandasCSVParser, CSVDispatcher) read_fwf = __make_read(PandasFWFParser, FWFDispatcher) read_json = __make_read(PandasJSONParser, JSONDispatcher) read_parquet = __make_read(PandasParquetParser, ParquetDispatcher) to_parquet = __make_write(ParquetDispatcher) # Blocked on pandas-dev/pandas#12236. It is faster to default to pandas. # read_hdf = __make_read(PandasHDFParser, HDFReader) read_feather = __make_read(PandasFeatherParser, FeatherDispatcher) read_sql = __make_read(PandasSQLParser, SQLDispatcher) to_sql = __make_write(SQLDispatcher) read_excel = __make_read(PandasExcelParser, ExcelDispatcher) # experimental methods that don't exist in pandas read_csv_glob = __make_read( ExperimentalPandasCSVGlobParser, ExperimentalCSVGlobDispatcher ) read_parquet_glob = __make_read( ExperimentalPandasParquetParser, ExperimentalGlobDispatcher ) to_parquet_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": RayIO.to_parquet}, ) read_json_glob = __make_read( ExperimentalPandasJsonParser, ExperimentalGlobDispatcher ) to_json_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": RayIO.to_json}, ) read_xml_glob = __make_read(ExperimentalPandasXmlParser, ExperimentalGlobDispatcher) to_xml_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": RayIO.to_xml}, ) read_pickle_glob = __make_read( ExperimentalPandasPickleParser, ExperimentalGlobDispatcher ) to_pickle_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": RayIO.to_pickle}, ) read_custom_text = __make_read( ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher ) read_sql_distributed = __make_read( ExperimentalSQLDispatcher, build_args={**build_args, "base_read": read_sql} ) del __make_read # to not pollute class namespace 
del __make_write # to not pollute class namespace @staticmethod def _to_csv_check_support(kwargs): """ Check if parallel version of ``to_csv`` could be used. Parameters ---------- kwargs : dict Keyword arguments passed to ``.to_csv()``. Returns ------- bool Whether parallel version of ``to_csv`` is applicable. """ path_or_buf = kwargs["path_or_buf"] compression = kwargs["compression"] if not isinstance(path_or_buf, str): return False # case when the pointer is placed at the beginning of the file. if "r" in kwargs["mode"] and "+" in kwargs["mode"]: return False # encodings with BOM don't support; # instead of one mark in result bytes we will have them by the number of partitions # so we should fallback in pandas for `utf-16`, `utf-32` with all aliases, in instance # (`utf_32_be`, `utf_16_le` and so on) if kwargs["encoding"] is not None: encoding = kwargs["encoding"].lower() if "u" in encoding or "utf" in encoding: if "16" in encoding or "32" in encoding: return False if compression is None or not compression == "infer": return False if any((path_or_buf.endswith(ext) for ext in [".gz", ".bz2", ".zip", ".xz"])): return False return True @classmethod def to_csv(cls, qc, **kwargs): """ Write records stored in the `qc` to a CSV file. Parameters ---------- qc : BaseQueryCompiler The query compiler of the Modin dataframe that we want to run ``to_csv`` on. **kwargs : dict Parameters for ``pandas.to_csv(**kwargs)``. """ kwargs["path_or_buf"] = stringify_path(kwargs["path_or_buf"]) if not cls._to_csv_check_support(kwargs): return RayIO.to_csv(qc, **kwargs) signals = SignalActor.options(resources=RayTaskCustomResources.get()).remote( len(qc._modin_frame._partitions) + 1 ) def func(df, **kw): # pragma: no cover """ Dump a chunk of rows as csv, then save them to target maintaining order. Parameters ---------- df : pandas.DataFrame A chunk of rows to write to a CSV file. 
**kw : dict Arguments to pass to ``pandas.to_csv(**kw)`` plus an extra argument `partition_idx` serving as chunk index to maintain rows order. """ partition_idx = kw["partition_idx"] # the copy is made to not implicitly change the input parameters; # to write to an intermediate buffer, we need to change `path_or_buf` in kwargs csv_kwargs = kwargs.copy() if partition_idx != 0: # we need to create a new file only for first recording # all the rest should be recorded in appending mode if "w" in csv_kwargs["mode"]: csv_kwargs["mode"] = csv_kwargs["mode"].replace("w", "a") # It is enough to write the header for the first partition csv_kwargs["header"] = False # for parallelization purposes, each partition is written to an intermediate buffer path_or_buf = csv_kwargs["path_or_buf"] is_binary = "b" in csv_kwargs["mode"] csv_kwargs["path_or_buf"] = io.BytesIO() if is_binary else io.StringIO() storage_options = csv_kwargs.pop("storage_options", None) df.to_csv(**csv_kwargs) csv_kwargs.update({"storage_options": storage_options}) content = csv_kwargs["path_or_buf"].getvalue() csv_kwargs["path_or_buf"].close() # each process waits for its turn to write to a file RayWrapper.materialize(signals.wait.remote(partition_idx)) # preparing to write data from the buffer to a file with get_handle( path_or_buf, # in case when using URL in implicit text mode # pandas try to open `path_or_buf` in binary mode csv_kwargs["mode"] if is_binary else csv_kwargs["mode"] + "t", encoding=kwargs["encoding"], errors=kwargs["errors"], compression=kwargs["compression"], storage_options=kwargs.get("storage_options", None), is_text=not is_binary, ) as handles: handles.handle.write(content) # signal that the next process can start writing to the file RayWrapper.materialize(signals.send.remote(partition_idx + 1)) # used for synchronization purposes return pandas.DataFrame() # signaling that the partition with id==0 can be written to the file RayWrapper.materialize(signals.send.remote(0)) # Ensure that the 
metadata is syncrhonized qc._modin_frame._propagate_index_objs(axis=None) result = qc._modin_frame._partition_mgr_cls.map_axis_partitions( axis=1, partitions=qc._modin_frame._partitions, map_func=func, keep_partitioning=True, lengths=None, enumerate_partitions=True, max_retries=0, ) # pending completion RayWrapper.materialize( [part.list_of_blocks[0] for row in result for part in row] ) @classmethod def from_ray(cls, ray_obj): """ Create a Modin `query_compiler` from a Ray Dataset. Parameters ---------- ray_obj : ray.data.Dataset The Ray Dataset to convert from. Returns ------- BaseQueryCompiler QueryCompiler containing data from the Ray Dataset. """ pd_objs = ray_obj.to_pandas_refs() return from_partitions(pd_objs, axis=0)._query_compiler @classmethod def to_ray(cls, modin_obj): """ Convert a Modin DataFrame/Series to a Ray Dataset. Parameters ---------- modin_obj : modin.pandas.DataFrame, modin.pandas.Series The Modin DataFrame/Series to convert. Returns ------- ray.data.Dataset Converted object with type depending on input. """ parts = unwrap_partitions(modin_obj, axis=0) return from_pandas_refs(parts) @classmethod def from_map(cls, func, iterable, *args, **kwargs): """ Create a Modin `query_compiler` from a map function. This method will construct a Modin `query_compiler` split by row partitions. The number of row partitions matches the number of elements in the iterable object. Parameters ---------- func : callable Function to map across the iterable object. iterable : Iterable An iterable object. *args : tuple Positional arguments to pass in `func`. **kwargs : dict Keyword arguments to pass in `func`. Returns ------- BaseQueryCompiler QueryCompiler containing data returned by map function. 
""" func = cls.frame_cls._partition_mgr_cls.preprocess_func(func) partitions = np.array( [ [ cls.frame_partition_cls( RayWrapper.deploy( func, f_args=(obj,) + args, return_pandas_df=True, **kwargs ) ) ] for obj in iterable ] ) return cls.query_compiler_cls(cls.frame_cls(partitions)) ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Base Modin Dataframe classes related to its partitioning and optimized for pandas on Ray execution.""" from .partition import PandasOnRayDataframePartition from .partition_manager import PandasOnRayDataframePartitionManager from .virtual_partition import ( PandasOnRayDataframeColumnPartition, PandasOnRayDataframeRowPartition, PandasOnRayDataframeVirtualPartition, ) __all__ = [ "PandasOnRayDataframePartition", "PandasOnRayDataframePartitionManager", "PandasOnRayDataframeVirtualPartition", "PandasOnRayDataframeColumnPartition", "PandasOnRayDataframeRowPartition", ] ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses class that wraps data (block partition) and its metadata.""" from typing import Callable, Union import pandas import ray from modin.config import LazyExecution, RayTaskCustomResources from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition from modin.core.execution.ray.common import MaterializationHook, RayWrapper from modin.core.execution.ray.common.deferred_execution import ( DeferredExecution, MetaList, MetaListHook, ) from modin.core.execution.ray.common.utils import ObjectIDType from modin.logging import disable_logging, get_logger from modin.pandas.indexing import compute_sliced_len from modin.utils import _inherit_docstrings class PandasOnRayDataframePartition(PandasDataframePartition): """ The class implements the interface in ``PandasDataframePartition``. Parameters ---------- data : ObjectIDType or DeferredExecution A reference to ``pandas.DataFrame`` that needs to be wrapped with this class or a reference to DeferredExecution that needs to be executed on demand. length : ObjectIDType or int, optional Length or reference to it of wrapped ``pandas.DataFrame``. width : ObjectIDType or int, optional Width or reference to it of wrapped ``pandas.DataFrame``. ip : ObjectIDType or str, optional Node IP address or reference to it that holds wrapped ``pandas.DataFrame``. meta : MetaList Meta information, containing the lengths and the worker address (the last value). meta_offset : int The lengths offset in the meta list. """ execution_wrapper = RayWrapper def __init__( self, data: Union[ray.ObjectRef, DeferredExecution], length: int = None, width: int = None, ip: str = None, meta: MetaList = None, meta_offset: int = 0, ): super().__init__() if isinstance(data, DeferredExecution): data.subscribe() self._data_ref = data # The metadata is stored in the MetaList at 0 offset. If the data is # a DeferredExecution, the _meta will be replaced with the list, returned # by the remote function. 
The returned list may contain data for multiple # results and, in this case, _meta_offset corresponds to the meta related to # this partition. if meta is None: self._meta = MetaList([length, width, ip]) self._meta_offset = 0 else: self._meta = meta self._meta_offset = meta_offset log = get_logger() self._is_debug(log) and log.debug( "Partition ID: {}, Height: {}, Width: {}, Node IP: {}".format( self._identity, str(self._length_cache), str(self._width_cache), str(self._ip_cache), ) ) @disable_logging def __del__(self): """Unsubscribe from DeferredExecution.""" if isinstance(self._data_ref, DeferredExecution): self._data_ref.unsubscribe() def apply(self, func: Union[Callable, ray.ObjectRef], *args, **kwargs): """ Apply a function to the object wrapped by this partition. Parameters ---------- func : callable or ray.ObjectRef A function to apply. *args : iterable Additional positional arguments to be passed in `func`. **kwargs : dict Additional keyword arguments to be passed in `func`. Returns ------- PandasOnRayDataframePartition A new ``PandasOnRayDataframePartition`` object. Notes ----- It does not matter if `func` is callable or an ``ray.ObjectRef``. Ray will handle it correctly either way. The keyword arguments are sent as a dictionary. 
""" log = get_logger() self._is_debug(log) and log.debug(f"ENTER::Partition.apply::{self._identity}") de = DeferredExecution(self._data_ref, func, args, kwargs) data, meta, meta_offset = de.exec() self._is_debug(log) and log.debug(f"EXIT::Partition.apply::{self._identity}") return self.__constructor__(data, meta=meta, meta_offset=meta_offset) @_inherit_docstrings(PandasDataframePartition.add_to_apply_calls) def add_to_apply_calls( self, func: Union[Callable, ray.ObjectRef], *args, length=None, width=None, **kwargs, ): return self.__constructor__( data=DeferredExecution(self._data_ref, func, args, kwargs), length=length, width=width, ) @_inherit_docstrings(PandasDataframePartition.drain_call_queue) def drain_call_queue(self): data = self._data_ref if not isinstance(data, DeferredExecution): return data log = get_logger() self._is_debug(log) and log.debug( f"ENTER::Partition.drain_call_queue::{self._identity}" ) self._data_ref, self._meta, self._meta_offset = data.exec() self._is_debug(log) and log.debug( f"EXIT::Partition.drain_call_queue::{self._identity}" ) @_inherit_docstrings(PandasDataframePartition.wait) def wait(self): self.drain_call_queue() RayWrapper.wait(self._data_ref) def __copy__(self): """ Create a copy of this partition. Returns ------- PandasOnRayDataframePartition A copy of this partition. """ return self.__constructor__( self._data_ref, meta=self._meta, meta_offset=self._meta_offset, ) def mask(self, row_labels, col_labels): """ Lazily create a mask that extracts the indices provided. Parameters ---------- row_labels : list-like, slice or label The row labels for the rows to extract. col_labels : list-like, slice or label The column labels for the columns to extract. Returns ------- PandasOnRayDataframePartition A new ``PandasOnRayDataframePartition`` object. 
""" log = get_logger() self._is_debug(log) and log.debug(f"ENTER::Partition.mask::{self._identity}") new_obj = super().mask(row_labels, col_labels) if isinstance(row_labels, slice) and isinstance( (len_cache := self._length_cache), ObjectIDType ): if row_labels == slice(None): # fast path - full axis take new_obj._length_cache = len_cache else: new_obj._length_cache = SlicerHook(len_cache, row_labels) if isinstance(col_labels, slice) and isinstance( (width_cache := self._width_cache), ObjectIDType ): if col_labels == slice(None): # fast path - full axis take new_obj._width_cache = width_cache else: new_obj._width_cache = SlicerHook(width_cache, col_labels) self._is_debug(log) and log.debug(f"EXIT::Partition.mask::{self._identity}") return new_obj @classmethod def put(cls, obj: pandas.DataFrame): """ Put the data frame into Plasma store and wrap it with partition object. Parameters ---------- obj : pandas.DataFrame A data frame to be put. Returns ------- PandasOnRayDataframePartition A new ``PandasOnRayDataframePartition`` object. """ return cls(cls.execution_wrapper.put(obj), len(obj.index), len(obj.columns)) @classmethod def preprocess_func(cls, func): """ Put a function into the Plasma store to use in ``apply``. Parameters ---------- func : callable A function to preprocess. Returns ------- ray.ObjectRef A reference to `func`. """ return cls.execution_wrapper.put(func) def length(self, materialize=True): """ Get the length of the object wrapped by this partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- int or ray.ObjectRef The length of the object. 
""" if (length := self._length_cache) is None: self.drain_call_queue() if (length := self._length_cache) is None: length, self._width_cache = _get_index_and_columns.options( resources=RayTaskCustomResources.get() ).remote(self._data_ref) self._length_cache = length if materialize and isinstance(length, ObjectIDType): self._length_cache = length = RayWrapper.materialize(length) return length def width(self, materialize=True): """ Get the width of the object wrapped by the partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- int or ray.ObjectRef The width of the object. """ if (width := self._width_cache) is None: self.drain_call_queue() if (width := self._width_cache) is None: self._length_cache, width = _get_index_and_columns.options( resources=RayTaskCustomResources.get() ).remote(self._data_ref) self._width_cache = width if materialize and isinstance(width, ObjectIDType): self._width_cache = width = RayWrapper.materialize(width) return width def ip(self, materialize=True): """ Get the node IP address of the object wrapped by this partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- str IP address of the node that holds the data. 
""" if (ip := self._ip_cache) is None: self.drain_call_queue() if materialize and isinstance(ip, ObjectIDType): self._ip_cache = ip = RayWrapper.materialize(ip) return ip @property def _data(self) -> ray.ObjectRef: # noqa: GL08 self.drain_call_queue() return self._data_ref @property def _length_cache(self): # noqa: GL08 return self._meta[self._meta_offset] @_length_cache.setter def _length_cache(self, value): # noqa: GL08 self._meta[self._meta_offset] = value @property def _width_cache(self): # noqa: GL08 return self._meta[self._meta_offset + 1] @_width_cache.setter def _width_cache(self, value): # noqa: GL08 self._meta[self._meta_offset + 1] = value @property def _ip_cache(self): # noqa: GL08 return self._meta[-1] @_ip_cache.setter def _ip_cache(self, value): # noqa: GL08 self._meta[-1] = value @ray.remote(num_returns=2) def _get_index_and_columns(df): # pragma: no cover """ Get the number of rows and columns of a pandas DataFrame. Parameters ---------- df : pandas.DataFrame A pandas DataFrame which dimensions are needed. Returns ------- int The number of rows. int The number of columns. 
""" return len(df.index), len(df.columns) PandasOnRayDataframePartition._eager_exec_func = PandasOnRayDataframePartition.apply PandasOnRayDataframePartition._lazy_exec_func = ( PandasOnRayDataframePartition.add_to_apply_calls ) def _configure_lazy_exec(cls: LazyExecution): """Configure lazy execution mode for PandasOnRayDataframePartition.""" mode = cls.get() get_logger().debug(f"Ray lazy execution mode: {mode}") if mode == "Auto": PandasOnRayDataframePartition.apply = ( PandasOnRayDataframePartition._eager_exec_func ) PandasOnRayDataframePartition.add_to_apply_calls = ( PandasOnRayDataframePartition._lazy_exec_func ) elif mode == "On": def lazy_exec(self, func, *args, **kwargs): return self._lazy_exec_func(func, *args, length=None, width=None, **kwargs) PandasOnRayDataframePartition.apply = lazy_exec PandasOnRayDataframePartition.add_to_apply_calls = ( PandasOnRayDataframePartition._lazy_exec_func ) elif mode == "Off": def eager_exec(self, func, *args, length=None, width=None, **kwargs): return self._eager_exec_func(func, *args, **kwargs) PandasOnRayDataframePartition.apply = ( PandasOnRayDataframePartition._eager_exec_func ) PandasOnRayDataframePartition.add_to_apply_calls = eager_exec else: raise ValueError(f"Invalid lazy execution mode: {mode}") LazyExecution.subscribe(_configure_lazy_exec) class SlicerHook(MaterializationHook): """ Used by mask() for the slilced length computation. Parameters ---------- ref : ObjectIDType Non-materialized length to be sliced. slc : slice The slice to be applied. """ def __init__(self, ref: ObjectIDType, slc: slice): self.ref = ref self.slc = slc def pre_materialize(self): """ Get the sliced length or object ref if not materialized. 
Returns ------- int or ObjectIDType """ if isinstance(self.ref, MetaListHook): len_or_ref = self.ref.pre_materialize() return ( compute_sliced_len(self.slc, len_or_ref) if isinstance(len_or_ref, int) else len_or_ref ) return self.ref def post_materialize(self, materialized): """ Get the sliced length. Parameters ---------- materialized : list or int Returns ------- int """ if isinstance(self.ref, MetaListHook): materialized = self.ref.post_materialize(materialized) return compute_sliced_len(self.slc, materialized) ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses class that implements ``GenericRayDataframePartitionManager`` using Ray.""" import numpy as np from pandas.core.dtypes.common import is_numeric_dtype from modin.config import AsyncReadMode from modin.core.execution.modin_aqp import progress_bar_wrapper from modin.core.execution.ray.common import RayWrapper from modin.core.execution.ray.generic.partitioning import ( GenericRayDataframePartitionManager, ) from modin.logging import get_logger from modin.utils import _inherit_docstrings from .partition import PandasOnRayDataframePartition from .virtual_partition import ( PandasOnRayDataframeColumnPartition, PandasOnRayDataframeRowPartition, ) class PandasOnRayDataframePartitionManager(GenericRayDataframePartitionManager): """The class implements the interface in `PandasDataframePartitionManager`.""" # This object uses RayRemotePartition objects as the underlying store. _partition_class = PandasOnRayDataframePartition _column_partitions_class = PandasOnRayDataframeColumnPartition _row_partition_class = PandasOnRayDataframeRowPartition _execution_wrapper = RayWrapper materialize_futures = RayWrapper.materialize @classmethod def wait_partitions(cls, partitions): """ Wait on the objects wrapped by `partitions` in parallel, without materializing them. This method will block until all computations in the list have completed. Parameters ---------- partitions : np.ndarray NumPy array with ``PandasDataframePartition``-s. """ RayWrapper.wait( [block for partition in partitions for block in partition.list_of_blocks] ) @classmethod @_inherit_docstrings( GenericRayDataframePartitionManager.split_pandas_df_into_partitions ) def split_pandas_df_into_partitions( cls, df, row_chunksize, col_chunksize, update_bar ): # it was found out, that with the following condition it's more beneficial # to use the distributed splitting, let's break them down: # 1. 
The distributed splitting is used only when there's more than 6mln elements # in the `df`, as with fewer data it's better to use the sequential splitting # 2. Only used with numerical data, as with other dtypes, putting the whole big # dataframe into the storage takes too much time. # 3. The distributed splitting consumes more memory that the sequential one. # It was estimated that it requires ~2.5x of the dataframe size, for now there # was no good way found to automatically fall back to the sequential # implementation in case of not enough memory, so currently we're enabling # the distributed version only if 'AsyncReadMode' is set to True. Follow this # discussion for more info on why automatical dispatching is hard: # https://github.com/modin-project/modin/pull/6640#issuecomment-1759932664 enough_elements = (len(df) * len(df.columns)) > 6_000_000 all_numeric_types = all(is_numeric_dtype(dtype) for dtype in df.dtypes) async_mode_on = AsyncReadMode.get() distributed_splitting = enough_elements and all_numeric_types and async_mode_on log = get_logger() if not distributed_splitting: log.info( "Using sequential splitting in '.from_pandas()' because of some of the conditions are False: " + f"{enough_elements=}; {all_numeric_types=}; {async_mode_on=}" ) return super().split_pandas_df_into_partitions( df, row_chunksize, col_chunksize, update_bar ) log.info("Using distributed splitting in '.from_pandas()'") put_func = cls._partition_class.put def mask(part, row_loc, col_loc): # 2D iloc works surprisingly slow, so doing this chained iloc calls: # https://github.com/pandas-dev/pandas/issues/55202 return part.apply(lambda df: df.iloc[row_loc, :].iloc[:, col_loc]) main_part = put_func(df) parts = [ [ update_bar( mask( main_part, slice(i, i + row_chunksize), slice(j, j + col_chunksize), ), ) for j in range(0, len(df.columns), col_chunksize) ] for i in range(0, len(df), row_chunksize) ] return np.array(parts) def _make_wrapped_method(name: str): """ Define new attribute that 
should work with progress bar. Parameters ---------- name : str Name of `GenericRayDataframePartitionManager` attribute that should be reused. Notes ----- - `classmethod` decorator shouldn't be applied twice, so we refer to `__func__` attribute. - New attribute is defined for `PandasOnRayDataframePartitionManager`. """ setattr( PandasOnRayDataframePartitionManager, name, classmethod( progress_bar_wrapper( getattr(GenericRayDataframePartitionManager, name).__func__ ) ), ) for method in ( "map_partitions", "lazy_map_partitions", "map_axis_partitions", "_apply_func_to_list_of_partitions", "apply_func_to_select_indices", "apply_func_to_select_indices_along_full_axis", "apply_func_to_indices_both_axis", "n_ary_operation", ): _make_wrapped_method(method) ================================================ FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses classes responsible for storing a virtual partition and applying a function to it.""" import pandas import ray from ray.util import get_node_ip_address from modin.config import RayTaskCustomResources from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) from modin.core.execution.ray.common import RayWrapper from modin.utils import _inherit_docstrings from .partition import PandasOnRayDataframePartition class PandasOnRayDataframeVirtualPartition(PandasDataframeAxisPartition): """ The class implements the interface in ``PandasDataframeAxisPartition``. Parameters ---------- list_of_partitions : Union[list, PandasOnRayDataframePartition] List of ``PandasOnRayDataframePartition`` and ``PandasOnRayDataframeVirtualPartition`` objects, or a single ``PandasOnRayDataframePartition``. get_ip : bool, default: False Whether to get node IP addresses to conforming partitions or not. full_axis : bool, default: True Whether or not the virtual partition encompasses the whole axis. call_queue : list, optional A list of tuples (callable, args, kwargs) that contains deferred calls. length : ray.ObjectRef or int, optional Length, or reference to length, of wrapped ``pandas.DataFrame``. width : ray.ObjectRef or int, optional Width, or reference to width, of wrapped ``pandas.DataFrame``. 
""" _PARTITIONS_METADATA_LEN = 3 # (length, width, ip) partition_type = PandasOnRayDataframePartition axis = None # these variables are intentionally initialized at runtime (see #6023) _DEPLOY_AXIS_FUNC = None _DEPLOY_SPLIT_FUNC = None _DRAIN_FUNC = None @classmethod def _get_deploy_axis_func(cls): # noqa: GL08 if cls._DEPLOY_AXIS_FUNC is None: cls._DEPLOY_AXIS_FUNC = RayWrapper.put( PandasDataframeAxisPartition.deploy_axis_func ) return cls._DEPLOY_AXIS_FUNC @classmethod def _get_deploy_split_func(cls): # noqa: GL08 if cls._DEPLOY_SPLIT_FUNC is None: cls._DEPLOY_SPLIT_FUNC = RayWrapper.put( PandasDataframeAxisPartition.deploy_splitting_func ) return cls._DEPLOY_SPLIT_FUNC @classmethod def _get_drain_func(cls): # noqa: GL08 if cls._DRAIN_FUNC is None: cls._DRAIN_FUNC = RayWrapper.put(PandasDataframeAxisPartition.drain) return cls._DRAIN_FUNC @property def list_of_ips(self): """ Get the IPs holding the physical objects composing this partition. Returns ------- List A list of IPs as ``ray.ObjectRef`` or str. 
""" # Defer draining call queue until we get the ip address result = [None] * len(self.list_of_block_partitions) for idx, partition in enumerate(self.list_of_block_partitions): partition.drain_call_queue() result[idx] = partition.ip(materialize=False) return result @classmethod @_inherit_docstrings(PandasDataframeAxisPartition.deploy_splitting_func) def deploy_splitting_func( cls, axis, func, f_args, f_kwargs, num_splits, *partitions, extract_metadata=False, ): return _deploy_ray_func.options( num_returns=( num_splits * (1 + cls._PARTITIONS_METADATA_LEN) if extract_metadata else num_splits ), resources=RayTaskCustomResources.get(), ).remote( cls._get_deploy_split_func(), *f_args, num_splits, *partitions, axis=axis, f_to_deploy=func, f_len_args=len(f_args), f_kwargs=f_kwargs, extract_metadata=extract_metadata, ) @classmethod def deploy_axis_func( cls, axis, func, f_args, f_kwargs, num_splits, maintain_partitioning, *partitions, min_block_size, lengths=None, manual_partition=False, max_retries=None, ): """ Deploy a function along a full axis. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. num_splits : int The number of splits to return (see ``split_result_of_axis_func_pandas``). maintain_partitioning : bool If True, keep the old partitioning if possible. If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). min_block_size : int Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False If True, partition the result with `lengths`. max_retries : int, default: None The max number of times to retry the func. Returns ------- list A list of ``ray.ObjectRef``-s. 
""" return _deploy_ray_func.options( num_returns=(num_splits if lengths is None else len(lengths)) * (1 + cls._PARTITIONS_METADATA_LEN), **({"max_retries": max_retries} if max_retries is not None else {}), resources=RayTaskCustomResources.get(), ).remote( cls._get_deploy_axis_func(), *f_args, num_splits, maintain_partitioning, *partitions, axis=axis, f_to_deploy=func, f_len_args=len(f_args), f_kwargs=f_kwargs, manual_partition=manual_partition, min_block_size=min_block_size, lengths=lengths, return_generator=True, ) @classmethod def deploy_func_between_two_axis_partitions( cls, axis, func, f_args, f_kwargs, num_splits, len_of_left, other_shape, *partitions, min_block_size, ): """ Deploy a function along a full axis between two data sets. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. num_splits : int The number of splits to return (see ``split_result_of_axis_func_pandas``). len_of_left : int The number of values in `partitions` that belong to the left data set. other_shape : np.ndarray The shape of right frame in terms of partitions, i.e. (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. min_block_size : int Minimum number of rows/columns in a single split. Returns ------- list A list of ``ray.ObjectRef``-s. 
""" return _deploy_ray_func.options( num_returns=num_splits * (1 + cls._PARTITIONS_METADATA_LEN), resources=RayTaskCustomResources.get(), ).remote( PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions, *f_args, num_splits, len_of_left, other_shape, *partitions, axis=axis, f_to_deploy=func, f_len_args=len(f_args), f_kwargs=f_kwargs, min_block_size=min_block_size, return_generator=True, ) def wait(self): """Wait completing computations on the object wrapped by the partition.""" self.drain_call_queue() futures = self.list_of_blocks RayWrapper.wait(futures) @_inherit_docstrings(PandasOnRayDataframeVirtualPartition) class PandasOnRayDataframeColumnPartition(PandasOnRayDataframeVirtualPartition): axis = 0 @_inherit_docstrings(PandasOnRayDataframeVirtualPartition) class PandasOnRayDataframeRowPartition(PandasOnRayDataframeVirtualPartition): axis = 1 @ray.remote def _deploy_ray_func( deployer, *positional_args, axis, f_to_deploy, f_len_args, f_kwargs, extract_metadata=True, **kwargs, ): # pragma: no cover """ Execute a function on an axis partition in a worker process. This is ALWAYS called on either ``PandasDataframeAxisPartition.deploy_axis_func`` or ``PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions``, which both serve to deploy another dataframe function on a Ray worker process. The provided `positional_args` contains positional arguments for both: `deployer` and for `f_to_deploy`, the parameters can be separated using the `f_len_args` value. The parameters are combined so they will be deserialized by Ray before the kernel is executed (`f_kwargs` will never contain more Ray objects, and thus does not require deserialization). Parameters ---------- deployer : callable A `PandasDataFrameAxisPartition.deploy_*` method that will call ``f_to_deploy``. *positional_args : list The first `f_len_args` elements in this list represent positional arguments to pass to the `f_to_deploy`. 
The rest are positional arguments that will be passed to `deployer`. axis : {0, 1} The axis to perform the function along. This argument is keyword only. f_to_deploy : callable or RayObjectID The function to deploy. This argument is keyword only. f_len_args : int Number of positional arguments to pass to ``f_to_deploy``. This argument is keyword only. f_kwargs : dict Keyword arguments to pass to ``f_to_deploy``. This argument is keyword only. extract_metadata : bool, default: True Whether to return metadata (length, width, ip) of the result. Passing `False` may relax the load on object storage as the remote function would return 4 times fewer futures. Passing `False` makes sense for temporary results where you know for sure that the metadata will never be requested. This argument is keyword only. **kwargs : dict Keyword arguments to pass to ``deployer``. Returns ------- list : Union[tuple, list] The result of the function call, and metadata for it. Notes ----- Ray functions are not detected by codecov (thus pragma: no cover). """ f_args = positional_args[:f_len_args] deploy_args = positional_args[f_len_args:] result = deployer(axis, f_to_deploy, f_args, f_kwargs, *deploy_args, **kwargs) if not extract_metadata: for item in result: yield item else: ip = get_node_ip_address() for r in result: if isinstance(r, pandas.DataFrame): for item in [r, len(r), len(r.columns), ip]: yield item else: for item in [r, None, None, ip]: yield item ================================================ FILE: modin/core/execution/unidist/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to unidist execution engine.""" ================================================ FILE: modin/core/execution/unidist/common/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Common utilities for unidist execution engine.""" from .engine_wrapper import SignalActor, UnidistWrapper from .utils import initialize_unidist __all__ = [ "initialize_unidist", "UnidistWrapper", "SignalActor", ] ================================================ FILE: modin/core/execution/unidist/common/engine_wrapper.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ The module with helper mixin for executing functions remotely. To be used as a piece of building a unidist-based engine. """ import asyncio import pandas import unidist @unidist.remote def _deploy_unidist_func( func, *args, return_pandas_df=None, **kwargs ): # pragma: no cover """ Wrap `func` to ease calling it remotely. Parameters ---------- func : callable A local function that we want to call remotely. *args : iterable Positional arguments to pass to `func` when calling remotely. return_pandas_df : bool, optional Whether to convert the result of `func` to a pandas DataFrame or not. **kwargs : dict Keyword arguments to pass to `func` when calling remotely. Returns ------- unidist.ObjectRef or list[unidist.ObjectRef] Unidist identifier of the result being put to object store. """ result = func(*args, **kwargs) if return_pandas_df and not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(result) return result class UnidistWrapper: """Mixin that provides means of running functions remotely and getting local results.""" @classmethod def deploy( cls, func, f_args=None, f_kwargs=None, return_pandas_df=None, num_returns=1 ): """ Run local `func` remotely. Parameters ---------- func : callable or unidist.ObjectRef The function to perform. f_args : list or tuple, optional Positional arguments to pass to ``func``. f_kwargs : dict, optional Keyword arguments to pass to ``func``. 
return_pandas_df : bool, optional Whether to convert the result of `func` to a pandas DataFrame or not. num_returns : int, default: 1 Amount of return values expected from `func`. Returns ------- unidist.ObjectRef or list Unidist identifier of the result being put to object store. """ args = [] if f_args is None else f_args kwargs = {} if f_kwargs is None else f_kwargs return _deploy_unidist_func.options(num_returns=num_returns).remote( func, *args, return_pandas_df=return_pandas_df, **kwargs ) @classmethod def is_future(cls, item): """ Check if the item is a Future. Parameters ---------- item : unidist.ObjectRef or object Future or object to check. Returns ------- boolean If the value is a future. """ return unidist.is_object_ref(item) @classmethod def materialize(cls, obj_id): """ Get the value of object from the object store. Parameters ---------- obj_id : unidist.ObjectRef Unidist object identifier to get the value by. Returns ------- object Whatever was identified by `obj_id`. """ return unidist.get(obj_id) @classmethod def put(cls, data, **kwargs): """ Put data into the object store. Parameters ---------- data : object Data to be put. **kwargs : dict Additional keyword arguments (mostly for compatibility). Returns ------- unidist.ObjectRef A reference to `data`. """ return unidist.put(data) @classmethod def wait(cls, obj_ids, num_returns=None): """ Wait on the objects without materializing them (blocking operation). ``unidist.wait`` assumes a list of unique object references: see https://github.com/modin-project/modin/issues/5045 Parameters ---------- obj_ids : list, scalar num_returns : int, optional """ if not isinstance(obj_ids, list): obj_ids = [obj_ids] unique_ids = list(set(obj_ids)) if num_returns is None: num_returns = len(unique_ids) if num_returns > 0: unidist.wait(unique_ids, num_returns=num_returns) @unidist.remote class SignalActor: # pragma: no cover """ Help synchronize across tasks and actors on cluster. 
Parameters ---------- event_count : int Number of events required for synchronization. Notes ----- For details see: https://docs.ray.io/en/latest/advanced.html?highlight=signalactor#multi-node-synchronization-using-an-actor. """ def __init__(self, event_count: int): self.events = [asyncio.Event() for _ in range(event_count)] def send(self, event_idx: int): """ Indicate that event with `event_idx` has occurred. Parameters ---------- event_idx : int """ self.events[event_idx].set() async def wait(self, event_idx: int): """ Wait until event with `event_idx` has occurred. Parameters ---------- event_idx : int """ await self.events[event_idx].wait() ================================================ FILE: modin/core/execution/unidist/common/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module holds utility and initialization routines for Modin on unidist.""" import unidist import unidist.config as unidist_cfg import modin.config as modin_cfg from .engine_wrapper import UnidistWrapper def initialize_unidist(): """ Initialize unidist based on ``modin.config`` variables and internal defaults. 
""" if unidist_cfg.Backend.get() != "mpi": raise RuntimeError( f"Modin only supports MPI through unidist for now, got unidist backend '{unidist_cfg.Backend.get()}'" ) if not unidist.is_initialized(): modin_cfg.CpuCount.subscribe( lambda cpu_count: unidist_cfg.CpuCount.put(cpu_count.get()) ) unidist_cfg.MpiRuntimeEnv.put( {"env_vars": {"PYTHONWARNINGS": "ignore::FutureWarning"}} ) unidist.init() num_cpus = sum(v["CPU"] for v in unidist.cluster_resources().values()) modin_cfg.NPartitions._put(num_cpus) modin_cfg.CpuCount._put(num_cpus) def deserialize(obj): # pragma: no cover """ Deserialize a unidist object. Parameters ---------- obj : unidist.ObjectRef, iterable of unidist.ObjectRef, or mapping of keys to unidist.ObjectRef Object(s) to deserialize. Returns ------- obj The deserialized object(s). """ if unidist.is_object_ref(obj): return UnidistWrapper.materialize(obj) elif isinstance(obj, (tuple, list)): # Unidist will error if any elements are not ObjectRef, but we still want unidist to # perform batch deserialization for us -- thus, we must submit only the list elements # that are ObjectRef, deserialize them, and restore them to their correct list index ref_indices, refs = [], [] for i, unidist_ref in enumerate(obj): if unidist.is_object_ref(unidist_ref): ref_indices.append(i) refs.append(unidist_ref) unidist_result = UnidistWrapper.materialize(refs) new_lst = list(obj) for i, deser_item in zip(ref_indices, unidist_result): new_lst[i] = deser_item # Check that all objects have been deserialized assert not any(unidist.is_object_ref(o) for o in new_lst) return new_lst elif isinstance(obj, dict) and any( unidist.is_object_ref(val) for val in obj.values() ): return dict(zip(obj.keys(), deserialize(tuple(obj.values())))) else: return obj ================================================ FILE: modin/core/execution/unidist/generic/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license 
agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Generic functionality for unidist execution engine.""" ================================================ FILE: modin/core/execution/unidist/generic/io/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Generic IO functionality for unidist execution engine.""" from .io import UnidistIO __all__ = ["UnidistIO"] ================================================ FILE: modin/core/execution/unidist/generic/io/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module holds base class implementing required I/O over unidist.""" from modin.core.io import BaseIO class UnidistIO(BaseIO): """Base class for doing I/O operations over unidist.""" ================================================ FILE: modin/core/execution/unidist/generic/partitioning/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Generic partitioning functionality for unidist execution engine.""" from .partition_manager import GenericUnidistDataframePartitionManager __all__ = [ "GenericUnidistDataframePartitionManager", ] ================================================ FILE: modin/core/execution/unidist/generic/partitioning/partition_manager.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""The module holds Modin partition manager implemented for unidist.""" import numpy as np from modin.core.dataframe.pandas.partitioning.partition_manager import ( PandasDataframePartitionManager, ) from modin.core.execution.unidist.common import UnidistWrapper class GenericUnidistDataframePartitionManager(PandasDataframePartitionManager): """The class implements the interface in `PandasDataframePartitionManager`.""" @classmethod def to_numpy(cls, partitions, **kwargs): """ Convert `partitions` into a NumPy array. Parameters ---------- partitions : NumPy array A 2-D array of partitions to convert to local NumPy array. **kwargs : dict Keyword arguments to pass to each partition ``.to_numpy()`` call. Returns ------- NumPy array """ if partitions.shape[1] == 1: parts = cls.get_objects_from_partitions(partitions.flatten()) parts = [part.to_numpy(**kwargs) for part in parts] else: parts = UnidistWrapper.materialize( [ obj.apply( lambda df, **kwargs: df.to_numpy(**kwargs) ).list_of_blocks[0] for row in partitions for obj in row ] ) rows, cols = partitions.shape parts = [parts[i * cols : (i + 1) * cols] for i in range(rows)] return np.block(parts) ================================================ FILE: modin/core/execution/unidist/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language # governing permissions and limitations under the License. """Experimental Modin's functionality related to unidist execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to unidist execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Base Modin Dataframe class optimized for pandas on unidist execution."""

from .dataframe import PandasOnUnidistDataframe

__all__ = ["PandasOnUnidistDataframe"]


================================================
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses class that implements ``PandasDataframe`` using unidist."""

from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
from modin.utils import _inherit_docstrings

from ..partitioning.partition_manager import PandasOnUnidistDataframePartitionManager


class PandasOnUnidistDataframe(PandasDataframe):
    """
    The class implements the interface in ``PandasDataframe`` using unidist.

    Parameters
    ----------
    partitions : np.ndarray
        A 2D NumPy array of partitions.
    index : sequence
        The index for the dataframe. Converted to a ``pandas.Index``.
    columns : sequence
        The columns object for the dataframe. Converted to a ``pandas.Index``.
    row_lengths : list, optional
        The length of each partition in the rows. The "height" of
        each of the block partitions. Is computed if not provided.
    column_widths : list, optional
        The width of each partition in the columns. The "width" of
        each of the block partitions. Is computed if not provided.
    dtypes : pandas.Series, optional
        The data types for the dataframe columns.
    pandas_backend : {"pyarrow", None}, optional
        Backend used by pandas. None - means default NumPy backend.
    """

    # Partition manager class used by this frame: the unidist-specific one.
    _partition_mgr_cls = PandasOnUnidistDataframePartitionManager

    def support_materialization_in_worker_process(self) -> bool:
        """
        Tell whether the frame supports materialization in a worker process.

        Returns
        -------
        bool
            Always ``False`` for this class.
        """
        # more details why this is not `True` in https://github.com/modin-project/modin/pull/6673
        return False

    # No docstring is written here on purpose: the decorator below takes
    # ``PandasDataframe.engine`` as the source of documentation.
    @property
    @_inherit_docstrings(PandasDataframe.engine)
    def engine(self) -> str:
        return "Unidist"


================================================
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/io/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
See the License for the specific language # governing permissions and limitations under the License. """Base IO classes optimized for pandas on unidist execution.""" from .io import PandasOnUnidistIO __all__ = ["PandasOnUnidistIO"] ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/io/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""The module holds the factory which performs I/O using pandas on unidist.""" import io import numpy as np import pandas from pandas.io.common import get_handle, stringify_path from modin.core.execution.unidist.common import SignalActor, UnidistWrapper from modin.core.execution.unidist.generic.io import UnidistIO from modin.core.io import ( CSVDispatcher, ExcelDispatcher, FeatherDispatcher, FWFDispatcher, JSONDispatcher, ParquetDispatcher, SQLDispatcher, ) from modin.core.storage_formats.pandas.parsers import ( PandasCSVParser, PandasExcelParser, PandasFeatherParser, PandasFWFParser, PandasJSONParser, PandasParquetParser, PandasSQLParser, ) from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler from modin.experimental.core.io import ( ExperimentalCSVGlobDispatcher, ExperimentalCustomTextDispatcher, ExperimentalGlobDispatcher, ExperimentalSQLDispatcher, ) from modin.experimental.core.storage_formats.pandas.parsers import ( ExperimentalCustomTextParser, ExperimentalPandasCSVGlobParser, ExperimentalPandasJsonParser, ExperimentalPandasParquetParser, ExperimentalPandasPickleParser, ExperimentalPandasXmlParser, ) from ..dataframe import PandasOnUnidistDataframe from ..partitioning import PandasOnUnidistDataframePartition class PandasOnUnidistIO(UnidistIO): """Factory providing methods for performing I/O operations using pandas as storage format on unidist as engine.""" frame_cls = PandasOnUnidistDataframe frame_partition_cls = PandasOnUnidistDataframePartition query_compiler_cls = PandasQueryCompiler build_args = dict( frame_partition_cls=PandasOnUnidistDataframePartition, query_compiler_cls=PandasQueryCompiler, frame_cls=PandasOnUnidistDataframe, base_io=UnidistIO, ) def __make_read(*classes, build_args=build_args): # used to reduce code duplication return type("", (UnidistWrapper, *classes), build_args).read def __make_write(*classes, build_args=build_args): # used to reduce code duplication return type("", (UnidistWrapper, *classes), 
build_args).write read_csv = __make_read(PandasCSVParser, CSVDispatcher) read_fwf = __make_read(PandasFWFParser, FWFDispatcher) read_json = __make_read(PandasJSONParser, JSONDispatcher) read_parquet = __make_read(PandasParquetParser, ParquetDispatcher) to_parquet = __make_write(ParquetDispatcher) # Blocked on pandas-dev/pandas#12236. It is faster to default to pandas. # read_hdf = __make_read(PandasHDFParser, HDFReader) read_feather = __make_read(PandasFeatherParser, FeatherDispatcher) read_sql = __make_read(PandasSQLParser, SQLDispatcher) to_sql = __make_write(SQLDispatcher) read_excel = __make_read(PandasExcelParser, ExcelDispatcher) # experimental methods that don't exist in pandas read_csv_glob = __make_read( ExperimentalPandasCSVGlobParser, ExperimentalCSVGlobDispatcher ) read_parquet_glob = __make_read( ExperimentalPandasParquetParser, ExperimentalGlobDispatcher ) to_parquet_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": UnidistIO.to_parquet}, ) read_json_glob = __make_read( ExperimentalPandasJsonParser, ExperimentalGlobDispatcher ) to_json_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": UnidistIO.to_json}, ) read_xml_glob = __make_read(ExperimentalPandasXmlParser, ExperimentalGlobDispatcher) to_xml_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": UnidistIO.to_xml}, ) read_pickle_glob = __make_read( ExperimentalPandasPickleParser, ExperimentalGlobDispatcher ) to_pickle_glob = __make_write( ExperimentalGlobDispatcher, build_args={**build_args, "base_write": UnidistIO.to_pickle}, ) read_custom_text = __make_read( ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher ) read_sql_distributed = __make_read( ExperimentalSQLDispatcher, build_args={**build_args, "base_read": read_sql} ) del __make_read # to not pollute class namespace del __make_write # to not pollute class namespace @staticmethod def _to_csv_check_support(kwargs): """ 
Check if parallel version of ``to_csv`` could be used. Parameters ---------- kwargs : dict Keyword arguments passed to ``.to_csv()``. Returns ------- bool Whether parallel version of ``to_csv`` is applicable. """ path_or_buf = kwargs["path_or_buf"] compression = kwargs["compression"] if not isinstance(path_or_buf, str): return False # case when the pointer is placed at the beginning of the file. if "r" in kwargs["mode"] and "+" in kwargs["mode"]: return False # encodings with BOM don't support; # instead of one mark in result bytes we will have them by the number of partitions # so we should fallback in pandas for `utf-16`, `utf-32` with all aliases, in instance # (`utf_32_be`, `utf_16_le` and so on) if kwargs["encoding"] is not None: encoding = kwargs["encoding"].lower() if "u" in encoding or "utf" in encoding: if "16" in encoding or "32" in encoding: return False if compression is None or not compression == "infer": return False if any((path_or_buf.endswith(ext) for ext in [".gz", ".bz2", ".zip", ".xz"])): return False return True @classmethod def to_csv(cls, qc, **kwargs): """ Write records stored in the `qc` to a CSV file. Parameters ---------- qc : BaseQueryCompiler The query compiler of the Modin dataframe that we want to run ``to_csv`` on. **kwargs : dict Parameters for ``pandas.to_csv(**kwargs)``. """ kwargs["path_or_buf"] = stringify_path(kwargs["path_or_buf"]) if not cls._to_csv_check_support(kwargs): return UnidistIO.to_csv(qc, **kwargs) signals = SignalActor.remote(len(qc._modin_frame._partitions) + 1) def func(df, **kw): # pragma: no cover """ Dump a chunk of rows as csv, then save them to target maintaining order. Parameters ---------- df : pandas.DataFrame A chunk of rows to write to a CSV file. **kw : dict Arguments to pass to ``pandas.to_csv(**kw)`` plus an extra argument `partition_idx` serving as chunk index to maintain rows order. 
""" partition_idx = kw["partition_idx"] # the copy is made to not implicitly change the input parameters; # to write to an intermediate buffer, we need to change `path_or_buf` in kwargs csv_kwargs = kwargs.copy() if partition_idx != 0: # we need to create a new file only for first recording # all the rest should be recorded in appending mode if "w" in csv_kwargs["mode"]: csv_kwargs["mode"] = csv_kwargs["mode"].replace("w", "a") # It is enough to write the header for the first partition csv_kwargs["header"] = False # for parallelization purposes, each partition is written to an intermediate buffer path_or_buf = csv_kwargs["path_or_buf"] is_binary = "b" in csv_kwargs["mode"] csv_kwargs["path_or_buf"] = io.BytesIO() if is_binary else io.StringIO() storage_options = csv_kwargs.pop("storage_options", None) df.to_csv(**csv_kwargs) csv_kwargs.update({"storage_options": storage_options}) content = csv_kwargs["path_or_buf"].getvalue() csv_kwargs["path_or_buf"].close() # each process waits for its turn to write to a file UnidistWrapper.materialize(signals.wait.remote(partition_idx)) # preparing to write data from the buffer to a file with get_handle( path_or_buf, # in case when using URL in implicit text mode # pandas try to open `path_or_buf` in binary mode csv_kwargs["mode"] if is_binary else csv_kwargs["mode"] + "t", encoding=kwargs["encoding"], errors=kwargs["errors"], compression=kwargs["compression"], storage_options=kwargs.get("storage_options", None), is_text=not is_binary, ) as handles: handles.handle.write(content) # signal that the next process can start writing to the file UnidistWrapper.materialize(signals.send.remote(partition_idx + 1)) # used for synchronization purposes return pandas.DataFrame() # signaling that the partition with id==0 can be written to the file UnidistWrapper.materialize(signals.send.remote(0)) # Ensure that the metadata is syncrhonized qc._modin_frame._propagate_index_objs(axis=None) result = 
qc._modin_frame._partition_mgr_cls.map_axis_partitions( axis=1, partitions=qc._modin_frame._partitions, map_func=func, keep_partitioning=True, lengths=None, enumerate_partitions=True, max_retries=0, ) # pending completion UnidistWrapper.materialize( [part.list_of_blocks[0] for row in result for part in row] ) @classmethod def from_map(cls, func, iterable, *args, **kwargs): """ Create a Modin `query_compiler` from a map function. This method will construct a Modin `query_compiler` split by row partitions. The number of row partitions matches the number of elements in the iterable object. Parameters ---------- func : callable Function to map across the iterable object. iterable : Iterable An iterable object. *args : tuple Positional arguments to pass in `func`. **kwargs : dict Keyword arguments to pass in `func`. Returns ------- BaseQueryCompiler QueryCompiler containing data returned by map function. """ func = cls.frame_cls._partition_mgr_cls.preprocess_func(func) partitions = np.array( [ [ cls.frame_partition_cls( UnidistWrapper.deploy( func, f_args=(obj,) + args, f_kwargs=kwargs, return_pandas_df=True, ) ) ] for obj in iterable ] ) return cls.query_compiler_cls(cls.frame_cls(partitions)) ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Base Modin Dataframe classes related to its partitioning and optimized for pandas on unidist execution.""" from .partition import PandasOnUnidistDataframePartition from .partition_manager import PandasOnUnidistDataframePartitionManager from .virtual_partition import ( PandasOnUnidistDataframeColumnPartition, PandasOnUnidistDataframeRowPartition, PandasOnUnidistDataframeVirtualPartition, ) __all__ = [ "PandasOnUnidistDataframePartitionManager", "PandasOnUnidistDataframePartition", "PandasOnUnidistDataframeVirtualPartition", "PandasOnUnidistDataframeColumnPartition", "PandasOnUnidistDataframeRowPartition", ] ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses class that wraps data (block partition) and its metadata.""" import warnings import pandas import unidist from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition from modin.core.execution.unidist.common import UnidistWrapper from modin.core.execution.unidist.common.utils import deserialize from modin.logging import get_logger from modin.pandas.indexing import compute_sliced_len compute_sliced_len = unidist.remote(compute_sliced_len) class PandasOnUnidistDataframePartition(PandasDataframePartition): """ The class implements the interface in ``PandasDataframePartition``. Parameters ---------- data : unidist.ObjectRef A reference to ``pandas.DataFrame`` that need to be wrapped with this class. length : unidist.ObjectRef or int, optional Length or reference to it of wrapped ``pandas.DataFrame``. width : unidist.ObjectRef or int, optional Width or reference to it of wrapped ``pandas.DataFrame``. ip : unidist.ObjectRef or str, optional Node IP address or reference to it that holds wrapped ``pandas.DataFrame``. call_queue : list Call queue that needs to be executed on wrapped ``pandas.DataFrame``. """ execution_wrapper = UnidistWrapper def __init__(self, data, length=None, width=None, ip=None, call_queue=None): super().__init__() assert unidist.is_object_ref(data) self._data = data self.call_queue = call_queue if call_queue is not None else [] self._length_cache = length self._width_cache = width self._ip_cache = ip log = get_logger() self._is_debug(log) and log.debug( "Partition ID: {}, Height: {}, Width: {}, Node IP: {}".format( self._identity, str(self._length_cache), str(self._width_cache), str(self._ip_cache), ) ) def apply(self, func, *args, **kwargs): """ Apply a function to the object wrapped by this partition. Parameters ---------- func : callable or unidist.ObjectRef A function to apply. *args : iterable Additional positional arguments to be passed in `func`. 
**kwargs : dict Additional keyword arguments to be passed in `func`. Returns ------- PandasOnUnidistDataframePartition A new ``PandasOnUnidistDataframePartition`` object. Notes ----- It does not matter if `func` is callable or an ``unidist.ObjectRef``. Unidist will handle it correctly either way. The keyword arguments are sent as a dictionary. """ log = get_logger() self._is_debug(log) and log.debug(f"ENTER::Partition.apply::{self._identity}") data = self._data call_queue = self.call_queue + [[func, args, kwargs]] if len(call_queue) > 1: self._is_debug(log) and log.debug( f"SUBMIT::_apply_list_of_funcs::{self._identity}" ) result, length, width, ip = _apply_list_of_funcs.remote(call_queue, data) else: # We handle `len(call_queue) == 1` in a different way because # this dramatically improves performance. result, length, width, ip = _apply_func.remote(data, func, *args, **kwargs) self._is_debug(log) and log.debug(f"SUBMIT::_apply_func::{self._identity}") self._is_debug(log) and log.debug(f"EXIT::Partition.apply::{self._identity}") return self.__constructor__(result, length, width, ip) def drain_call_queue(self): """Execute all operations stored in the call queue on the object wrapped by this partition.""" log = get_logger() self._is_debug(log) and log.debug( f"ENTER::Partition.drain_call_queue::{self._identity}" ) if len(self.call_queue) == 0: return data = self._data call_queue = self.call_queue if len(call_queue) > 1: self._is_debug(log) and log.debug( f"SUBMIT::_apply_list_of_funcs::{self._identity}" ) ( self._data, new_length, new_width, self._ip_cache, ) = _apply_list_of_funcs.remote(call_queue, data) else: # We handle `len(call_queue) == 1` in a different way because # this dramatically improves performance. 
func, f_args, f_kwargs = call_queue[0] self._is_debug(log) and log.debug(f"SUBMIT::_apply_func::{self._identity}") ( self._data, new_length, new_width, self._ip_cache, ) = _apply_func.remote(data, func, *f_args, **f_kwargs) self._is_debug(log) and log.debug( f"EXIT::Partition.drain_call_queue::{self._identity}" ) self.call_queue = [] # GH#4732 if we already have evaluated width/length cached as ints, # don't overwrite that cache with non-evaluated values. if not isinstance(self._length_cache, int): self._length_cache = new_length if not isinstance(self._width_cache, int): self._width_cache = new_width def wait(self): """Wait completing computations on the object wrapped by the partition.""" self.drain_call_queue() UnidistWrapper.wait(self._data) def mask(self, row_labels, col_labels): """ Lazily create a mask that extracts the indices provided. Parameters ---------- row_labels : list-like, slice or label The row labels for the rows to extract. col_labels : list-like, slice or label The column labels for the columns to extract. Returns ------- PandasOnUnidistDataframePartition A new ``PandasOnUnidistDataframePartition`` object. 
""" log = get_logger() self._is_debug(log) and log.debug(f"ENTER::Partition.mask::{self._identity}") new_obj = super().mask(row_labels, col_labels) if isinstance(row_labels, slice) and unidist.is_object_ref(self._length_cache): if row_labels == slice(None): # fast path - full axis take new_obj._length_cache = self._length_cache else: new_obj._length_cache = compute_sliced_len.remote( row_labels, self._length_cache ) if isinstance(col_labels, slice) and unidist.is_object_ref(self._width_cache): if col_labels == slice(None): # fast path - full axis take new_obj._width_cache = self._width_cache else: new_obj._width_cache = compute_sliced_len.remote( col_labels, self._width_cache ) self._is_debug(log) and log.debug(f"EXIT::Partition.mask::{self._identity}") return new_obj @classmethod def put(cls, obj): """ Put an object into object store and wrap it with partition object. Parameters ---------- obj : any An object to be put. Returns ------- PandasOnUnidistDataframePartition A new ``PandasOnUnidistDataframePartition`` object. """ return cls(cls.execution_wrapper.put(obj), len(obj.index), len(obj.columns)) @classmethod def preprocess_func(cls, func): """ Put a function into the object store to use in ``apply``. Parameters ---------- func : callable A function to preprocess. Returns ------- unidist.ObjectRef A reference to `func`. """ return cls.execution_wrapper.put(func) def length(self, materialize=True): """ Get the length of the object wrapped by this partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- int or unidist.ObjectRef The length of the object. 
""" if self._length_cache is None: if len(self.call_queue): self.drain_call_queue() else: ( self._length_cache, self._width_cache, ) = _get_index_and_columns_size.remote(self._data) if unidist.is_object_ref(self._length_cache) and materialize: self._length_cache = UnidistWrapper.materialize(self._length_cache) return self._length_cache def width(self, materialize=True): """ Get the width of the object wrapped by the partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- int or unidist.ObjectRef The width of the object. """ if self._width_cache is None: if len(self.call_queue): self.drain_call_queue() else: ( self._length_cache, self._width_cache, ) = _get_index_and_columns_size.remote(self._data) if unidist.is_object_ref(self._width_cache) and materialize: self._width_cache = UnidistWrapper.materialize(self._width_cache) return self._width_cache def ip(self, materialize=True): """ Get the node IP address of the object wrapped by this partition. Parameters ---------- materialize : bool, default: True Whether to forcibly materialize the result into an integer. If ``False`` was specified, may return a future of the result if it hasn't been materialized yet. Returns ------- str IP address of the node that holds the data. """ if self._ip_cache is None: if len(self.call_queue): self.drain_call_queue() else: self._ip_cache = self.apply(lambda df: pandas.DataFrame([]))._ip_cache if materialize and unidist.is_object_ref(self._ip_cache): self._ip_cache = UnidistWrapper.materialize(self._ip_cache) return self._ip_cache @unidist.remote(num_returns=2) def _get_index_and_columns_size(df): # pragma: no cover """ Get the number of rows and columns of a pandas DataFrame. Parameters ---------- df : pandas.DataFrame A pandas DataFrame which dimensions are needed. 
Returns ------- int The number of rows. int The number of columns. """ return len(df.index), len(df.columns) @unidist.remote(num_returns=4) def _apply_func(partition, func, *args, **kwargs): # pragma: no cover """ Execute a function on the partition in a worker process. Parameters ---------- partition : pandas.DataFrame A pandas DataFrame the function needs to be executed on. func : callable The function to perform on the partition. *args : list Positional arguments to pass to ``func``. **kwargs : dict Keyword arguments to pass to ``func``. Returns ------- pandas.DataFrame The resulting pandas DataFrame. int The number of rows of the resulting pandas DataFrame. int The number of columns of the resulting pandas DataFrame. str The node IP address of the worker process. Notes ----- Directly passing a call queue entry (i.e. a list of [func, args, kwargs]) instead of destructuring it causes a performance penalty. """ try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) result = func(partition, *args, **kwargs) # Sometimes Arrow forces us to make a copy of an object before we operate on it. We # don't want the error to propagate to the user, and we want to avoid copying unless # we absolutely have to. except ValueError: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) result = func(partition.copy(), *args, **kwargs) return ( result, len(result) if hasattr(result, "__len__") else 0, len(getattr(result, "columns", ())), unidist.get_ip(), ) @unidist.remote(num_returns=4) def _apply_list_of_funcs(call_queue, partition): # pragma: no cover """ Execute all operations stored in the call queue on the partition in a worker process. Parameters ---------- call_queue : list A call queue that needs to be executed on the partition. partition : pandas.DataFrame A pandas DataFrame the call queue needs to be executed on. Returns ------- pandas.DataFrame The resulting pandas DataFrame. 
int The number of rows of the resulting pandas DataFrame. int The number of columns of the resulting pandas DataFrame. str The node IP address of the worker process. """ for func, f_args, f_kwargs in call_queue: func = deserialize(func) args = deserialize(f_args) kwargs = deserialize(f_kwargs) try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) partition = func(partition, *args, **kwargs) # Sometimes Arrow forces us to make a copy of an object before we operate on it. We # don't want the error to propagate to the user, and we want to avoid copying unless # we absolutely have to. except ValueError: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) partition = func(partition.copy(), *args, **kwargs) return ( partition, len(partition) if hasattr(partition, "__len__") else 0, len(partition.columns) if hasattr(partition, "columns") else 0, unidist.get_ip(), ) ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition_manager.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses class that implements ``GenericUnidistDataframePartitionManager`` using Unidist.""" from modin.core.execution.modin_aqp import progress_bar_wrapper from modin.core.execution.unidist.common import UnidistWrapper from modin.core.execution.unidist.generic.partitioning import ( GenericUnidistDataframePartitionManager, ) from .partition import PandasOnUnidistDataframePartition from .virtual_partition import ( PandasOnUnidistDataframeColumnPartition, PandasOnUnidistDataframeRowPartition, ) class PandasOnUnidistDataframePartitionManager(GenericUnidistDataframePartitionManager): """The class implements the interface in `PandasDataframePartitionManager`.""" # This object uses PandasOnUnidistDataframePartition objects as the underlying store. _partition_class = PandasOnUnidistDataframePartition _column_partitions_class = PandasOnUnidistDataframeColumnPartition _row_partition_class = PandasOnUnidistDataframeRowPartition _execution_wrapper = UnidistWrapper @classmethod def wait_partitions(cls, partitions): """ Wait on the objects wrapped by `partitions` in parallel, without materializing them. This method will block until all computations in the list have completed. Parameters ---------- partitions : np.ndarray NumPy array with ``PandasDataframePartition``-s. """ UnidistWrapper.wait( [block for partition in partitions for block in partition.list_of_blocks] ) def _make_wrapped_method(name: str): """ Define new attribute that should work with progress bar. Parameters ---------- name : str Name of `GenericUnidistDataframePartitionManager` attribute that should be reused. Notes ----- - `classmethod` decorator shouldn't be applied twice, so we refer to `__func__` attribute. - New attribute is defined for `PandasOnUnidistDataframePartitionManager`. 
""" setattr( PandasOnUnidistDataframePartitionManager, name, classmethod( progress_bar_wrapper( getattr(GenericUnidistDataframePartitionManager, name).__func__ ) ), ) for method in ( "map_partitions", "lazy_map_partitions", "map_axis_partitions", "_apply_func_to_list_of_partitions", "apply_func_to_select_indices", "apply_func_to_select_indices_along_full_axis", "apply_func_to_indices_both_axis", "n_ary_operation", ): _make_wrapped_method(method) ================================================ FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses classes responsible for storing a virtual partition and applying a function to it.""" import warnings import pandas import unidist from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) from modin.core.execution.unidist.common import UnidistWrapper from modin.core.execution.unidist.common.utils import deserialize from modin.utils import _inherit_docstrings from .partition import PandasOnUnidistDataframePartition class PandasOnUnidistDataframeVirtualPartition(PandasDataframeAxisPartition): """ The class implements the interface in ``PandasDataframeAxisPartition``. Parameters ---------- list_of_partitions : Union[list, PandasOnUnidistDataframePartition] List of ``PandasOnUnidistDataframePartition`` and ``PandasOnUnidistDataframeVirtualPartition`` objects, or a single ``PandasOnUnidistDataframePartition``. get_ip : bool, default: False Whether to get node IP addresses to conforming partitions or not. full_axis : bool, default: True Whether or not the virtual partition encompasses the whole axis. call_queue : list, optional A list of tuples (callable, args, kwargs) that contains deferred calls. length : unidist.ObjectRef or int, optional Length, or reference to length, of wrapped ``pandas.DataFrame``. width : unidist.ObjectRef or int, optional Width, or reference to width, of wrapped ``pandas.DataFrame``. 
""" _PARTITIONS_METADATA_LEN = 3 # (length, width, ip) partition_type = PandasOnUnidistDataframePartition axis = None # these variables are intentionally initialized at runtime (see #6023) _DEPLOY_AXIS_FUNC = None _DEPLOY_SPLIT_FUNC = None _DRAIN_FUNC = None @classmethod def _get_deploy_axis_func(cls): # noqa: GL08 if cls._DEPLOY_AXIS_FUNC is None: cls._DEPLOY_AXIS_FUNC = UnidistWrapper.put( PandasDataframeAxisPartition.deploy_axis_func ) return cls._DEPLOY_AXIS_FUNC @classmethod def _get_deploy_split_func(cls): # noqa: GL08 if cls._DEPLOY_SPLIT_FUNC is None: cls._DEPLOY_SPLIT_FUNC = UnidistWrapper.put( PandasDataframeAxisPartition.deploy_splitting_func ) return cls._DEPLOY_SPLIT_FUNC @classmethod def _get_drain_func(cls): # noqa: GL08 if cls._DRAIN_FUNC is None: cls._DRAIN_FUNC = UnidistWrapper.put(PandasDataframeAxisPartition.drain) return cls._DRAIN_FUNC @property def list_of_ips(self): """ Get the IPs holding the physical objects composing this partition. Returns ------- List A list of IPs as ``unidist.ObjectRef`` or str. 
""" # Defer draining call queue until we get the ip address result = [None] * len(self.list_of_block_partitions) for idx, partition in enumerate(self.list_of_block_partitions): partition.drain_call_queue() result[idx] = partition.ip(materialize=False) return result @classmethod @_inherit_docstrings(PandasDataframeAxisPartition.deploy_splitting_func) def deploy_splitting_func( cls, axis, func, f_args, f_kwargs, num_splits, *partitions, extract_metadata=False, ): return _deploy_unidist_func.options( num_returns=( num_splits * (1 + cls._PARTITIONS_METADATA_LEN) if extract_metadata else num_splits ), ).remote( cls._get_deploy_split_func(), axis, func, f_args, f_kwargs, num_splits, *partitions, extract_metadata=extract_metadata, ) @classmethod def deploy_axis_func( cls, axis, func, f_args, f_kwargs, num_splits, maintain_partitioning, *partitions, min_block_size, lengths=None, manual_partition=False, max_retries=None, ): """ Deploy a function along a full axis. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. num_splits : int The number of splits to return (see ``split_result_of_axis_func_pandas``). maintain_partitioning : bool If True, keep the old partitioning if possible. If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). min_block_size : int Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False If True, partition the result with `lengths`. max_retries : int, default: None The max number of times to retry the func. Returns ------- list A list of ``unidist.ObjectRef``-s. 
""" return _deploy_unidist_func.options( num_returns=(num_splits if lengths is None else len(lengths)) * (1 + cls._PARTITIONS_METADATA_LEN), **({"max_retries": max_retries} if max_retries is not None else {}), ).remote( cls._get_deploy_axis_func(), axis, func, f_args, f_kwargs, num_splits, maintain_partitioning, *partitions, manual_partition=manual_partition, min_block_size=min_block_size, lengths=lengths, ) @classmethod def deploy_func_between_two_axis_partitions( cls, axis, func, f_args, f_kwargs, num_splits, len_of_left, other_shape, *partitions, min_block_size, ): """ Deploy a function along a full axis between two data sets. Parameters ---------- axis : {0, 1} The axis to perform the function along. func : callable The function to perform. f_args : list or tuple Positional arguments to pass to ``func``. f_kwargs : dict Keyword arguments to pass to ``func``. num_splits : int The number of splits to return (see ``split_result_of_axis_func_pandas``). len_of_left : int The number of values in `partitions` that belong to the left data set. other_shape : np.ndarray The shape of right frame in terms of partitions, i.e. (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. min_block_size : int Minimum number of rows/columns in a single split. Returns ------- list A list of ``unidist.ObjectRef``-s. 
""" return _deploy_unidist_func.options( num_returns=num_splits * (1 + cls._PARTITIONS_METADATA_LEN) ).remote( PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions, axis, func, f_args, f_kwargs, num_splits, len_of_left, other_shape, *partitions, min_block_size=min_block_size, ) def wait(self): """Wait completing computations on the object wrapped by the partition.""" self.drain_call_queue() futures = self.list_of_blocks UnidistWrapper.wait(futures) @_inherit_docstrings(PandasOnUnidistDataframeVirtualPartition) class PandasOnUnidistDataframeColumnPartition(PandasOnUnidistDataframeVirtualPartition): axis = 0 @_inherit_docstrings(PandasOnUnidistDataframeVirtualPartition) class PandasOnUnidistDataframeRowPartition(PandasOnUnidistDataframeVirtualPartition): axis = 1 @unidist.remote def _deploy_unidist_func( deployer, axis, f_to_deploy, f_args, f_kwargs, *args, extract_metadata=True, **kwargs, ): # pragma: no cover """ Execute a function on an axis partition in a worker process. This is ALWAYS called on either ``PandasDataframeAxisPartition.deploy_axis_func`` or ``PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions``, which both serve to deploy another dataframe function on a unidist worker process. The provided ``f_args`` is thus are deserialized here (on the unidist worker) before the function is called (``f_kwargs`` will never contain more unidist objects, and thus does not require deserialization). Parameters ---------- deployer : callable A `PandasDataFrameAxisPartition.deploy_*` method that will call ``f_to_deploy``. axis : {0, 1} The axis to perform the function along. f_to_deploy : callable or unidist.ObjectRef The function to deploy. f_args : list or tuple Positional arguments to pass to ``f_to_deploy``. f_kwargs : dict Keyword arguments to pass to ``f_to_deploy``. *args : list Positional arguments to pass to ``deployer``. extract_metadata : bool, default: True Whether to return metadata (length, width, ip) of the result. 
Passing `False` may relax the load on object storage as the remote function would return 4 times fewer futures. Passing `False` makes sense for temporary results where you know for sure that the metadata will never be requested. **kwargs : dict Keyword arguments to pass to ``deployer``. Returns ------- list : Union[tuple, list] The result of the function call, and metadata for it. Notes ----- Unidist functions are not detected by codecov (thus pragma: no cover). """ f_args = deserialize(f_args) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) result = deployer(axis, f_to_deploy, f_args, f_kwargs, *args, **kwargs) if not extract_metadata: return result ip = unidist.get_ip() if isinstance(result, pandas.DataFrame): return result, len(result), len(result.columns), ip elif all(isinstance(r, pandas.DataFrame) for r in result): return [i for r in result for i in [r, len(r), len(r.columns), ip]] else: return [i for r in result for i in [r, None, None, ip]] ================================================ FILE: modin/core/execution/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""General utils for execution module.""" import contextlib import os from modin.error_message import ErrorMessage @contextlib.contextmanager def set_env(**environ): """ Temporarily set the process environment variables. """ old_environ = os.environ.copy() os.environ.update(environ) try: yield finally: os.environ.clear() os.environ.update(old_environ) if "_MODIN_DOC_CHECKER_" in os.environ: # The doc checker should get the non-processed functions def remote_function(func, ignore_defaults=False): return func # Check if the function already exists to avoid circular imports elif "remote_function" not in dir(): # TODO(https://github.com/modin-project/modin/issues/7429): Use # frame-level engine config. from modin.config import Engine if Engine.get() == "Ray": from modin.core.execution.ray.common import RayWrapper _preprocess_func = RayWrapper.put elif Engine.get() == "Unidist": from modin.core.execution.unidist.common import UnidistWrapper _preprocess_func = UnidistWrapper.put elif Engine.get() == "Dask": from modin.core.execution.dask.common import DaskWrapper # The function cache is not supported for Dask def remote_function(func, ignore_defaults=False): return DaskWrapper.put(func) else: def remote_function(func, ignore_defaults=False): return func if "remote_function" not in dir(): _remote_function_cache = {} def remote_function(func, ignore_defaults=False): # noqa: F811 if "" in func.__qualname__: # Nested function if func.__closure__: ErrorMessage.single_warning( f"The nested function {func} can not be cached, because " + "it captures objects from the outer scope." ) return func if not ignore_defaults and func.__defaults__: ErrorMessage.single_warning( f"The nested function {func} can not be cached, because it has " + "default values. Use `ignore_defaults` to forcibly enable caching." ) return func # For the nested functions, use __code__ as the key, because it's the same # object for each instance of the function. 
key = id(func.__code__) else: key = func ref = _remote_function_cache.get(key, None) if ref is None: ref = _preprocess_func(func) _remote_function_cache[key] = ref return ref ================================================ FILE: modin/core/io/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""IO functions implementations.""" from .column_stores.feather_dispatcher import FeatherDispatcher from .column_stores.hdf_dispatcher import HDFDispatcher from .column_stores.parquet_dispatcher import ParquetDispatcher from .file_dispatcher import FileDispatcher from .io import BaseIO from .sql.sql_dispatcher import SQLDispatcher from .text.csv_dispatcher import CSVDispatcher from .text.excel_dispatcher import ExcelDispatcher from .text.fwf_dispatcher import FWFDispatcher from .text.json_dispatcher import JSONDispatcher from .text.text_file_dispatcher import TextFileDispatcher __all__ = [ "BaseIO", "CSVDispatcher", "FWFDispatcher", "JSONDispatcher", "FileDispatcher", "TextFileDispatcher", "ParquetDispatcher", "HDFDispatcher", "FeatherDispatcher", "SQLDispatcher", "ExcelDispatcher", ] ================================================ FILE: modin/core/io/column_stores/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Columnar store format type IO functions implementations.""" ================================================ FILE: modin/core/io/column_stores/column_store_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module houses `ColumnStoreDispatcher` class. `ColumnStoreDispatcher` contains utils for handling columnar store format files, inherits util functions for handling files from `FileDispatcher` class and can be used as base class for dipatchers of specific columnar store formats. """ import numpy as np import pandas from modin.config import MinColumnPartitionSize, MinRowPartitionSize, NPartitions from modin.core.io.file_dispatcher import FileDispatcher from modin.core.storage_formats.pandas.utils import compute_chunksize class ColumnStoreDispatcher(FileDispatcher): """ Class handles utils for reading columnar store format files. Inherits some util functions for processing files from `FileDispatcher` class. """ @classmethod def call_deploy(cls, fname, col_partitions, **kwargs): """ Deploy remote tasks to the workers with passed parameters. Parameters ---------- fname : str, path object or file-like object Name of the file to read. 
col_partitions : list List of arrays with columns names that should be read by each partition. **kwargs : dict Parameters of deploying read_* function. Returns ------- np.ndarray Array with references to the task deploy result for each partition. """ return np.array( [ cls.deploy( func=cls.parse, f_kwargs={ "fname": fname, "columns": cols, "num_splits": NPartitions.get(), **kwargs, }, num_returns=NPartitions.get() + 2, ) for cols in col_partitions ] ).T @classmethod def build_partition(cls, partition_ids, row_lengths, column_widths): """ Build array with partitions of `cls.frame_partition_cls` class. Parameters ---------- partition_ids : list Array with references to the partitions data. row_lengths : list Partitions rows lengths. column_widths : list Number of columns in each partition. Returns ------- np.ndarray array with shape equals to the shape of `partition_ids` and filed with partition objects. """ return np.array( [ [ cls.frame_partition_cls( partition_ids[i][j], length=row_lengths[i], width=column_widths[j], ) for j in range(len(partition_ids[i])) ] for i in range(len(partition_ids)) ] ) @classmethod def build_index(cls, partition_ids): """ Compute index and its split sizes of resulting Modin DataFrame. Parameters ---------- partition_ids : list Array with references to the partitions data. Returns ------- index : pandas.Index Index of resulting Modin DataFrame. row_lengths : list List with lengths of index chunks. 
""" index_len = ( 0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0]) ) if isinstance(index_len, int): index = pandas.RangeIndex(index_len) else: index = index_len index_len = len(index) num_partitions = NPartitions.get() min_block_size = MinRowPartitionSize.get() index_chunksize = compute_chunksize(index_len, num_partitions, min_block_size) if index_chunksize > index_len: row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)] else: row_lengths = [ ( index_chunksize if (i + 1) * index_chunksize < index_len else max(0, index_len - (index_chunksize * i)) ) for i in range(num_partitions) ] return index, row_lengths @classmethod def build_columns(cls, columns, num_row_parts=None): """ Split columns into chunks that should be read by workers. Parameters ---------- columns : list List of columns that should be read from file. num_row_parts : int, optional Number of parts the dataset is split into. This parameter is used to align the column partitioning with it so we won't end up with an over partitioned frame. Returns ------- col_partitions : list List of lists with columns for reading by workers. column_widths : list List with lengths of `col_partitions` subarrays (number of columns that should be read by workers). 
""" columns_length = len(columns) if columns_length == 0: return [], [] if num_row_parts is None: # in column formats we mostly read columns in parallel rather than rows, # so we try to chunk columns as much as possible min_block_size = 1 else: num_remaining_parts = round(NPartitions.get() / num_row_parts) min_block_size = min( columns_length // num_remaining_parts, MinColumnPartitionSize.get() ) column_splits = compute_chunksize( columns_length, NPartitions.get(), max(1, min_block_size) ) col_partitions = [ columns[i : i + column_splits] for i in range(0, columns_length, column_splits) ] column_widths = [len(c) for c in col_partitions] return col_partitions, column_widths @classmethod def build_dtypes(cls, partition_ids, columns): """ Compute common for all partitions `dtypes` for each of the DataFrame column. Parameters ---------- partition_ids : list Array with references to the partitions data. columns : list List of columns that should be read from file. Returns ------- dtypes : pandas.Series Series with dtypes for columns. """ dtypes = pandas.concat(cls.materialize(list(partition_ids)), axis=0) dtypes.index = columns return dtypes @classmethod def build_query_compiler(cls, path, columns, **kwargs): """ Build query compiler from deployed tasks outputs. Parameters ---------- path : str, path object or file-like object Path to the file to read. columns : list List of columns that should be read from file. **kwargs : dict Parameters of deploying read_* function. Returns ------- new_query_compiler : BaseQueryCompiler Query compiler with imported data for further processing. 
""" col_partitions, column_widths = cls.build_columns(columns) partition_ids = cls.call_deploy(path, col_partitions, **kwargs) index, row_lens = cls.build_index(partition_ids) remote_parts = cls.build_partition(partition_ids[:-2], row_lens, column_widths) dtypes = ( cls.build_dtypes(partition_ids[-1], columns) if len(partition_ids) > 0 else None ) new_query_compiler = cls.query_compiler_cls( cls.frame_cls( remote_parts, index, columns, row_lens, column_widths, dtypes=dtypes, ) ) return new_query_compiler ================================================ FILE: modin/core/io/column_stores/feather_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses `FeatherDispatcher` class, that is used for reading `.feather` files.""" from pandas.io.common import stringify_path from modin.core.io.column_stores.column_store_dispatcher import ColumnStoreDispatcher from modin.core.io.file_dispatcher import OpenFile from modin.utils import import_optional_dependency class FeatherDispatcher(ColumnStoreDispatcher): """Class handles utils for reading `.feather` files.""" @classmethod def _read(cls, path, columns=None, **kwargs): """ Read data from the file path, returning a query compiler. 
Parameters ---------- path : str or file-like object The filepath of the feather file. columns : array-like, optional Columns to read from file. If not provided, all columns are read. **kwargs : dict `read_feather` function kwargs. Returns ------- BaseQueryCompiler Query compiler with imported data for further processing. Notes ----- `PyArrow` engine and local files only are supported for now, multi threading is set to False by default. PyArrow feather is used. Please refer to the documentation here https://arrow.apache.org/docs/python/api.html#feather-format """ path = stringify_path(path) path = cls.get_path(path) if columns is None: import_optional_dependency( "pyarrow", "pyarrow is required to read feather files." ) from pyarrow import ipc with OpenFile( path, **(kwargs.get("storage_options", None) or {}), ) as file: # Opens the file to extract its metadata reader = ipc.open_file(file) # TODO: pyarrow's schema contains much more metadata than just column names, it also # has dtypes and index information that we could use when building a dataframe index_cols = frozenset( col for col in reader.schema.pandas_metadata["index_columns"] # 'index_columns' field may also contain dictionary fields describing actual # RangeIndices, so we're only filtering here for string column names if isinstance(col, str) ) # Filtering out the columns that describe the frame's index columns = [col for col in reader.schema.names if col not in index_cols] return cls.build_query_compiler( path, columns, use_threads=False, storage_options=kwargs["storage_options"], dtype_backend=kwargs["dtype_backend"], ) ================================================ FILE: modin/core/io/column_stores/hdf_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses `HDFDispatcher` class, that is used for reading hdf data.""" import pandas from modin.core.io.column_stores.column_store_dispatcher import ColumnStoreDispatcher class HDFDispatcher(ColumnStoreDispatcher): # pragma: no cover """ Class handles utils for reading hdf data. Inherits some common for columnar store files util functions from `ColumnStoreDispatcher` class. """ @classmethod def _validate_hdf_format(cls, path_or_buf): """ Validate `path_or_buf` and then return `table_type` parameter of store group attribute. Parameters ---------- path_or_buf : str, buffer or path object Path to the file to open, or an open :class:`pandas.HDFStore` object. Returns ------- str `table_type` parameter of store group attribute. """ s = pandas.HDFStore(path_or_buf) groups = s.groups() if len(groups) == 0: raise ValueError("No dataset in HDF5 file.") candidate_only_group = groups[0] format = getattr(candidate_only_group._v_attrs, "table_type", None) s.close() return format @classmethod def _read(cls, path_or_buf, **kwargs): """ Load an h5 file from the file path or buffer, returning a query compiler. Parameters ---------- path_or_buf : str, buffer or path object Path to the file to open, or an open :class:`pandas.HDFStore` object. **kwargs : dict Pass into pandas.read_hdf function. Returns ------- BaseQueryCompiler Query compiler with imported data for further processing. 
""" if cls._validate_hdf_format(path_or_buf=path_or_buf) is None: return cls.single_worker_read( path_or_buf, reason="File format seems to be `fixed`. For better distribution consider " + "saving the file in `table` format. df.to_hdf(format=`table`).", **kwargs ) columns = kwargs.pop("columns", None) # Have to do this because of Dask's keyword arguments kwargs["_key"] = kwargs.pop("key", None) if not columns: start = kwargs.pop("start", None) stop = kwargs.pop("stop", None) empty_pd_df = pandas.read_hdf(path_or_buf, start=0, stop=0, **kwargs) if start is not None: kwargs["start"] = start if stop is not None: kwargs["stop"] = stop columns = empty_pd_df.columns return cls.build_query_compiler(path_or_buf, columns, **kwargs) ================================================ FILE: modin/core/io/column_stores/parquet_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class ColumnStoreDataset:
    """
    Engine-agnostic view of a Parquet dataset.

    Subclasses supply the engine-specific pieces (metadata, column names, file
    listing, reading), while this base class resolves the filesystem and the
    filesystem-specific path lazily.

    Attributes
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    storage_options : dict
        Parameters for specific storage engine.
    _fs_path : str, path object or file-like object
        The filepath or handle of the parquet dataset specific to the
        filesystem implementation. E.g. for `s3://test/example`, _fs
        would be set to S3FileSystem and _fs_path would be `test/example`.
    _fs : Filesystem
        Filesystem object specific to the given parquet file/dataset.
    dataset : ParquetDataset or ParquetFile
        Underlying dataset implementation for PyArrow and fastparquet
        respectively.
    """

    def __init__(self, path, storage_options):  # noqa : PR01
        # Path-like objects are converted to their plain string form up front.
        if isinstance(path, os.PathLike):
            path = path.__fspath__()
        self.path = path
        self.storage_options = storage_options
        # Both are resolved lazily by the `fs` / `fs_path` properties.
        self._fs_path = None
        self._fs = None
        self.dataset = self._init_dataset()

    @property
    def pandas_metadata(self):
        """Return the pandas metadata of the dataset."""
        raise NotImplementedError

    @property
    def columns(self):
        """Return the list of columns in the dataset."""
        raise NotImplementedError

    @property
    def engine(self):
        """Return string representing what engine is being used."""
        raise NotImplementedError

    @functools.cached_property
    def files(self):
        """Return the list of formatted file paths of the dataset."""
        raise NotImplementedError

    @functools.cached_property
    def row_groups_per_file(self):
        """Return a list with the number of row groups per file."""
        raise NotImplementedError

    @property
    def fs(self):
        """
        Return the filesystem object associated with the dataset path.

        Returns
        -------
        filesystem
            Filesystem object.
        """
        if self._fs is not None:
            return self._fs
        if isinstance(self.path, AbstractBufferedFile):
            # An open fsspec file already knows its filesystem.
            self._fs = self.path.fs
        else:
            self._fs, self._fs_path = url_to_fs(self.path, **self.storage_options)
        return self._fs

    @property
    def fs_path(self):
        """
        Return the filesystem-specific path or file handle.

        Returns
        -------
        fs_path : str, path object or file-like object
            String path specific to filesystem or a file handle.
        """
        if self._fs_path is not None:
            return self._fs_path
        if isinstance(self.path, AbstractBufferedFile):
            # For an open handle, the handle itself is what gets read.
            self._fs_path = self.path
        else:
            self._fs, self._fs_path = url_to_fs(self.path, **self.storage_options)
        return self._fs_path

    def to_pandas_dataframe(self, columns):
        """
        Read the given columns as a pandas dataframe.

        Parameters
        ----------
        columns : list
            List of columns that should be read from file.
        """
        raise NotImplementedError

    def _get_files(self, files):
        """
        Retrieve list of formatted file names in dataset path.

        Parameters
        ----------
        files : list
            List of files from path.

        Returns
        -------
        fs_files : list
            List of files from path with fs-protocol prepended.
        """
        if isinstance(self.path, AbstractBufferedFile):
            return [self.path]

        # Older versions of fsspec don't support unstrip_protocol(). It
        # was only added relatively recently:
        # https://github.com/fsspec/filesystem_spec/pull/828
        def _unstrip_protocol(protocol, path):
            candidates = (protocol,) if isinstance(protocol, str) else protocol
            if any(path.startswith(f"{proto}://") for proto in candidates):
                return path
            return f"{candidates[0]}://{path}"

        # version.parse() is expensive, so the version is checked once and
        # each branch runs its own loop.
        if version.parse(fsspec.__version__) < version.parse("2022.5.0"):
            return [_unstrip_protocol(self.fs.protocol, fpath) for fpath in files]
        return [self.fs.unstrip_protocol(fpath) for fpath in files]
# fastparquet-backed implementation of `ColumnStoreDataset`; member docstrings
# are inherited from the base class through `_inherit_docstrings`.
@_inherit_docstrings(ColumnStoreDataset)
class FastParquetDataset(ColumnStoreDataset):
    def _init_dataset(self):  # noqa: GL08
        from fastparquet import ParquetFile

        return ParquetFile(self.fs_path, fs=self.fs)

    @property
    def pandas_metadata(self):
        # The pandas metadata is stored as a JSON string under the "pandas" key.
        if "pandas" not in self.dataset.key_value_metadata:
            return {}
        return json.loads(self.dataset.key_value_metadata["pandas"])

    @property
    def columns(self):
        return self.dataset.columns

    @property
    def engine(self):
        return "fastparquet"

    @functools.cached_property
    def row_groups_per_file(self):
        from fastparquet import ParquetFile

        row_groups_per_file = []
        # Count up the total number of row groups across all files and
        # keep track of row groups per file to use later.
        for file in self.files:
            with self.fs.open(file) as f:
                row_groups = ParquetFile(f).info["row_groups"]
                row_groups_per_file.append(row_groups)
        return row_groups_per_file

    @functools.cached_property
    def files(self):
        return self._get_files(self._get_fastparquet_files())

    def to_pandas_dataframe(self, columns):
        return self.dataset.to_pandas(columns=columns)

    # Karthik Velayutham writes:
    #
    # fastparquet doesn't have a nice method like PyArrow, so we
    # have to copy some of their logic here while we work on getting
    # an easier method to get a list of valid files.
    # See: https://github.com/dask/fastparquet/issues/795
    def _get_fastparquet_files(self):  # noqa: GL08
        if isinstance(self.path, AbstractBufferedFile):
            # An open file handle is the only "file" of the dataset. Without
            # this guard `"*" in self.path` would iterate over the handle
            # itself instead of inspecting a path string.
            return [self.path]

        if "*" in self.path:
            return self.fs.glob(self.path)

        # (Resolving issue #6778)
        #
        # Users will pass in a directory to a delta table, which stores parquet
        # files in various directories along with other, non-parquet files. We
        # need to identify those parquet files and not the non-parquet files.
        #
        # However, we also need to support users passing in explicit files that
        # don't necessarily have the `.parq` or `.parquet` extension -- if a user
        # says that a file is parquet, then we should probably give it a shot.
        if self.fs.isfile(self.path):
            return self.fs.find(self.path)
        return [
            f for f in self.fs.find(self.path) if f.endswith((".parquet", ".parq"))
        ]
@classmethod
def _determine_partitioning(
    cls, dataset: ColumnStoreDataset
) -> "list[list[ParquetFileToRead]]":
    """
    Assign the dataset's row groups to row partitions.

    Row groups are handed out in file order, so that every partition receives
    either ``num_row_groups // num_splits`` row groups or one more than that.

    Parameters
    ----------
    dataset : ColumnStoreDataset
        Dataset whose ``files`` and ``row_groups_per_file`` are distributed.

    Returns
    -------
    list[list[ParquetFileToRead]]
        Each element in the returned list describes a list of files that a partition has to read.
    """
    from modin.core.storage_formats.pandas.parsers import ParquetFileToRead

    parquet_files = dataset.files
    row_groups_per_file = dataset.row_groups_per_file
    num_row_groups = sum(row_groups_per_file)

    if num_row_groups == 0:
        return []

    num_splits = min(NPartitions.get(), num_row_groups)
    # If 'num_splits' does not divide 'num_row_groups' then the base size alone
    # can't cover all of the row groups: exactly 'remainder' partitions have to
    # read 'base_size + 1' row groups.
    base_size, remainder = divmod(num_row_groups, num_splits)
    part_sizes = [base_size] * (num_splits - remainder) + [base_size + 1] * remainder

    assignments = []
    current_file = 0
    next_row_group = 0
    remaining_in_file = row_groups_per_file[current_file]
    # used for the sanity check at the end, verifying that all of the row
    # groups were indeed handed out
    assigned_total = 0
    for quota in part_sizes:
        taken = 0
        chunk = []
        while taken != quota:
            if remaining_in_file < 1:
                # current file is exhausted, move on to the next one
                current_file += 1
                next_row_group = 0
                remaining_in_file = row_groups_per_file[current_file]
            count = min(quota - taken, remaining_in_file)
            chunk.append(
                ParquetFileToRead(
                    parquet_files[current_file],
                    row_group_start=next_row_group,
                    row_group_end=next_row_group + count,
                )
            )
            remaining_in_file -= count
            taken += count
            next_row_group += count
        assigned_total += taken
        assignments.append(chunk)

    ErrorMessage.catch_bugs_and_request_email(
        failure_condition=(
            len(assignments) != num_splits or assigned_total != num_row_groups
        ),
        extra_log="row groups added does not match total num of row groups across parquet files",
    )
    return assignments
@classmethod
def build_index(cls, dataset, partition_ids, index_columns, filters):
    """
    Compute index and its split sizes of resulting Modin DataFrame.

    Parameters
    ----------
    dataset : Dataset
        Dataset object of Parquet file/files.
    partition_ids : list
        Array with references to the partitions data.
    index_columns : list
        List of index columns specified by pandas metadata.
    filters : list
        List of filters to be used in reading the Parquet file/files.

    Returns
    -------
    index : pandas.Index
        Index of resulting Modin DataFrame.
    needs_index_sync : bool
        Whether the partition indices need to be synced with frame
        index because there's no index column, or at least one
        index column is a RangeIndex.

    Notes
    -----
    See `build_partition` for more detail on the contents of partitions_ids.
    """
    range_index = True
    range_index_metadata = None
    column_names_to_read = []
    for column in index_columns:
        # https://pandas.pydata.org/docs/development/developer.html#storing-pandas-dataframe-objects-in-apache-parquet-format
        # describes the format of the index column metadata.
        # It is a list, where each entry is either a string or a dictionary.
        # A string means that a column stored in the dataset is (part of) the index.
        # A dictionary is metadata about a RangeIndex, which is metadata-only and not stored
        # in the dataset as a column.
        # There cannot be both for a single dataframe, because a MultiIndex can only contain
        # "actual data" columns and not RangeIndex objects.
        # See similar code in pyarrow: https://github.com/apache/arrow/blob/44811ba18477560711d512939535c8389dd7787b/python/pyarrow/pandas_compat.py#L912-L926
        # and in fastparquet, here is where RangeIndex is handled: https://github.com/dask/fastparquet/blob/df1219300a96bc1baf9ebad85f4f5676a130c9e8/fastparquet/api.py#L809-L815
        if isinstance(column, str):
            column_names_to_read.append(column)
            range_index = False
        elif column["kind"] == "range":
            range_index_metadata = column

    # When the index has meaningful values, stored in a column, we will replicate those
    # exactly in the Modin dataframe's index. This index may have repeated values, be unsorted,
    # etc. This is all fine.
    # A range index is the special case: we want the Modin dataframe to have a single range,
    # not a range that keeps restarting. i.e. if the partitions have index 0-9, 0-19, 0-29,
    # we want our Modin dataframe to have 0-59.
    # When there are no filters, it is relatively cheap to construct the index by
    # actually reading in the necessary data, here in the main process.
    # When there are filters, we let the workers materialize the indices before combining to
    # get a single range.

    # For the second check, let us consider the case where we have an empty dataframe,
    # that has a valid index.
    if (range_index and filters is None) or (
        len(partition_ids) == 0 and len(column_names_to_read) != 0
    ):
        complete_index = dataset.to_pandas_dataframe(
            columns=column_names_to_read
        ).index
    # Empty DataFrame case
    elif len(partition_ids) == 0:
        return [], False
    else:
        index_ids = [part_id[0][1] for part_id in partition_ids if len(part_id) > 0]
        index_objs = cls.materialize(index_ids)
        if range_index:
            # There are filters, so we had to materialize in order to
            # determine how many items there actually are
            total_filtered_length = sum(
                len(index_part) for index_part in index_objs
            )

            metadata_length_mismatch = False
            if range_index_metadata is not None:
                # The number of elements of a range is NOT `(stop - start) / step`
                # when `step` does not evenly divide the span (e.g. 0..5 with
                # step 2 has 3 elements, not 2.5), so let `range` compute it.
                metadata_implied_length = len(
                    range(
                        range_index_metadata["start"],
                        range_index_metadata["stop"],
                        range_index_metadata["step"],
                    )
                )
                metadata_length_mismatch = (
                    total_filtered_length != metadata_implied_length
                )

            # pyarrow ignores the RangeIndex metadata if it is not consistent with data length.
            # https://github.com/apache/arrow/blob/44811ba18477560711d512939535c8389dd7787b/python/pyarrow/pandas_compat.py#L924-L926
            # fastparquet keeps the start and step from the metadata and just adjusts to the length.
            # https://github.com/dask/fastparquet/blob/df1219300a96bc1baf9ebad85f4f5676a130c9e8/fastparquet/api.py#L815
            if range_index_metadata is None or (
                isinstance(dataset, PyArrowDataset) and metadata_length_mismatch
            ):
                complete_index = pandas.RangeIndex(total_filtered_length)
            else:
                complete_index = pandas.RangeIndex(
                    start=range_index_metadata["start"],
                    step=range_index_metadata["step"],
                    stop=(
                        range_index_metadata["start"]
                        + (total_filtered_length * range_index_metadata["step"])
                    ),
                    name=range_index_metadata["name"],
                )
        else:
            # Stored index columns: concatenate the per-partition indices in order.
            complete_index = index_objs[0].append(index_objs[1:])
    return complete_index, range_index or (len(index_columns) == 0)
column_widths : list of ints Returns ------- remote_parts : np.ndarray row_lengths : list of ints or None column_widths : list of ints """ if len(remote_parts) == 0: return remote_parts, row_lengths, column_widths from modin.core.storage_formats.pandas.utils import get_length_list # The code in this function is actually a duplication of what 'BaseQueryCompiler.repartition()' does, # however this implementation works much faster for some reason actual_row_nparts = remote_parts.shape[0] if row_lengths is not None: desired_row_nparts = max( 1, min(sum(row_lengths) // MinRowPartitionSize.get(), NPartitions.get()) ) else: desired_row_nparts = actual_row_nparts # only repartition along rows if the actual number of row splits 1.5 times SMALLER than desired if 1.5 * actual_row_nparts < desired_row_nparts: # assuming that the sizes of parquet's row groups are more or less equal, # so trying to use the same number of splits for each partition splits_per_partition = desired_row_nparts // actual_row_nparts remainder = desired_row_nparts % actual_row_nparts new_parts = [] new_row_lengths = [] for row_idx, (part_len, row_parts) in enumerate( zip(row_lengths, remote_parts) ): num_splits = splits_per_partition # 'remainder' indicates how many partitions have to be split into 'num_splits + 1' splits # to have exactly 'desired_row_nparts' in the end if row_idx < remainder: num_splits += 1 if num_splits == 1: new_parts.append(row_parts) new_row_lengths.append(part_len) continue offset = len(new_parts) # adding empty row parts according to the number of splits new_parts.extend([[] for _ in range(num_splits)]) for part in row_parts: split = cls.frame_cls._partition_mgr_cls._column_partitions_class( [part] ).apply( lambda df: df, num_splits=num_splits, maintain_partitioning=False, ) for i in range(num_splits): new_parts[offset + i].append(split[i]) new_row_lengths.extend( get_length_list(part_len, num_splits, MinRowPartitionSize.get()) ) remote_parts = np.array(new_parts) row_lengths = 
@classmethod
def build_query_compiler(cls, dataset, columns, index_columns, **kwargs):
    """
    Assemble a query compiler from the outputs of the deployed read tasks.

    Parameters
    ----------
    dataset : Dataset
        Dataset object of Parquet file/files.
    columns : list
        List of columns that should be read from file.
    index_columns : list
        List of index columns specified by pandas metadata.
    **kwargs : dict
        Parameters of deploying read_* function.

    Returns
    -------
    new_query_compiler : BaseQueryCompiler
        Query compiler with imported data for further processing.
    """
    # `storage_options` is handed to `call_deploy` explicitly, so it is
    # removed from the keyword arguments forwarded to the read tasks.
    storage_options = kwargs.pop("storage_options", {}) or {}
    filters = kwargs.get("filters", None)

    files_per_partition = cls._determine_partitioning(dataset)
    col_partitions, column_widths = cls.build_columns(
        columns,
        num_row_parts=len(files_per_partition),
    )
    partition_ids = cls.call_deploy(
        files_per_partition,
        col_partitions,
        storage_options,
        dataset.engine,
        **kwargs,
    )
    index, sync_index = cls.build_index(
        dataset, partition_ids, index_columns, filters
    )
    remote_parts = cls.build_partition(partition_ids, column_widths)

    # Row lengths are taken from the first column of partitions; with no
    # partitions at all there is nothing to measure.
    row_lengths = (
        [part.length() for part in remote_parts.T[0]]
        if len(partition_ids) > 0
        else None
    )
    remote_parts, row_lengths, column_widths = cls._normalize_partitioning(
        remote_parts, row_lengths, column_widths
    )

    # A single column level recorded with numpy type "int64" means the column
    # labels are cast to integers.
    metadata = dataset.pandas_metadata
    if metadata and "column_indexes" in metadata:
        column_indexes = metadata["column_indexes"]
        if len(column_indexes) == 1 and column_indexes[0]["numpy_type"] == "int64":
            columns = pandas.Index(columns).astype("int64").to_list()

    frame = cls.frame_cls(
        remote_parts,
        index,
        columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
        dtypes=None,
    )
    if sync_index:
        frame.synchronize_labels(axis=0)
    return cls.query_compiler_cls(frame)
Please refer to the documentation here https://arrow.apache.org/docs/python/parquet.html """ if ( (set(kwargs) - {"storage_options", "filters", "filesystem"}) or use_nullable_dtypes != lib.no_default or kwargs.get("filesystem") is not None ): return cls.single_worker_read( path, engine=engine, columns=columns, use_nullable_dtypes=use_nullable_dtypes, dtype_backend=dtype_backend, reason="Parquet options that are not currently supported", **kwargs, ) path = stringify_path(path) if isinstance(path, list): # TODO(https://github.com/modin-project/modin/issues/5723): read all # files in parallel. compilers: list[cls.query_compiler_cls] = [ cls._read( p, engine, columns, use_nullable_dtypes, dtype_backend, **kwargs ) for p in path ] return compilers[0].concat(axis=0, other=compilers[1:], ignore_index=True) if isinstance(path, str): if os.path.isdir(path): path_generator = os.walk(path) else: storage_options = kwargs.get("storage_options") if storage_options is not None: fs, fs_path = url_to_fs(path, **storage_options) else: fs, fs_path = url_to_fs(path) path_generator = fs.walk(fs_path) partitioned_columns = set() # We do a tree walk of the path directory because partitioned # parquet directories have a unique column at each directory level. 
@classmethod
def write(cls, qc, **kwargs):
    """
    Write a ``DataFrame`` to the binary parquet format.

    When the target path is a string, it is treated as a directory and every
    row chunk is written into its own ``part-XXXX`` file inside it. Any other
    kind of target is delegated to ``cls.base_io.to_parquet``.

    Parameters
    ----------
    qc : BaseQueryCompiler
        The query compiler of the Modin dataframe that we want to run `to_parquet` on.
    **kwargs : dict
        Parameters for `pandas.to_parquet(**kwargs)`.
    """
    output_path = stringify_path(kwargs["path"])
    kwargs["path"] = output_path
    if not isinstance(output_path, str):
        return cls.base_io.to_parquet(qc, **kwargs)

    storage_options = kwargs.get("storage_options") or {}
    client_kwargs = storage_options.get("client_kwargs", {})
    fs, url = fsspec.core.url_to_fs(output_path, client_kwargs=client_kwargs)
    # The output path is a directory holding one file per row chunk.
    fs.mkdirs(url, exist_ok=True)

    def func(df, **kw):  # pragma: no cover
        """
        Dump a chunk of rows as parquet, then save them to target maintaining order.

        Parameters
        ----------
        df : pandas.DataFrame
            A chunk of rows to write to a parquet file.
        **kw : dict
            Arguments to pass to ``pandas.to_parquet(**kwargs)`` plus an extra argument
            `partition_idx` serving as chunk index to maintain rows order.
        """
        codec = kwargs["compression"]
        chunk_no = kw["partition_idx"]
        # The zero-padded chunk number in the file name keeps the row order.
        kwargs["path"] = f"{output_path}/part-{chunk_no:04d}.{codec}.parquet"
        df.to_parquet(**kwargs)
        return pandas.DataFrame()

    frame = qc._modin_frame
    # Ensure that the metadata is synchronized
    frame._propagate_index_objs(axis=None)
    result = frame._partition_mgr_cls.map_axis_partitions(
        axis=1,
        partitions=frame._partitions,
        map_func=func,
        keep_partitioning=True,
        lengths=None,
        enumerate_partitions=True,
    )
    # pending completion
    pending = [part.list_of_blocks[0] for row in result for part in row]
    cls.materialize(pending)
class OpenFile:
    """
    Context manager that opens an input file through fsspec.

    The file is opened on ``__enter__`` and closed on ``__exit__``. The class
    exists to encapsulate the special behavior in ``__enter__`` around
    anon=False and anon=True for s3 buckets.

    Parameters
    ----------
    file_path : str
        String that represents the path to the file (paths to S3 buckets
        are also acceptable).
    mode : str, default: "rb"
        String, which defines which mode file should be open.
    compression : str, default: "infer"
        File compression name.
    **kwargs : dict
        Keywords arguments to be passed into ``fsspec.open`` function.

    Attributes
    ----------
    file_path : str
        String that represents the path to the file
    mode : str
        String that defines which mode the file should be opened in.
    compression : str
        File compression name.
    file : fsspec.core.OpenFile
        The opened file.
    kwargs : dict
        Keywords arguments to be passed into ``fsspec.open`` function.
    """

    def __init__(self, file_path, mode="rb", compression="infer", **kwargs):
        self.file_path, self.mode, self.compression = file_path, mode, compression
        self.kwargs = kwargs

    def __enter__(self):
        """
        Open the file with fsspec and return the opened file.

        If the first attempt fails with a credential/permission error, the
        file is opened again with ``anon=True``.

        Returns
        -------
        fsspec.core.OpenFile
            The opened file.
        """
        # botocore is optional: without it only PermissionError triggers the
        # anonymous retry.
        try:
            from botocore.exceptions import NoCredentialsError
        except ModuleNotFoundError:
            credential_error_type = (PermissionError,)
        else:
            credential_error_type = (NoCredentialsError, PermissionError)

        open_args = (self.file_path, self.mode, self.compression)
        self.file = fsspec.open(*open_args, **self.kwargs)
        try:
            return self.file.open()
        except credential_error_type:
            self.kwargs["anon"] = True
            self.file = fsspec.open(*open_args, **self.kwargs)
            return self.file.open()

    def __exit__(self, *args):
        """
        Close the file.

        Parameters
        ----------
        *args : any type
            Variable positional arguments, all unused.
        """
        self.file.close()
@classmethod
def get_path(cls, file_path):
    """
    Normalize `file_path` depending on whether it is a URL or a local path.

    Parameters
    ----------
    file_path : str, os.PathLike[str] object or file-like object
        The file, or a path to the file. Paths to S3 buckets are also
        acceptable.

    Returns
    -------
    str
        Updated or verified `file_path` parameter.

    Notes
    -----
    If `file_path` is a URL, the parameter is returned as is, otherwise
    its absolute path is returned.
    """
    is_remote = is_fsspec_url(file_path) or is_url(file_path)
    return file_path if is_remote else os.path.abspath(file_path)
""" cur_pos = f.tell() f.seek(0, os.SEEK_END) size = f.tell() f.seek(cur_pos, os.SEEK_SET) return size @classmethod def file_exists(cls, file_path, storage_options=None): """ Check if `file_path` exists. Parameters ---------- file_path : str String that represents the path to the file (paths to S3 buckets are also acceptable). storage_options : dict, optional Keyword from `read_*` functions. Returns ------- bool Whether file exists or not. """ if not is_fsspec_url(file_path) and not is_url(file_path): return os.path.exists(file_path) try: from botocore.exceptions import ( ConnectTimeoutError, EndpointConnectionError, NoCredentialsError, ) credential_error_type = ( NoCredentialsError, PermissionError, EndpointConnectionError, ConnectTimeoutError, ) except ModuleNotFoundError: credential_error_type = (PermissionError,) if storage_options is not None: new_storage_options = dict(storage_options) new_storage_options.pop("anon", None) else: new_storage_options = {} fs, _ = fsspec.core.url_to_fs(file_path, **new_storage_options) exists = False try: exists = fs.exists(file_path) except credential_error_type: fs, _ = fsspec.core.url_to_fs(file_path, anon=True, **new_storage_options) exists = fs.exists(file_path) return exists @classmethod def deploy(cls, func, *args, num_returns=1, **kwargs): # noqa: PR01 """ Deploy remote task. Should be implemented in the task class (for example in the `RayWrapper`). """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) def parse(self, func, args, num_returns): # noqa: PR01 """ Parse file's data in the worker process. Should be implemented in the parser class (for example in the `PandasCSVParser`). """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def materialize(cls, obj_id): # noqa: PR01 """ Get results from worker. Should be implemented in the task class (for example in the `RayWrapper`). 
def build_partition(cls, partition_ids, row_lengths, column_widths):
    """
    Wrap partition references into a 2D array of `cls.frame_partition_cls` objects.

    Parameters
    ----------
    partition_ids : list
        Nested (rows of columns) collection of references to the partitions data.
    row_lengths : list
        Number of rows in each row of partitions.
    column_widths : list
        Number of columns in each column of partitions.

    Returns
    -------
    np.ndarray
        Array with the same layout as `partition_ids`, filled with
        partition objects.
    """
    grid = []
    for row_idx, row_refs in enumerate(partition_ids):
        row_parts = []
        for col_idx, ref in enumerate(row_refs):
            # every partition carries its own length/width metadata
            row_parts.append(
                cls.frame_partition_cls(
                    ref,
                    length=row_lengths[row_idx],
                    width=column_widths[col_idx],
                )
            )
        grid.append(row_parts)
    return np.array(grid)
def _maybe_warn_on_default(cls, *, message: str = "", reason: str = "") -> None:
    """
    Emit the default-to-pandas warning unless this class has it switched off.

    Parameters
    ----------
    message : str, default: ""
        Method that is causing a default to pandas.
    reason : str, default: ""
        Reason for default.
    """
    if not cls._should_warn_on_default_to_pandas:
        # warnings are disabled for this IO class
        return
    ErrorMessage.default_to_pandas(message=message, reason=reason)
def from_ray(cls, ray_obj):
    """
    Create a Modin `query_compiler` from a Ray Dataset.

    Parameters
    ----------
    ray_obj : ray.data.Dataset
        The Ray Dataset to convert from.

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler containing data from the Ray Dataset.

    Raises
    ------
    RuntimeError
        Always, in this base implementation.

    Notes
    -----
    Ray Dataset can only be converted to a Modin DataFrame if Modin uses a Ray engine.
    If another engine is used, the runtime exception will be raised.
    """
    # The message describes the Ray -> Modin direction this method implements
    # (the previous text was the one belonging to `to_ray`).
    raise RuntimeError(
        "Ray Dataset can only be converted to a Modin DataFrame if Modin uses a Ray engine."
    )


@classmethod
def from_dask(cls, dask_obj):
    """
    Create a Modin `query_compiler` from a Dask DataFrame.

    Parameters
    ----------
    dask_obj : dask.dataframe.DataFrame
        The Dask DataFrame to convert from.

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler containing data from the Dask DataFrame.

    Raises
    ------
    RuntimeError
        Always, in this base implementation.

    Notes
    -----
    Dask DataFrame can only be converted to a Modin DataFrame if Modin uses a Dask engine.
    If another engine is used, the runtime exception will be raised.
    """
    # The message describes the Dask -> Modin direction this method implements
    # (the previous text was the one belonging to `to_dask`).
    raise RuntimeError(
        "Dask DataFrame can only be converted to a Modin DataFrame if Modin uses a Dask engine."
    )
def read_html(
    cls,
    io,
    *,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
    **kwargs,
):  # noqa: PR01
    cls._maybe_warn_on_default(message="`read_html`")
    # Gather the explicitly named options first; anything extra the caller
    # supplied is forwarded untouched through ``kwargs``.
    parser_options = {
        "io": io,
        "match": match,
        "flavor": flavor,
        "header": header,
        "index_col": index_col,
        "skiprows": skiprows,
        "attrs": attrs,
        "parse_dates": parse_dates,
        "thousands": thousands,
        "encoding": encoding,
        "decimal": decimal,
        "converters": converters,
        "na_values": na_values,
        "keep_default_na": keep_default_na,
        "displayed_only": displayed_only,
    }
    tables = pandas.read_html(**parser_options, **kwargs)
    # Lazy generator: each parsed table is converted only when it is consumed.
    return (cls.from_pandas(table) for table in tables)
def read_hdf(
    cls,
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start=None,
    stop=None,
    columns=None,
    iterator=False,
    chunksize=None,
    **kwargs,
):  # noqa: PR01
    from modin.pandas.io import HDFStore

    cls._maybe_warn_on_default(message="`read_hdf`")
    modin_store = isinstance(path_or_buf, HDFStore)
    if modin_store:
        # Switch the Modin store off its Modin-returning mode while pandas
        # reads from it (presumably so pandas only sees pandas objects).
        path_or_buf._return_modin_dataframe = False
    try:
        df = pandas.read_hdf(
            path_or_buf,
            key=key,
            mode=mode,
            columns=columns,
            errors=errors,
            where=where,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            **kwargs,
        )
    finally:
        # Restore the flag even when pandas raises; otherwise the caller's
        # store would be left with the flag cleared after a failed read.
        if modin_store:
            path_or_buf._return_modin_dataframe = True

    return cls.from_pandas(df)
def read_fwf(
    cls,
    filepath_or_buffer,
    *,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    dtype_backend=no_default,
    iterator=False,
    chunksize=None,
    **kwds,
):  # noqa: PR01
    cls._maybe_warn_on_default(message="`read_fwf`")
    parsed = pandas.read_fwf(
        filepath_or_buffer,
        colspecs=colspecs,
        widths=widths,
        infer_nrows=infer_nrows,
        dtype_backend=dtype_backend,
        iterator=iterator,
        chunksize=chunksize,
        **kwds,
    )
    if isinstance(parsed, pandas.DataFrame):
        return cls.from_pandas(parsed)
    if isinstance(parsed, pandas.io.parsers.TextFileReader):
        # Replace the reader's ``read`` so that ``__next__`` and ``get_chunk``
        # hand back Modin objects instead of pandas ones.
        pandas_read = parsed.read

        def read_as_query_compiler(*args, **kwargs):
            return cls.from_pandas(pandas_read(*args, **kwargs))

        parsed.read = read_as_query_compiler
    return parsed
def read_sql_query(
    cls,
    sql,
    con,
    **kwargs,
):  # noqa: PR01
    cls._maybe_warn_on_default(message="`read_sql_query`")
    # Let pandas run the query, then wrap its result into a query compiler.
    query_result = pandas.read_sql_query(sql, con, **kwargs)
    return cls.from_pandas(query_result)
def to_pickle(
    cls,
    obj: Any,
    filepath_or_buffer,
    **kwargs,
):  # noqa: PR01, D200
    """
    Serialize `obj` with pickle and store it at `filepath_or_buffer`.
    """
    cls._maybe_warn_on_default(message="`to_pickle`")
    # A query compiler is converted to pandas before pickling; any other
    # object is pickled as it is.
    picklable = obj.to_pandas() if isinstance(obj, BaseQueryCompiler) else obj
    return pandas.to_pickle(
        picklable,
        filepath_or_buffer=filepath_or_buffer,
        **kwargs,
    )
def to_ray(cls, modin_obj):
    """
    Convert a Modin DataFrame/Series to a Ray Dataset.

    Parameters
    ----------
    modin_obj : modin.pandas.DataFrame, modin.pandas.Series
        The Modin DataFrame/Series to convert.

    Returns
    -------
    ray.data.Dataset
        Converted object with type depending on input.

    Notes
    -----
    This base implementation performs no conversion and always raises
    ``RuntimeError``: Modin DataFrame/Series can only be converted to a
    Ray Dataset if Modin uses a Ray engine.
    """
    unsupported_engine_msg = "Modin Dataframe can only be converted to a Ray Dataset if Modin uses a Ray engine."
    raise RuntimeError(unsupported_engine_msg)
Notes ----- Modin DataFrame/Series can only be converted to a Dask DataFrame/Series if Modin uses a Dask engine. If another engine is used, the runtime exception will be raised. """ raise RuntimeError( "Modin DataFrame can only be converted to a Dask DataFrame if Modin uses a Dask engine." ) ================================================ FILE: modin/core/io/sql/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """SQL format type IO functions implementations.""" ================================================ FILE: modin/core/io/sql/sql_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
def _is_supported_sqlalchemy_object(cls, obj):  # noqa: GL08
    """
    Tell whether `obj` is a SQLAlchemy ``Engine`` or ``Connection``.

    Parameters
    ----------
    obj : object
        Object to check.

    Returns
    -------
    bool
        True for a SQLAlchemy ``Engine``/``Connection``; False otherwise,
        including when SQLAlchemy cannot be imported.
    """
    try:
        import sqlalchemy as sa

        return isinstance(obj, (sa.engine.Engine, sa.engine.Connection))
    except ImportError:
        # without SQLAlchemy nothing can be one of its objects
        return False
""" if isinstance(con, str): con = ModinDatabaseConnection("sqlalchemy", con) if cls._is_supported_sqlalchemy_object(con): con = ModinDatabaseConnection( "sqlalchemy", con.engine.url.render_as_string(hide_password=False) ) if not isinstance(con, ModinDatabaseConnection): return cls.single_worker_read( sql, con=con, index_col=index_col, read_sql_engine=ReadSqlEngine.get(), reason="To use the parallel implementation of `read_sql`, pass either " + "a SQLAlchemy connectable, the SQL connection string, or a ModinDatabaseConnection " + "with the arguments required to make a connection, instead " + f"of {type(con)}. For documentation on the ModinDatabaseConnection, see " + "https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html#connecting-to-a-database-for-read-sql", **kwargs, ) row_count_query = con.row_count_query(sql) connection_for_pandas = con.get_connection() colum_names_query = con.column_names_query(sql) row_cnt = pandas.read_sql(row_count_query, connection_for_pandas).squeeze() cols_names_df = pandas.read_sql( colum_names_query, connection_for_pandas, index_col=index_col ) cols_names = cols_names_df.columns num_partitions = NPartitions.get() partition_ids = [None] * num_partitions index_ids = [None] * num_partitions dtypes_ids = [None] * num_partitions limit = math.ceil(row_cnt / num_partitions) for part in range(num_partitions): offset = part * limit query = con.partition_query(sql, limit, offset) *partition_ids[part], index_ids[part], dtypes_ids[part] = cls.deploy( func=cls.parse, f_kwargs={ "num_splits": num_partitions, "sql": query, "con": con, "index_col": index_col, "read_sql_engine": ReadSqlEngine.get(), **kwargs, }, num_returns=num_partitions + 2, ) partition_ids[part] = [ cls.frame_partition_cls(obj) for obj in partition_ids[part] ] if index_col is None: # sum all lens returned from partitions index_lens = cls.materialize(index_ids) new_index = pandas.RangeIndex(sum(index_lens)) else: # concat index returned from partitions index_lst = [ 
def write(cls, qc, **kwargs):
    """
    Write records stored in the `qc` to a SQL database.

    Parameters
    ----------
    qc : BaseQueryCompiler
        The query compiler of the Modin dataframe that we want to run ``to_sql`` on.
    **kwargs : dict
        Parameters for ``pandas.to_sql(**kwargs)``.
    """
    con = kwargs["con"]
    con_is_sqlalchemy = cls._is_supported_sqlalchemy_object(con)

    # Only connection strings and SQLAlchemy connectables are written
    # partition by partition; everything else goes through the base IO class.
    if not isinstance(con, str) and not con_is_sqlalchemy:
        return cls.base_io.to_sql(qc, **kwargs)

    if con_is_sqlalchemy:
        # SQLAlchemy Connection/Engine objects are not pickleable, so pass the
        # URL string instead and let every worker open its own connection.
        kwargs["con"] = con.engine.url.render_as_string(hide_password=False)

    # Insert an empty frame first: it creates the full table in the database
    # and lets pandas validate the arguments before any data is sent.
    header_only = qc.getitem_row_array([0]).to_pandas().head(0)
    header_only.to_sql(**kwargs)
    # from now on each partition appends its own rows to that table
    kwargs["if_exists"] = "append"
    column_labels = qc.columns

    def insert_partition(df):  # pragma: no cover
        """
        Override column names in the wrapped dataframe and convert it to SQL.

        Notes
        -----
        This function returns an empty ``pandas.DataFrame`` because ``apply_full_axis``
        expects a Frame object as a result of operation (and ``to_sql`` has no dataframe result).
        """
        df.columns = column_labels
        df.to_sql(**kwargs)
        return pandas.DataFrame()

    # Ensure that the metadata is synchronized
    qc._modin_frame._propagate_index_objs(axis=None)
    written = qc._modin_frame.apply_full_axis(
        1, insert_partition, new_index=[], new_columns=[]
    )
    # The mapping above is non-blocking; materializing the (empty) results is
    # what makes this call wait until every partition has been inserted.
    pending_blocks = [
        part.list_of_blocks[0] for row in written._partitions for part in row
    ]
    cls.materialize(pending_blocks)
class CSVDispatcher(TextFileDispatcher):
    """
    Class handles utils for reading `.csv` files.

    Notes
    -----
    The class body defines nothing of its own: every attribute and method
    is inherited unchanged from `TextFileDispatcher`.
    """
"""Module houses `ExcelDispatcher` class, that is used for reading excel files.""" import os import re import warnings from io import BytesIO import pandas from pandas.io.common import stringify_path from modin.config import NPartitions from modin.core.io.text.text_file_dispatcher import TextFileDispatcher from modin.pandas.io import ExcelFile EXCEL_READ_BLOCK_SIZE = 4096 class ExcelDispatcher(TextFileDispatcher): """Class handles utils for reading excel files.""" @classmethod def _read(cls, io, **kwargs): """ Read data from `io` according to the passed `read_excel` `kwargs` parameters. Parameters ---------- io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object `io` parameter of `read_excel` function. **kwargs : dict Parameters of `read_excel` function. Returns ------- new_query_compiler : BaseQueryCompiler Query compiler with imported data for further processing. """ io = stringify_path(io) if ( kwargs.get("engine", None) is not None and kwargs.get("engine") != "openpyxl" ): return cls.single_worker_read( io, reason="Modin only implements parallel `read_excel` with `openpyxl` engine, " + 'please specify `engine=None` or `engine="openpyxl"` to ' + "use Modin's parallel implementation.", **kwargs ) if kwargs.get("skiprows") is not None: return cls.single_worker_read( io, reason="Modin doesn't support 'skiprows' parameter of `read_excel`", **kwargs ) if isinstance(io, bytes): io = BytesIO(io) # isinstance(ExcelFile, os.PathLike) == True if not isinstance(io, (str, os.PathLike, BytesIO)) or isinstance( io, (ExcelFile, pandas.ExcelFile) ): if isinstance(io, ExcelFile): io._set_pandas_mode() return cls.single_worker_read( io, reason="Modin only implements parallel `read_excel` the following types of `io`: " + "str, os.PathLike, io.BytesIO.", **kwargs ) from zipfile import ZipFile from openpyxl.reader.excel import ExcelReader from openpyxl.worksheet._reader import WorksheetReader from openpyxl.worksheet.worksheet import Worksheet from 
modin.core.storage_formats.pandas.parsers import PandasExcelParser sheet_name = kwargs.get("sheet_name", 0) if sheet_name is None or isinstance(sheet_name, list): return cls.single_worker_read( io, reason="`read_excel` functionality is only implemented for a single sheet at a " + "time. Multiple sheet reading coming soon!", **kwargs ) warnings.warn( "Parallel `read_excel` is a new feature! If you run into any " + "problems, please visit https://github.com/modin-project/modin/issues. " + "If you find a new issue and can't file it on GitHub, please " + "email bug_reports@modin.org." ) # NOTE: ExcelReader() in read-only mode does not close file handle by itself # work around that by passing file object if we received some path io_file = open(io, "rb") if isinstance(io, (str, os.PathLike)) else io try: ex = ExcelReader(io_file, read_only=True) ex.read() wb = ex.wb # Get shared strings ex.read_manifest() ex.read_strings() ws = Worksheet(wb) finally: if isinstance(io, (str, os.PathLike)): # close only if it were us who opened the object io_file.close() pandas_kw = dict(kwargs) # preserve original kwargs with ZipFile(io) as z: # Convert index to sheet name in file if isinstance(sheet_name, int): sheet_name = "sheet{}".format(sheet_name + 1) else: sheet_name = "sheet{}".format(wb.sheetnames.index(sheet_name) + 1) if any(sheet_name.lower() in name for name in z.namelist()): sheet_name = sheet_name.lower() elif any(sheet_name.title() in name for name in z.namelist()): sheet_name = sheet_name.title() else: raise ValueError("Sheet {} not found".format(sheet_name.lower())) # Pass this value to the workers kwargs["sheet_name"] = sheet_name f = z.open("xl/worksheets/{}.xml".format(sheet_name)) f = BytesIO(f.read()) total_bytes = cls.file_size(f) # Read some bytes from the sheet so we can extract the XML header and first # line. We need to make sure we get the first line of the data as well # because that is where the column names are. 
The header information will # be extracted and sent to all of the nodes. sheet_block = f.read(EXCEL_READ_BLOCK_SIZE) end_of_row_tag = b"" while end_of_row_tag not in sheet_block: sheet_block += f.read(EXCEL_READ_BLOCK_SIZE) idx_of_header_end = sheet_block.index(end_of_row_tag) + len(end_of_row_tag) sheet_header_with_first_row = sheet_block[:idx_of_header_end] if kwargs["header"] is not None: # Reset the file pointer to begin at the end of the header information. f.seek(idx_of_header_end) sheet_header = sheet_header_with_first_row else: start_of_row_tag = b"" # Use openpyxml to parse the data common_args = ( ws, BytesIO(sheet_header_with_first_row + footer), ex.shared_strings, False, ) if cls.need_rich_text_param(): reader = WorksheetReader(*common_args, rich_text=False) else: reader = WorksheetReader(*common_args) # Attach cells to the worksheet reader.bind_cells() data = PandasExcelParser.get_sheet_data( ws, kwargs.get("convert_float", True) ) # Extract column names from parsed data. if kwargs["header"] is None: column_names = pandas.RangeIndex(len(data[0])) else: column_names = pandas.Index(data[0]) index_col = kwargs.get("index_col", None) # Remove column names that are specified as `index_col` if index_col is not None: column_names = column_names.drop(column_names[index_col]) if not all(column_names) or kwargs.get("usecols"): # some column names are empty, use pandas reader to take the names from it pandas_kw["nrows"] = 1 df = pandas.read_excel(io, **pandas_kw) column_names = df.columns # Compute partition metadata upfront so it is uniform for all partitions chunk_size = max(1, (total_bytes - f.tell()) // NPartitions.get()) column_widths, num_splits = cls._define_metadata( pandas.DataFrame(columns=column_names), column_names ) kwargs["fname"] = io # Skiprows will be used to inform a partition how many rows come before it. 
kwargs["skiprows"] = 0 row_count = 0 data_ids = [] index_ids = [] dtypes_ids = [] kwargs["num_splits"] = num_splits while f.tell() < total_bytes: args = kwargs args["skiprows"] = row_count + args["skiprows"] args["start"] = f.tell() chunk = f.read(chunk_size) # This edge case can happen when we have reached the end of the data # but not the end of the file. if b"" row_count = re.subn(row_close_tag, b"", chunk)[1] # Make sure we are reading at least one row. while row_count == 0: chunk += f.read(chunk_size) row_count += re.subn(row_close_tag, b"", chunk)[1] last_index = chunk.rindex(row_close_tag) f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1) args["end"] = f.tell() # If there is no data, exit before triggering computation. if b"" not in chunk and b"" in chunk: break remote_results_list = cls.deploy( func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) data_ids.append(remote_results_list[:-2]) index_ids.append(remote_results_list[-2]) dtypes_ids.append(remote_results_list[-1]) # The end of the spreadsheet if b"" in chunk: break # Compute the index based on a sum of the lengths of each partition (by default) # or based on the column(s) that were requested. if index_col is None: row_lengths = cls.materialize(index_ids) new_index = pandas.RangeIndex(sum(row_lengths)) else: index_objs = cls.materialize(index_ids) row_lengths = [len(o) for o in index_objs] new_index = index_objs[0].append(index_objs[1:]) data_ids = cls.build_partition(data_ids, row_lengths, column_widths) # Compute dtypes by getting collecting and combining all of the partitions. The # reported dtypes from differing rows can be different based on the inference in # the limited data seen by each worker. We use pandas to compute the exact dtype # over the whole column for each column. The index is set below. 
class FWFDispatcher(TextFileDispatcher):
    """Dispatcher with the utilities used to read fixed-width formatted tables."""

    @classmethod
    def check_parameters_support(
        cls,
        filepath_or_buffer,
        read_kwargs: dict,
        skiprows_md: Union[Sequence, callable, int],
        header_size: int,
    ) -> Tuple[bool, Optional[str]]:
        """
        Decide whether the given `read_fwf` parameters can be handled.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of `read_fwf` function.
        read_kwargs : dict
            Parameters of `read_fwf` function. Must contain the
            ``"infer_nrows"`` key.
        skiprows_md : int, array or callable
            `skiprows` parameter modified for easier handling by Modin.
        header_size : int
            Number of rows that are used by header.

        Returns
        -------
        bool
            Whether passed parameters are supported or not.
        Optional[str]
            `None` if parameters are supported, otherwise an error
            message describing why parameters are not supported.
        """
        infer_nrows_too_large = read_kwargs["infer_nrows"] > 100
        if not infer_nrows_too_large:
            # Nothing FWF-specific blocks the read; defer to the generic checks.
            return super().check_parameters_support(
                filepath_or_buffer, read_kwargs, skiprows_md, header_size
            )

        reason = "`infer_nrows` is a significant portion of the number of rows, so Pandas may be faster"
        return (False, reason)
class JSONDispatcher(TextFileDispatcher):
    """Class handles utils for reading `.json` files."""

    @classmethod
    def _read(cls, path_or_buf, **kwargs):
        """
        Read data from `path_or_buf` according to the passed `read_json` `kwargs` parameters.

        Only line-delimited JSON (``lines=True``) read from an existing path is
        split into partitions here; every other case is handed to
        `cls.single_worker_read`.

        Parameters
        ----------
        path_or_buf : str, path object or file-like object
            `path_or_buf` parameter of `read_json` function.
        **kwargs : dict
            Parameters of `read_json` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        path_or_buf = stringify_path(path_or_buf)
        path_or_buf = cls.get_path_or_buffer(path_or_buf)
        if isinstance(path_or_buf, str):
            if not cls.file_exists(
                path_or_buf, storage_options=kwargs.get("storage_options")
            ):
                return cls.single_worker_read(
                    path_or_buf, reason=cls._file_not_found_msg(path_or_buf), **kwargs
                )
            path_or_buf = cls.get_path(path_or_buf)
        elif not cls.pathlib_or_pypath(path_or_buf):
            return cls.single_worker_read(
                path_or_buf, reason=cls.BUFFER_UNSUPPORTED_MSG, **kwargs
            )
        if not kwargs.get("lines", False):
            return cls.single_worker_read(
                path_or_buf, reason="`lines` argument not supported", **kwargs
            )

        # Both opens below must agree on how the file is decompressed, so the
        # same `compression` / `storage_options` values are used for each.
        compression = kwargs.get("compression", "infer")
        storage_options = kwargs.get("storage_options", None) or {}

        # Parse only the first line to learn the column labels.
        with OpenFile(path_or_buf, "rb", compression, **storage_options) as f:
            columns = pandas.read_json(BytesIO(b"" + f.readline()), lines=True).columns
        kwargs["columns"] = columns
        empty_pd_df = pandas.DataFrame(columns=columns)

        with OpenFile(path_or_buf, "rb", compression, **storage_options) as f:
            column_widths, num_splits = cls._define_metadata(empty_pd_df, columns)
            args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs}
            splits, _ = cls.partitioned_file(
                f,
                num_partitions=NPartitions.get(),
            )
            partition_ids = [None] * len(splits)
            index_ids = [None] * len(splits)
            dtypes_ids = [None] * len(splits)
            for idx, (start, end) in enumerate(splits):
                args.update({"start": start, "end": end})
                # Each task returns `num_splits` data references followed by the
                # index, the dtypes and one extra value that is discarded here.
                *partition_ids[idx], index_ids[idx], dtypes_ids[idx], _ = cls.deploy(
                    func=cls.parse,
                    f_kwargs=args,
                    num_returns=num_splits + 3,
                )
        # The discarded last return value holds the columns for each partition
        # (per the original note), which will be useful for implementing the
        # `lines=False` case.
        row_lengths = cls.materialize(index_ids)
        new_index = pandas.RangeIndex(sum(row_lengths))

        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)

        # Compute dtypes by collecting and combining all of the partitions. The
        # reported dtypes from differing rows can be different based on the inference in
        # the limited data seen by each worker. We use pandas to compute the exact dtype
        # over the whole column for each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids, columns)

        new_frame = cls.frame_cls(
            np.array(partition_ids),
            new_index,
            columns,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module houses `TextFileDispatcher` class. `TextFileDispatcher` contains utils for text formats files, inherits util functions for files from `FileDispatcher` class and can be used as base class for dipatchers of SQL queries. """ import codecs import io import os import warnings from csv import QUOTE_NONE from typing import Callable, Optional, Sequence, Tuple, Union import numpy as np import pandas import pandas._libs.lib as lib from pandas.core.dtypes.common import is_list_like from pandas.io.common import stringify_path from modin.config import MinColumnPartitionSize, NPartitions from modin.core.io.file_dispatcher import FileDispatcher, OpenFile from modin.core.io.text.utils import CustomNewlineIterator from modin.core.storage_formats.pandas.utils import compute_chunksize from modin.utils import _inherit_docstrings ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex]] IndexColType = Union[int, str, bool, Sequence[int], Sequence[str], None] class TextFileDispatcher(FileDispatcher): """Class handles utils for reading text formats files.""" @classmethod def get_path_or_buffer(cls, filepath_or_buffer): """ Extract path from `filepath_or_buffer`. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of `read_csv` function. Returns ------- str or path object verified `filepath_or_buffer` parameter. Notes ----- Given a buffer, try and extract the filepath from it so that we can use it without having to fall back to pandas and share file objects between workers. Given a filepath, return it immediately. 
""" if ( hasattr(filepath_or_buffer, "name") and hasattr(filepath_or_buffer, "seekable") and filepath_or_buffer.seekable() and filepath_or_buffer.tell() == 0 ): buffer_filepath = filepath_or_buffer.name if cls.file_exists(buffer_filepath): warnings.warn( "For performance reasons, the filepath will be " + "used in place of the file handle passed in " + "to load the data" ) return cls.get_path(buffer_filepath) return filepath_or_buffer @classmethod def build_partition(cls, partition_ids, row_lengths, column_widths): """ Build array with partitions of `cls.frame_partition_cls` class. Parameters ---------- partition_ids : list Array with references to the partitions data. row_lengths : list Partitions rows lengths. column_widths : list Number of columns in each partition. Returns ------- np.ndarray array with shape equals to the shape of `partition_ids` and filed with partitions objects. """ return np.array( [ [ cls.frame_partition_cls( partition_ids[i][j], length=row_lengths[i], width=column_widths[j], ) for j in range(len(partition_ids[i])) ] for i in range(len(partition_ids)) ] ) @classmethod def pathlib_or_pypath(cls, filepath_or_buffer): """ Check if `filepath_or_buffer` is instance of `py.path.local` or `pathlib.Path`. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of `read_csv` function. Returns ------- bool Whether or not `filepath_or_buffer` is instance of `py.path.local` or `pathlib.Path`. """ try: import py if isinstance(filepath_or_buffer, py.path.local): return True except ImportError: # pragma: no cover pass try: import pathlib if isinstance(filepath_or_buffer, pathlib.Path): return True except ImportError: # pragma: no cover pass return False @classmethod def offset( cls, f, offset_size: int, quotechar: bytes = b'"', is_quoting: bool = True, encoding: str = None, newline: bytes = None, ): """ Move the file offset at the specified amount of bytes. 
Parameters ---------- f : file-like object File handle that should be used for offset movement. offset_size : int Number of bytes to read and ignore. quotechar : bytes, default: b'"' Indicate quote in a file. is_quoting : bool, default: True Whether or not to consider quotes. encoding : str, optional Encoding of `f`. newline : bytes, optional Byte or sequence of bytes indicating line endings. Returns ------- bool If file pointer reached the end of the file, but did not find closing quote returns `False`. `True` in any other case. """ if is_quoting: chunk = f.read(offset_size) outside_quotes = not chunk.count(quotechar) % 2 else: f.seek(offset_size, os.SEEK_CUR) outside_quotes = True # after we read `offset_size` bytes, we most likely break the line but # the modin implementation doesn't work correctly in the case, so we must # make sure that the line is read completely to the lineterminator, # which is what the `_read_rows` does outside_quotes, _ = cls._read_rows( f, nrows=1, quotechar=quotechar, is_quoting=is_quoting, outside_quotes=outside_quotes, encoding=encoding, newline=newline, ) return outside_quotes @classmethod def partitioned_file( cls, f, num_partitions: int = None, nrows: int = None, skiprows: int = None, quotechar: bytes = b'"', is_quoting: bool = True, encoding: str = None, newline: bytes = None, header_size: int = 0, pre_reading: int = 0, get_metadata_kw: dict = None, ): """ Compute chunk sizes in bytes for every partition. Parameters ---------- f : file-like object File handle of file to be partitioned. num_partitions : int, optional For what number of partitions split a file. If not specified grabs the value from `modin.config.NPartitions.get()`. nrows : int, optional Number of rows of file to read. skiprows : int, optional Specifies rows to skip. quotechar : bytes, default: b'"' Indicate quote in a file. is_quoting : bool, default: True Whether or not to consider quotes. encoding : str, optional Encoding of `f`. 
newline : bytes, optional Byte or sequence of bytes indicating line endings. header_size : int, default: 0 Number of rows, that occupied by header. pre_reading : int, default: 0 Number of rows between header and skipped rows, that should be read. get_metadata_kw : dict, optional Keyword arguments for `cls.read_callback` to compute metadata if needed. This option is not compatible with `pre_reading!=0`. Returns ------- list List with the next elements: int : partition start read byte int : partition end read byte pandas.DataFrame or None Dataframe from which metadata can be retrieved. Can be None if `get_metadata_kw=None`. """ if get_metadata_kw is not None and pre_reading != 0: raise ValueError( f"Incompatible combination of parameters: {get_metadata_kw=}, {pre_reading=}" ) read_rows_counter = 0 outside_quotes = True if num_partitions is None: num_partitions = NPartitions.get() - 1 if pre_reading else NPartitions.get() rows_skipper = cls.rows_skipper_builder( f, quotechar, is_quoting=is_quoting, encoding=encoding, newline=newline ) result = [] file_size = cls.file_size(f) pd_df_metadata = None if pre_reading: rows_skipper(header_size) pre_reading_start = f.tell() outside_quotes, read_rows = cls._read_rows( f, nrows=pre_reading, quotechar=quotechar, is_quoting=is_quoting, outside_quotes=outside_quotes, encoding=encoding, newline=newline, ) read_rows_counter += read_rows result.append((pre_reading_start, f.tell())) # add outside_quotes if is_quoting and not outside_quotes: warnings.warn("File has mismatched quotes") rows_skipper(skiprows) else: rows_skipper(skiprows) if get_metadata_kw: start = f.tell() # For correct behavior, if we want to avoid double skipping rows, # we need to get metadata after skipping. 
@classmethod
def _read_rows(
    cls,
    f,
    nrows: int,
    quotechar: bytes = b'"',
    is_quoting: bool = True,
    outside_quotes: bool = True,
    encoding: str = None,
    newline: bytes = None,
):
    """
    Move the file offset at the specified amount of rows.

    Parameters
    ----------
    f : file-like object
        File handle that should be used for offset movement.
    nrows : int
        Number of rows to read. Non-positive values read nothing; `None`
        means no limit, i.e. rows are consumed until the end of the file.
    quotechar : bytes, default: b'"'
        Indicate quote in a file.
    is_quoting : bool, default: True
        Whether or not to consider quotes.
    outside_quotes : bool, default: True
        Whether the file pointer is within quotes or not at the time this function is called.
    encoding : str, optional
        Encoding of `f`.
    newline : bytes, optional
        Byte or sequence of bytes indicating line endings.

    Returns
    -------
    bool
        If file pointer reached the end of the file, but did not find closing quote
        returns `False`. `True` in any other case.
    int
        Number of rows that were read.
    """
    if nrows is not None and nrows <= 0:
        return True, 0

    rows_read = 0

    # For these encodings the lines are produced by `CustomNewlineIterator`
    # (driven by the explicit `newline` bytes) instead of by iterating `f`.
    if encoding and (
        "utf" in encoding
        and "8" not in encoding
        or encoding == "unicode_escape"
        or encoding.replace("-", "_") == "utf_8_sig"
    ):
        iterator = CustomNewlineIterator(f, newline)
    else:
        iterator = f

    for line in iterator:
        # An odd number of quote characters on a line flips the quoted state.
        if is_quoting and line.count(quotechar) % 2:
            outside_quotes = not outside_quotes
        # A row is complete only when its terminator is outside of quotes.
        if outside_quotes:
            rows_read += 1
            # `nrows is None` passes the guard at the top of the function, so
            # it must not be compared with an integer here (TypeError).
            if nrows is not None and rows_read >= nrows:
                break

    if isinstance(iterator, CustomNewlineIterator):
        iterator.seek()

    # case when EOF
    if not outside_quotes:
        rows_read += 1

    return outside_quotes, rows_read
Parameters ---------- f : file-like object File handle that should be used for offset movement. quotechar : bytes Indicate quote in a file. is_quoting : bool Whether or not to consider quotes. encoding : str, optional Encoding of `f`. newline : bytes, optional Byte or sequence of bytes indicating line endings. Returns ------- object skipper object. """ def skipper(n): if n == 0 or n is None: return 0 else: return cls._read_rows( f, quotechar=quotechar, is_quoting=is_quoting, nrows=n, encoding=encoding, newline=newline, )[1] return skipper @classmethod def _define_header_size( cls, header: Union[int, Sequence[int], str, None] = "infer", names: Optional[Sequence] = lib.no_default, ) -> int: """ Define the number of rows that are used by header. Parameters ---------- header : int, list of int or str, default: "infer" Original `header` parameter of `read_csv` function. names : array-like, optional Original names parameter of `read_csv` function. Returns ------- header_size : int The number of rows that are used by header. """ header_size = 0 if header == "infer" and names in [lib.no_default, None]: header_size += 1 elif isinstance(header, int): header_size += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): header_size += max(header) + 1 return header_size @classmethod def _define_metadata( cls, df: pandas.DataFrame, column_names: ColumnNamesTypes, ) -> Tuple[list, int]: """ Define partitioning metadata. Parameters ---------- df : pandas.DataFrame The DataFrame to split. column_names : ColumnNamesTypes Column names of df. Returns ------- column_widths : list Column width to use during new frame creation (number of columns for each partition). num_splits : int The maximum number of splits to separate the DataFrame into. 
""" # This is the number of splits for the columns num_splits = min(len(column_names) or 1, NPartitions.get()) min_block_size = MinColumnPartitionSize.get() column_chunksize = compute_chunksize(df.shape[1], num_splits, min_block_size) if column_chunksize > len(column_names): column_widths = [len(column_names)] # This prevents us from unnecessarily serializing a bunch of empty # objects. num_splits = 1 else: # split columns into chunks with maximal size column_chunksize, for example # if num_splits == 4, len(column_names) == 80 and column_chunksize == 32, # column_widths will be [32, 32, 16, 0] column_widths = [ ( column_chunksize if len(column_names) > (column_chunksize * (i + 1)) else ( 0 if len(column_names) < (column_chunksize * i) else len(column_names) - (column_chunksize * i) ) ) for i in range(num_splits) ] return column_widths, num_splits _parse_func = None @classmethod def preprocess_func(cls): # noqa: RT01 """Prepare a function for transmission to remote workers.""" if cls._parse_func is None: cls._parse_func = cls.put(cls.parse) return cls._parse_func @classmethod def _launch_tasks( cls, splits: list, *partition_args, **partition_kwargs ) -> Tuple[list, list, list]: """ Launch tasks to read partitions. Parameters ---------- splits : list List of tuples with partitions data, which defines parser task (start/end read bytes and etc.). *partition_args : tuple Positional arguments to be passed to the parser function. **partition_kwargs : dict `kwargs` that should be passed to the parser function. Returns ------- partition_ids : list array with references to the partitions data. index_ids : list array with references to the partitions index objects. dtypes_ids : list array with references to the partitions dtypes objects. 
""" partition_ids = [None] * len(splits) index_ids = [None] * len(splits) dtypes_ids = [None] * len(splits) # this is done mostly for performance; see PR#5678 for details func = cls.preprocess_func() for idx, (start, end) in enumerate(splits): partition_kwargs.update({"start": start, "end": end}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx] = cls.deploy( func=func, f_args=partition_args, f_kwargs=partition_kwargs, num_returns=partition_kwargs.get("num_splits") + 2, ) return partition_ids, index_ids, dtypes_ids @classmethod def check_parameters_support( cls, filepath_or_buffer, read_kwargs: dict, skiprows_md: Union[Sequence, callable, int], header_size: int, ) -> Tuple[bool, Optional[str]]: """ Check support of only general parameters of `read_*` function. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of `read_*` function. read_kwargs : dict Parameters of `read_*` function. skiprows_md : int, array or callable `skiprows` parameter modified for easier handling by Modin. header_size : int Number of rows that are used by header. Returns ------- bool Whether passed parameters are supported or not. Optional[str] `None` if parameters are supported, otherwise an error message describing why parameters are not supported. 
@classmethod
@_inherit_docstrings(pandas.io.parsers.base_parser.ParserBase._validate_usecols_arg)
def _validate_usecols_arg(cls, usecols):
    # No docstring on purpose: it is supplied by `_inherit_docstrings`.
    # `None` and callables are passed through untouched, with no dtype.
    if usecols is None or callable(usecols):
        return usecols, None

    error_text = (
        "'usecols' must either be list-like of all strings, all unicode, "
        + "all integers or a callable."
    )
    if not is_list_like(usecols):
        raise ValueError(error_text)

    inferred = lib.infer_dtype(usecols, skipna=False)
    if inferred not in ("empty", "integer", "string"):
        raise ValueError(error_text)

    # Valid list-likes are returned as a set together with their dtype name.
    return set(usecols), inferred
Parameters ---------- skiprows : int, array or callable, optional Original `skiprows` parameter of any pandas read function. header_size : int, default: 0 Number of rows that are used by header. Returns ------- skiprows_md : int, array or callable Updated skiprows parameter. If `skiprows` is an array, this array will be sorted. Also parameter will be aligned to actual data in the `query_compiler` (which, for example, doesn't contain header rows) pre_reading : int The number of rows that should be read before data file splitting for further reading (the number of rows for the first partition). skiprows_partitioning : int The number of rows that should be skipped virtually (skipped during data file partitioning). Examples -------- Let's consider case when `header`="infer" and `skiprows`=[3,4,5]. In this specific case fastpath can be done since `skiprows` is uniformly distributed array, so we can "squash" it to integer and set `skiprows_partitioning`=3. But if no additional action will be done, these three rows will be skipped right after header line, that corresponds to `skiprows`=[1,2,3]. Now, to avoid this discrepancy, we need to assign the first partition to read data between header line and the first row for skipping by setting of `pre_reading` parameter, so setting `pre_reading`=2. During data file partitiong, these lines will be assigned for reading for the first partition, and then file position will be set at the beginning of rows that should be skipped by `skiprows_partitioning`. 
After skipping of these rows, the rest data will be divided between the rest of partitions, see rows assignement below: 0 - header line (skip during partitioning) 1 - pre_reading (assign to read by the first partition) 2 - pre_reading (assign to read by the first partition) 3 - skiprows_partitioning (skip during partitioning) 4 - skiprows_partitioning (skip during partitioning) 5 - skiprows_partitioning (skip during partitioning) 6 - data to partition (divide between the rest of partitions) 7 - data to partition (divide between the rest of partitions) """ pre_reading = skiprows_partitioning = skiprows_md = 0 if isinstance(skiprows, int): skiprows_partitioning = skiprows elif is_list_like(skiprows) and len(skiprows) > 0: skiprows_md = np.sort(skiprows) if np.all(np.diff(skiprows_md) == 1): # `skiprows` is uniformly distributed array. pre_reading = ( skiprows_md[0] - header_size if skiprows_md[0] > header_size else 0 ) skiprows_partitioning = len(skiprows_md) skiprows_md = 0 elif skiprows_md[0] > header_size: skiprows_md = skiprows_md - header_size elif callable(skiprows): def skiprows_func(x): return skiprows(x + header_size) skiprows_md = skiprows_func return skiprows_md, pre_reading, skiprows_partitioning @classmethod def _define_index( cls, index_ids: list, index_name: str, ) -> Tuple[IndexColType, list]: """ Compute the resulting DataFrame index and index lengths for each of partitions. Parameters ---------- index_ids : list Array with references to the partitions index objects. index_name : str Name that should be assigned to the index if `index_col` is not provided. Returns ------- new_index : IndexColType Index that should be passed to the new_frame constructor. row_lengths : list Partitions rows lengths. 
""" index_objs = cls.materialize(index_ids) if len(index_objs) == 0 or isinstance(index_objs[0], int): row_lengths = index_objs new_index = pandas.RangeIndex(sum(index_objs)) else: row_lengths = [len(o) for o in index_objs] new_index = index_objs[0].append(index_objs[1:]) new_index.name = index_name return new_index, row_lengths @classmethod def _get_new_qc( cls, partition_ids: list, index_ids: list, dtypes_ids: list, index_col: IndexColType, index_name: str, column_widths: list, column_names: ColumnNamesTypes, skiprows_md: Union[Sequence, callable, None] = None, header_size: int = None, **kwargs, ): """ Get new query compiler from data received from workers. Parameters ---------- partition_ids : list Array with references to the partitions data. index_ids : list Array with references to the partitions index objects. dtypes_ids : list Array with references to the partitions dtypes objects. index_col : IndexColType `index_col` parameter of `read_csv` function. index_name : str Name that should be assigned to the index if `index_col` is not provided. column_widths : list Number of columns in each partition. column_names : ColumnNamesTypes Array with columns names. skiprows_md : array-like or callable, optional Specifies rows to skip. header_size : int, default: 0 Number of rows, that occupied by header. **kwargs : dict Parameters of `read_csv` function needed for postprocessing. Returns ------- new_query_compiler : BaseQueryCompiler New query compiler, created from `new_frame`. 
""" partition_ids = cls.build_partition( partition_ids, [None] * len(index_ids), column_widths ) new_frame = cls.frame_cls( partition_ids, lambda: cls._define_index(index_ids, index_name), column_names, None, column_widths, dtypes=lambda: cls.get_dtypes(dtypes_ids, column_names), ) new_query_compiler = cls.query_compiler_cls(new_frame) skipfooter = kwargs.get("skipfooter", None) if skipfooter: new_query_compiler = new_query_compiler.drop( new_query_compiler.index[-skipfooter:] ) if skiprows_md is not None: # skip rows that passed as array or callable nrows = kwargs.get("nrows", None) index_range = pandas.RangeIndex(len(new_query_compiler.index)) if is_list_like(skiprows_md): new_query_compiler = new_query_compiler.take_2d_positional( index=index_range.delete(skiprows_md) ) elif callable(skiprows_md): skip_mask = cls._get_skip_mask(index_range, skiprows_md) if not isinstance(skip_mask, np.ndarray): skip_mask = skip_mask.to_numpy("bool") view_idx = index_range[~skip_mask] new_query_compiler = new_query_compiler.take_2d_positional( index=view_idx ) else: raise TypeError( f"Not acceptable type of `skiprows` parameter: {type(skiprows_md)}" ) if not isinstance(new_query_compiler.index, pandas.MultiIndex): new_query_compiler = new_query_compiler.reset_index(drop=True) if nrows: new_query_compiler = new_query_compiler.take_2d_positional( pandas.RangeIndex(len(new_query_compiler.index))[:nrows] ) if index_col is None or index_col is False: new_query_compiler._modin_frame.synchronize_labels(axis=0) return new_query_compiler @classmethod def _read(cls, filepath_or_buffer, **kwargs): """ Read data from `filepath_or_buffer` according to `kwargs` parameters. Used in `read_csv` and `read_fwf` Modin implementations. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of read functions. **kwargs : dict Parameters of read functions. 
Returns ------- new_query_compiler : BaseQueryCompiler Query compiler with imported data for further processing. """ filepath_or_buffer = stringify_path(filepath_or_buffer) filepath_or_buffer_md = ( cls.get_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) else cls.get_path_or_buffer(filepath_or_buffer) ) compression_infered = cls.infer_compression( filepath_or_buffer, kwargs["compression"] ) # Getting frequently used kwargs; # They should be defined in higher level names = kwargs["names"] index_col = kwargs["index_col"] encoding = kwargs["encoding"] skiprows = kwargs["skiprows"] header = kwargs["header"] # Define header size for further skipping (Header can be skipped because header # information will be obtained further from empty_df, so no need to handle it # by workers) header_size = cls._define_header_size( header, names, ) ( skiprows_md, pre_reading, skiprows_partitioning, ) = cls._manage_skiprows_parameter(skiprows, header_size) should_handle_skiprows = skiprows_md is not None and not isinstance( skiprows_md, int ) (use_modin_impl, fallback_reason) = cls.check_parameters_support( filepath_or_buffer_md, kwargs, skiprows_md, header_size, ) if not use_modin_impl: return cls.single_worker_read( filepath_or_buffer, kwargs, reason=fallback_reason, ) is_quoting = kwargs["quoting"] != QUOTE_NONE usecols = kwargs["usecols"] use_inferred_column_names = cls._uses_inferred_column_names( names, skiprows, kwargs["skipfooter"], usecols ) # Computing metadata simultaneously with skipping rows allows us to not # do extra work and improve performance for certain cases, as otherwise, # it would require double re-reading of skipped rows in order to retrieve metadata. 
can_compute_metadata_while_skipping_rows = ( # basic supported case: isinstance(skiprows, int) without any additional params isinstance(skiprows, int) and (usecols is None or skiprows is None) and pre_reading == 0 ) get_metadata_kw = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col) if get_metadata_kw.get("engine", None) == "pyarrow": # pyarrow engine doesn't support `nrows` option; # https://github.com/pandas-dev/pandas/issues/38872 can be used to track pyarrow engine features get_metadata_kw["engine"] = "c" if not can_compute_metadata_while_skipping_rows: pd_df_metadata = cls.read_callback( filepath_or_buffer_md, **get_metadata_kw, ) column_names = pd_df_metadata.columns column_widths, num_splits = cls._define_metadata( pd_df_metadata, column_names ) get_metadata_kw = None else: get_metadata_kw = dict(get_metadata_kw, skiprows=None) # `memory_map` doesn't work with file-like object so we can't use it here. # We can definitely skip it without violating the reading logic # since this parameter is intended to optimize reading. # For reading a couple of lines, this is not essential. get_metadata_kw.pop("memory_map", None) # These parameters are already used when opening file `f`, # they do not need to be used again. 
get_metadata_kw.pop("storage_options", None) get_metadata_kw.pop("compression", None) with OpenFile( filepath_or_buffer_md, "rb", compression_infered, **(kwargs.get("storage_options", None) or {}), ) as f: old_pos = f.tell() fio = io.TextIOWrapper(f, encoding=encoding, newline="") newline, quotechar = cls.compute_newline( fio, encoding, kwargs.get("quotechar", '"') ) f.seek(old_pos) splits, pd_df_metadata_temp = cls.partitioned_file( f, num_partitions=NPartitions.get(), nrows=kwargs["nrows"] if not should_handle_skiprows else None, skiprows=skiprows_partitioning, quotechar=quotechar, is_quoting=is_quoting, encoding=encoding, newline=newline, header_size=header_size, pre_reading=pre_reading, get_metadata_kw=get_metadata_kw, ) if can_compute_metadata_while_skipping_rows: pd_df_metadata = pd_df_metadata_temp # compute dtypes if possible common_dtypes = None if kwargs["dtype"] is None: most_common_dtype = (object,) common_dtypes = {} for col, dtype in pd_df_metadata.dtypes.to_dict().items(): if dtype in most_common_dtype: common_dtypes[col] = dtype column_names = pd_df_metadata.columns column_widths, num_splits = cls._define_metadata(pd_df_metadata, column_names) # kwargs that will be passed to the workers partition_kwargs = dict( kwargs, header_size=0 if use_inferred_column_names else header_size, names=column_names if use_inferred_column_names else names, header="infer" if use_inferred_column_names else header, skipfooter=0, skiprows=None, nrows=None, compression=compression_infered, common_dtypes=common_dtypes, ) # this is done mostly for performance; see PR#5678 for details filepath_or_buffer_md_ref = cls.put(filepath_or_buffer_md) kwargs_ref = cls.put(partition_kwargs) partition_ids, index_ids, dtypes_ids = cls._launch_tasks( splits, filepath_or_buffer_md_ref, kwargs_ref, num_splits=num_splits, ) new_query_compiler = cls._get_new_qc( partition_ids=partition_ids, index_ids=index_ids, dtypes_ids=dtypes_ids, index_col=index_col, index_name=pd_df_metadata.index.name, 
column_widths=column_widths, column_names=column_names, skiprows_md=skiprows_md if should_handle_skiprows else None, header_size=header_size, skipfooter=kwargs["skipfooter"], parse_dates=kwargs["parse_dates"], nrows=kwargs["nrows"] if should_handle_skiprows else None, ) return new_query_compiler @classmethod def _get_skip_mask(cls, rows_index: pandas.Index, skiprows: Callable): """ Get mask of skipped by callable `skiprows` rows. Parameters ---------- rows_index : pandas.Index Rows index to get mask for. skiprows : Callable Callable to check whether row index should be skipped. Returns ------- pandas.Index """ try: # direct `skiprows` call is more efficient than using of # map method, but in some cases it can work incorrectly, e.g. # when `skiprows` contains `in` operator mask = skiprows(rows_index) assert is_list_like(mask) except (ValueError, TypeError, AssertionError): # ValueError can be raised if `skiprows` callable contains membership operator # TypeError is raised if `skiprows` callable contains bitwise operator # AssertionError is raised if unexpected behavior was detected mask = rows_index.map(skiprows) return mask @staticmethod def _uses_inferred_column_names(names, skiprows, skipfooter, usecols): """ Tell whether need to use inferred column names in workers or not. 1) ``False`` is returned in 2 cases and means next: 1.a) `names` parameter was provided from the API layer. In this case parameter `names` must be provided as `names` parameter for ``read_csv`` in the workers. 1.b) `names` parameter wasn't provided from the API layer. In this case column names inference must happen in each partition. 2) ``True`` is returned in case when inferred column names from pre-reading stage must be provided as `names` parameter for ``read_csv`` in the workers. In case `names` was provided, the other parameters aren't checked. 
Otherwise, inferred column names should be used in a case of not full data reading which is defined by `skipfooter` parameter, when need to skip lines at the bottom of file or by `skiprows` parameter, when need to skip lines at the top of file (but if `usecols` was provided, column names inference must happen in the workers). Parameters ---------- names : array-like List of column names to use. skiprows : list-like, int or callable Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file. If callable, the callable function will be evaluated against the row indices, returning ``True`` if the row should be skipped and ``False`` otherwise. skipfooter : int Number of lines at bottom of the file to skip. usecols : list-like or callable Subset of the columns. Returns ------- bool Whether to use inferred column names in ``read_csv`` of the workers or not. """ if names not in [None, lib.no_default]: return False if skipfooter != 0: return True if isinstance(skiprows, int) and skiprows == 0: return False if is_list_like(skiprows): return usecols is None return skiprows is not None ================================================ FILE: modin/core/io/text/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
class CustomNewlineIterator:
    r"""
    Used to iterate through files in binary mode line by line where newline != b'\n'.

    Parameters
    ----------
    _file : file-like object
        File-like object to iterate over.
    newline : bytes
        Byte or sequence of bytes indicating line endings.
    """

    def __init__(self, _file, newline):
        self.file = _file
        self.newline = newline
        # `bytes_read` is the number of bytes of the current chunk that were
        # already returned as lines; `chunk_size` is the size of that chunk.
        self.bytes_read = self.chunk_size = 0

    def __iter__(self):
        """
        Iterate over lines.

        The trailing data of the file is yielded as the last line even if
        it is not terminated by `newline`.

        Yields
        ------
        bytes
            Data from file, without the newline bytes.
        """
        buffer_size = io.DEFAULT_BUFFER_SIZE
        chunk = self.file.read(buffer_size)
        self.chunk_size = 0
        while chunk:
            self.bytes_read = 0
            self.chunk_size = len(chunk)
            # split remove newline bytes from line
            lines = chunk.split(self.newline)
            for line in lines[:-1]:
                self.bytes_read += len(line) + len(self.newline)
                yield line
            # the last piece is either empty or an incomplete line
            remainder = lines[-1]
            chunk = self.file.read(buffer_size)
            if not chunk:
                # End of file: re-queueing the remainder here would never
                # terminate, so emit it as the final line and stop.
                if remainder:
                    self.bytes_read += len(remainder)
                    yield remainder
                return
            if remainder:
                # the incomplete line is continued by the next chunk
                chunk = remainder + chunk

    def seek(self):
        """Change the stream position to where the last returned line ends."""
        # the file is positioned at the end of the current chunk, so step
        # back over the part of it that was not returned yet
        self.file.seek(self.bytes_read - self.chunk_size, 1)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Modin's functionality related to storage formats supported.""" from .base import BaseQueryCompiler from .pandas import PandasQueryCompiler __all__ = ["BaseQueryCompiler", "PandasQueryCompiler"] ================================================ FILE: modin/core/storage_formats/base/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module represents the base query compiler that defines the common query compiler API.""" from .query_compiler import BaseQueryCompiler __all__ = ["BaseQueryCompiler"] ================================================ FILE: modin/core/storage_formats/base/doc_utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
def add_deprecation_warning(replacement_method):
    """
    Create a decorator that adds a deprecation warning to a docstring.

    The warning tells that the decorated method duplicates the logic of
    another method and is going to be removed.

    Parameters
    ----------
    replacement_method : str
        Name of the method that should be used instead of the deprecated one.

    Returns
    -------
    callable
    """
    return append_to_docstring(_deprecation_warning.format(replacement_method))
def add_refer_to(method):
    """
    Create a decorator that adds a reference to the high-level method to a docstring.

    Parameters
    ----------
    method : str
        Name of the method in ``modin.pandas`` module the note points to.

    Returns
    -------
    callable
    """
    # FIXME: this would break numpydoc if there already is a `Notes` section
    return append_to_docstring(_refer_to_note.format(method))
op_type : {"arithmetic", "logical", "comparison"}, default: "arithmetic" Type of the binary operation. Returns ------- callable """ template = """ Perform element-wise {operation} (``{verbose}``). If axes are not equal, perform frames alignment first. Parameters ---------- other : BaseQueryCompiler, scalar or array-like Other operand of the binary operation. broadcast : bool, default: False If `other` is a one-column query compiler, indicates whether it is a Series or not. Frames and Series have to be processed differently, however we can't distinguish them at the query compiler level, so this parameter is a hint that is passed from a high-level API. {extra_params}**kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler Result of binary operation. """ extra_params = { "logical": """ level : int or label In case of MultiIndex match index values on the passed level. axis : {{0, 1}} Axis to match indices along for 1D `other` (list or QueryCompiler that represents Series). 0 is for index, when 1 is for columns. """, "arithmetic": """ level : int or label In case of MultiIndex match index values on the passed level. axis : {{0, 1}} Axis to match indices along for 1D `other` (list or QueryCompiler that represents Series). 0 is for index, when 1 is for columns. fill_value : float or None Value to fill missing elements during frame alignment. """, "series_comparison": """ level : int or label In case of MultiIndex match index values on the passed level. fill_value : float or None Value to fill missing elements during frame alignment. axis : {{0, 1}} Unused. Parameter needed for compatibility with DataFrame. 
""", } verbose_substitution = ( f"other {sign} self" if self_on_right else f"self {sign} other" ) params_substitution = extra_params.get(op_type, "") return doc_qc_method( template, extra_params=params_substitution, operation=operation, verbose=verbose_substitution, ) def doc_reduce_agg(method, refer_to, params=None, extra_params=None): """ Build decorator which adds docstring for the reduce method. Parameters ---------- method : str The result of the method. refer_to : str Method name in ``modin.pandas.DataFrame`` module to refer to for more information about parameters and output format. params : str, optional Method parameters in the NumPy docstyle format to substitute to the docstring template. extra_params : sequence of str, optional Method parameter names to append to the docstring template. Parameter type and description will be grabbed from ``extra_params_map`` (Please refer to the source code of this function to explore the map). Returns ------- callable """ template = """ Get the {method} for each column or row. {params} Returns ------- BaseQueryCompiler One-column QueryCompiler with index labels of the specified axis, where each row contains the {method} for the corresponding row or column. """ if params is None: params = """ axis : {{0, 1}} numeric_only : bool, optional""" extra_params_map = { "skipna": """ skipna : bool, default: True""", "min_count": """ min_count : int""", "ddof": """ ddof : int""", "*args": """ *args : iterable Serves the compatibility purpose. Does not affect the result.""", "**kwargs": """ **kwargs : dict Serves the compatibility purpose. Does not affect the result.""", } params += "".join( [ align_indents( source=params, target=extra_params_map.get(param, f"\n{param} : object") ) for param in (extra_params or []) ] ) return doc_qc_method( template, params=params, method=method, refer_to=f"DataFrame.{refer_to}", ) doc_cum_agg = partial( doc_qc_method, template=""" Get cumulative {method} for every row or column. 
Parameters ---------- fold_axis : {{0, 1}} skipna : bool **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler QueryCompiler of the same shape as `self`, where each element is the {method} of all the previous values in this row or column. """, refer_to_module_name="DataFrame", ) doc_resample = partial( doc_qc_method, template=""" Resample time-series data and apply aggregation on it. Group data into intervals by time-series row/column with a specified frequency and {action}. Parameters ---------- resample_kwargs : dict Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature. {extra_params} Returns ------- BaseQueryCompiler New QueryCompiler containing the result of resample aggregation built by the following rules: {build_rules} """, refer_to_module_name="resample.Resampler", ) def doc_resample_reduce(result, refer_to, params=None, compatibility_params=True): """ Build decorator which adds docstring for the resample reduce method. Parameters ---------- result : str The result of the method. refer_to : str Method name in ``modin.pandas.resample.Resampler`` module to refer to for more information about parameters and output format. params : str, optional Method parameters in the NumPy docstyle format to substitute to the docstring template. compatibility_params : bool, default: True Whether method takes `*args` and `**kwargs` that do not affect the result. Returns ------- callable """ action = f"compute {result} for each group" params_substitution = ( ( """ *args : iterable Serves the compatibility purpose. Does not affect the result. **kwargs : dict Serves the compatibility purpose. Does not affect the result. 
""" ) if compatibility_params else "" ) if params: params_substitution = format_string( "{params}\n{params_substitution}", params=params, params_substitution=params_substitution, ) build_rules = f""" - Labels on the specified axis are the group names (time-stamps) - Labels on the opposite of specified axis are preserved. - Each element of QueryCompiler is the {result} for the corresponding group and column/row.""" return doc_resample( action=action, extra_params=params_substitution, build_rules=build_rules, refer_to=refer_to, ) def doc_resample_agg(action, output, refer_to, params=None): """ Build decorator which adds docstring for the resample aggregation method. Parameters ---------- action : str What method does with the resampled data. output : str What is the content of column names in the result. refer_to : str Method name in ``modin.pandas.resample.Resampler`` module to refer to for more information about parameters and output format. params : str, optional Method parameters in the NumPy docstyle format to substitute to the docstring template. Returns ------- callable """ action = f"{action} for each group over the specified axis" params_substitution = """ *args : iterable Positional arguments to pass to the aggregation function. **kwargs : dict Keyword arguments to pass to the aggregation function. """ if params: params_substitution = format_string( "{params}\n{params_substitution}", params=params, params_substitution=params_substitution, ) build_rules = f""" - Labels on the specified axis are the group names (time-stamps) - Labels on the opposite of specified axis are a MultiIndex, where first level contains preserved labels of this axis and the second level is the {output}. 
def doc_resample_fillna(method, refer_to, params=None, overwrite_template_params=False):
    """
    Build decorator which adds docstring for the resample fillna query compiler method.

    Parameters
    ----------
    method : str
        Fillna method name.
    refer_to : str
        Method name in ``modin.pandas.resample.Resampler`` module to refer to for
        more information about parameters and output format.
    params : str, optional
        Method parameters in the NumPy docstyle format to substitute
        to the docstring template.
    overwrite_template_params : bool, default: False
        If `params` is specified indicates whether to overwrite method parameters in
        the docstring template or append them at the end.

    Returns
    -------
    callable
    """
    action = f"fill missing values in each group independently using {method} method"
    template_params = "limit : int\n"

    if not params:
        # Nothing extra was supplied: keep the template's own parameter list.
        extra_params = template_params
    elif overwrite_template_params:
        # The caller's parameters fully replace the template ones.
        extra_params = params
    else:
        # The caller's parameters go first, followed by the template ones.
        extra_params = format_string(
            "{params}\n{params_substitution}",
            params=params,
            params_substitution=template_params,
        )

    return doc_resample(
        action=action,
        extra_params=extra_params,
        build_rules="- QueryCompiler contains unsampled data with missing values filled.",
        refer_to=refer_to,
    )
""", one_column_method=True, refer_to_module_name="Series.dt", ) doc_dt_timestamp = partial(doc_dt, dt_type="datetime") doc_dt_interval = partial(doc_dt, dt_type="interval") doc_dt_period = partial(doc_dt, dt_type="period") doc_dt_round = partial( doc_qc_method, template=""" Perform {refer_to} operation on the underlying time-series data to the specified `freq`. Parameters ---------- freq : str ambiguous : {{"raise", "infer", "NaT"}} or bool mask, default: "raise" nonexistent : {{"raise", "shift_forward", "shift_backward", "NaT"}} or timedelta, default: "raise" Returns ------- BaseQueryCompiler New QueryCompiler with performed {refer_to} operation on every element. """, one_column_method=True, refer_to_module_name="Series.dt", ) doc_str_method = partial( doc_qc_method, template=""" Apply "{refer_to}" function to each string value in QueryCompiler. {params} Returns ------- BaseQueryCompiler New QueryCompiler containing the result of execution of the "{refer_to}" function against each string element. """, one_column_method=True, refer_to_module_name="Series.str", ) def doc_window_method( window_cls_name, result, refer_to, action=None, win_type="rolling window", params=None, build_rules="aggregation", ): """ Build decorator which adds docstring for a window method. Parameters ---------- window_cls_name : str The Window class the method is on. result : str The result of the method. refer_to : str Method name in ``modin.pandas.window.Window`` module to refer to for more information about parameters and output format. action : str, optional What method does with the created window. win_type : str, default: "rolling_window" Type of window that the method creates. params : str, optional Method parameters in the NumPy docstyle format to substitute to the docstring template. build_rules : str, default: "aggregation" Description of the data output format. Returns ------- callable """ template = """ Create {win_type} and {action} for each window over the given axis. 
Parameters ---------- fold_axis : {{0, 1}} {window_args_name} : list Rolling windows arguments with the same signature as ``modin.pandas.DataFrame.rolling``. {extra_params} Returns ------- BaseQueryCompiler New QueryCompiler containing {result} for each window, built by the following rules: {build_rules} """ doc_build_rules = { "aggregation": f""" - Output QueryCompiler has the same shape and axes labels as the source. - Each element is the {result} for the corresponding window.""", "udf_aggregation": """ - Labels on the specified axis are preserved. - Labels on the opposite of specified axis are MultiIndex, where first level contains preserved labels of this axis and the second level has the function names. - Each element of QueryCompiler is the result of corresponding function for the corresponding window and column/row.""", } if action is None: action = f"compute {result}" if win_type == "rolling window": window_args_name = "rolling_kwargs" elif win_type == "expanding window": window_args_name = "expanding_args" else: window_args_name = "window_kwargs" # We need that `params` value ended with new line to have # an empty line between "parameters" and "return" sections if params and params[-1] != "\n": params += "\n" if params is None: params = "" return doc_qc_method( template, result=result, action=action, win_type=win_type, extra_params=params, build_rules=doc_build_rules.get(build_rules, build_rules), refer_to=f"{window_cls_name}.{refer_to}", window_args_name=window_args_name, ) def doc_groupby_method(result, refer_to, action=None): """ Build decorator which adds docstring for the groupby reduce method. Parameters ---------- result : str The result of reduce. refer_to : str Method name in ``modin.pandas.groupby`` module to refer to for more information about parameters and output format. action : str, optional What method does with groups. Returns ------- callable """ template = """ Group QueryCompiler data and {action} for every group. 
Parameters ---------- by : BaseQueryCompiler, column or index label, Grouper or list of such Object that determine groups. axis : {{0, 1}} Axis to group and apply aggregation function along. 0 is for index, when 1 is for columns. groupby_kwargs : dict GroupBy parameters as expected by ``modin.pandas.DataFrame.groupby`` signature. agg_args : list-like Positional arguments to pass to the `agg_func`. agg_kwargs : dict Key arguments to pass to the `agg_func`. drop : bool, default: False If `by` is a QueryCompiler indicates whether or not by-data came from the `self`. Returns ------- BaseQueryCompiler QueryCompiler containing the result of groupby reduce built by the following rules: - Labels on the opposite of specified axis are preserved. - If groupby_args["as_index"] is True then labels on the specified axis are the group names, otherwise labels would be default: 0, 1 ... n. - If groupby_args["as_index"] is False, then first N columns/rows of the frame contain group names, where N is the columns/rows to group on. - Each element of QueryCompiler is the {result} for the corresponding group and column/row. .. warning `map_args` and `reduce_args` parameters are deprecated. They're leaked here from ``PandasQueryCompiler.groupby_*``, pandas storage format implements groupby via TreeReduce approach, but for other storage formats these parameters make no sense, and so they'll be removed in the future. """ if action is None: action = f"compute {result}" return doc_qc_method( template, result=result, action=action, refer_to=f"GroupBy.{refer_to}" ) ================================================ FILE: modin/core/storage_formats/base/query_compiler.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains class ``BaseQueryCompiler``. ``BaseQueryCompiler`` is a parent abstract class for any other query compiler class. """ from __future__ import annotations import abc import warnings from enum import IntEnum from functools import cached_property from types import MappingProxyType from typing import TYPE_CHECKING, Any, Hashable, List, Literal, Optional, Union import numpy as np import pandas import pandas.core.resample from pandas._typing import DtypeBackend, IndexLabel, Suffixes from pandas.core.dtypes.common import is_number, is_scalar from modin.config.envvars import Backend, Execution from modin.core.dataframe.algebra.default2pandas import ( BinaryDefault, CatDefault, DataFrameDefault, DateTimeDefault, ExpandingDefault, GroupByDefault, ListDefault, ResampleDefault, RollingDefault, SeriesDefault, SeriesGroupByDefault, StrDefault, StructDefault, ) from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.error_message import ErrorMessage from modin.logging import ClassLogger from modin.logging.config import LogLevel from modin.logging.logger_decorator import disable_logging from modin.utils import MODIN_UNNAMED_SERIES_LABEL, try_cast_to_pandas from . 
class QCCoercionCost(IntEnum):  # noqa: PR01
    """
    Coercion costs between different Query Compiler backends.

    A coercion cost is an integer between 0 and 1000 inclusive, where 1000
    is considered impossible. Real costs depend on many variables (dataset
    size, partitioning, network throughput, query time), so a fixed set of
    cost levels is defined to let two query compilers / engines be compared
    in a unified way.

    COST_ZERO means there is no cost associated, or that the query compilers
    are the same.

    COST_IMPOSSIBLE means the coercion is effectively impossible, which can
    occur if the target system is unable to store the data as a result
    of the coercion. Currently this does not prevent coercion.
    """

    COST_ZERO = 0
    COST_LOW = 250
    COST_MEDIUM = 500
    COST_HIGH = 750
    COST_IMPOSSIBLE = 1000

    @classmethod
    def validate_coersion_cost(cls, cost: QCCoercionCost):
        """
        Validate that the coercion cost is within range.

        Parameters
        ----------
        cost : QCCoercionCost
            Cost to check; it is converted with ``int`` before comparison.

        Raises
        ------
        ValueError
            If the integer value of `cost` is below ``COST_ZERO`` or above
            ``COST_IMPOSSIBLE``.
        """
        lowest = int(QCCoercionCost.COST_ZERO)
        highest = int(QCCoercionCost.COST_IMPOSSIBLE)
        value = int(cost)
        if not lowest <= value <= highest:
            raise ValueError("Query compiler coercsion cost out of range")
@classmethod
def _maybe_warn_on_default(cls, *, message: str = "", reason: str = "") -> None:
    """
    Emit the default-to-pandas warning unless this class has it switched off.

    Parameters
    ----------
    message : str, default: ""
        Method that is defaulting to pandas.
    reason : str, default: ""
        Reason for default.
    """
    if not cls._should_warn_on_default_to_pandas:
        return
    ErrorMessage.default_to_pandas(message=message, reason=reason)
def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
    """
    Do fallback to pandas for the passed function.

    Parameters
    ----------
    pandas_op : callable(pandas.DataFrame) -> object
        Function to apply to the casted to pandas frame.
    *args : iterable
        Positional arguments to pass to `pandas_op`.
    **kwargs : dict
        Key-value arguments to pass to `pandas_op`.

    Returns
    -------
    BaseQueryCompiler
        The result of the `pandas_op`, converted back to ``BaseQueryCompiler``.
        A tuple/list result is returned as a list with every pandas
        DataFrame/Series element wrapped (or as-is for ``Series.tolist``);
        any other non-pandas result is returned unchanged.
    """
    # `pandas_op` is not guaranteed to carry `__name__` (e.g. `functools.partial`),
    # so every later use of the name goes through this fallback value.
    op_name = getattr(pandas_op, "__name__", str(pandas_op))
    self._maybe_warn_on_default(message=op_name)
    args = try_cast_to_pandas(args)
    kwargs = try_cast_to_pandas(kwargs)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        result = pandas_op(try_cast_to_pandas(self), *args, **kwargs)
        if isinstance(result, (tuple, list)):
            if "Series.tolist" in op_name:
                # fast path: no need to iterate over the result from `tolist` function
                return result
            return [self.__wrap_in_qc(obj) for obj in result]
        return self.__wrap_in_qc(result)
@classmethod
def _stay_cost_rows(
    cls, rows: int, per_row_overhead: int, max_size: int, op_init_overhead: int
) -> int:
    """
    Get the cost of staying on this query compiler for an operation.

    Parameters
    ----------
    rows : int
        The number of input rows.
    per_row_overhead : int
        Per-row cost of this operation.
    max_size : int
        Max rows for this query compiler.
    op_init_overhead : int
        Overhead cost of this operation.

    Returns
    -------
    int
        Cost of staying on this query compiler, capped at
        ``QCCoercionCost.COST_IMPOSSIBLE``.
    """
    if rows > max_size:
        return QCCoercionCost.COST_IMPOSSIBLE
    if max_size == 0:
        # Reaching here means `rows <= 0`, so there is no per-row work to
        # account for; skipping the normalization avoids a ZeroDivisionError.
        normalized_cost_all_rows = 0
    else:
        cost_all_rows = rows * per_row_overhead
        # Scale the per-row cost into the 0..COST_IMPOSSIBLE range.
        normalized_cost_all_rows = (
            cost_all_rows / max_size * QCCoercionCost.COST_IMPOSSIBLE
        )
    total_cost = normalized_cost_all_rows + op_init_overhead
    if total_cost > QCCoercionCost.COST_IMPOSSIBLE:
        return QCCoercionCost.COST_IMPOSSIBLE
    return int(total_cost)
@disable_logging
@classmethod
def move_to_me_cost(
    cls,
    other_qc: BaseQueryCompiler,
    api_cls_name: Optional[str],
    operation: str,
    arguments: MappingProxyType[str, Any],
) -> Optional[int]:
    """
    Return the execution and hidden coercion costs from other_qc.

    This is the class-level counterpart of ``stay_cost``: it answers the
    question "what is the cost of executing this operation if it were to
    move to this query compiler?". Values returned must be within the
    acceptable range of ``QCCoercionCost``.

    Parameters
    ----------
    other_qc : BaseQueryCompiler
        The query compiler from which we should return the cost of switching.
    api_cls_name : Optional[str]
        The class name performing the operation which can be used as a
        consideration for the costing analysis. `None` means the function
        is not associated with a class.
    operation : str
        The operation being performed which can be used as a consideration
        for the costing analysis.
    arguments : MappingProxyType[str, Any]
        The arguments to the operation.

    Returns
    -------
    Optional[int]
        Cost of migrating the data from other_qc to this qc or
        None if the cost cannot be determined.
    """
    # Only the row count of the incoming frame and this class's own
    # overhead/size settings feed the estimate in this base implementation.
    return cls._stay_cost_rows(
        other_qc._max_shape()[0],
        cls._OPERATION_PER_ROW_OVERHEAD,
        cls._engine_max_size(),
        cls._OPERATION_INITIALIZATION_OVERHEAD,
    )
""" return self.get_axis_len(axis=0), self.get_axis_len(axis=1) @property def lazy_shape(self): """ Whether either of the underlying dataframe's dimensions (row count/column count) are computed lazily. If True, the frontend should avoid length/shape checks as much as possible. Returns ------- bool """ return self.lazy_row_count or self.lazy_column_count _shape_hint = None # Metadata modification abstract methods def add_prefix(self, prefix, axis=1): """ Add string prefix to the index labels along specified axis. Parameters ---------- prefix : str The string to add before each label. axis : {0, 1}, default: 1 Axis to add prefix along. 0 is for index and 1 is for columns. Returns ------- BaseQueryCompiler New query compiler with updated labels. """ return DataFrameDefault.register(pandas.DataFrame.add_prefix)( self, prefix=prefix, axis=axis ) def add_suffix(self, suffix, axis=1): """ Add string suffix to the index labels along specified axis. Parameters ---------- suffix : str The string to add after each label. axis : {0, 1}, default: 1 Axis to add suffix along. 0 is for index and 1 is for columns. Returns ------- BaseQueryCompiler New query compiler with updated labels. """ return DataFrameDefault.register(pandas.DataFrame.add_suffix)( self, suffix=suffix, axis=axis ) # END Metadata modification abstract methods # Abstract copy def copy(self): """ Make a copy of this object. Returns ------- BaseQueryCompiler Copy of self. Notes ----- For copy, we don't want a situation where we modify the metadata of the copies if we end up modifying something here. We copy all of the metadata to prevent that. """ return DataFrameDefault.register(pandas.DataFrame.copy)(self) # END Abstract copy # Abstract join and append helper functions def concat(self, axis, other, **kwargs): # noqa: PR02 """ Concatenate `self` with passed query compilers along specified axis. Parameters ---------- axis : {0, 1} Axis to concatenate along. 0 is for index and 1 is for columns. 
other : BaseQueryCompiler or list of such Objects to concatenate with `self`. join : {'outer', 'inner', 'right', 'left'}, default: 'outer' Type of join that will be used if indices on the other axis are different. (note: if specified, has to be passed as ``join=value``). ignore_index : bool, default: False If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, …, n - 1. (note: if specified, has to be passed as ``ignore_index=value``). sort : bool, default: False Whether or not to sort non-concatenation axis. (note: if specified, has to be passed as ``sort=value``). **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler Concatenated objects. """ concat_join = ["inner", "outer"] def concat(df, axis, other, **kwargs): kwargs.pop("join_axes", None) ignore_index = kwargs.get("ignore_index", False) if kwargs.get("join", "outer") in concat_join: if not isinstance(other, list): other = [other] other = [df] + other result = pandas.concat(other, axis=axis, **kwargs) else: if isinstance(other, (list, np.ndarray)) and len(other) == 1: other = other[0] ignore_index = kwargs.pop("ignore_index", None) kwargs["how"] = kwargs.pop("join", None) if ( isinstance(other, (pandas.DataFrame, pandas.Series)) or len(other) <= 1 ): kwargs["rsuffix"] = "r_" result = df.join(other, **kwargs) if ignore_index: if axis == 0: result = result.reset_index(drop=True) else: result.columns = pandas.RangeIndex(len(result.columns)) return result return DataFrameDefault.register(concat)(self, axis=axis, other=other, **kwargs) # END Abstract join and append helper functions # Data Management Methods @abc.abstractmethod def free(self): """Trigger a cleanup of this object.""" pass @abc.abstractmethod def finalize(self): """Finalize constructing the dataframe calling all deferred functions which were used to build it.""" pass @abc.abstractmethod def execute(self): """Wait for all computations to complete 
without materializing data.""" pass def support_materialization_in_worker_process(self) -> bool: """ Whether it's possible to call function `to_pandas` during the pickling process, at the moment of recreating the object. Returns ------- bool """ return self._modin_frame.support_materialization_in_worker_process() # END Data Management Methods # Data Movement Methods def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]: """ Move this query compiler to the specified backend. Parameters ---------- target_backend : str The backend to move to. Returns ------- BaseQueryCompiler or Any The new query compiler with the source data, or a sentinel `NotImplemented` value if transfer is not implemented. """ return NotImplemented @classmethod def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCompiler, Any]: """ Move the source query compiler to the current backend. Parameters ---------- source_qc : BaseQueryCompiler The source query compiler to move data from. Returns ------- BaseQueryCompiler or Any A new query compiler with the source data, or a sentinel `NotImplemented` value if transfer is not implemented. """ return NotImplemented # END Data Movement Methods # To/From Pandas @abc.abstractmethod def to_pandas(self): """ Convert underlying query compilers data to ``pandas.DataFrame``. Returns ------- pandas.DataFrame The QueryCompiler converted to pandas. """ pass @classmethod @abc.abstractmethod def from_pandas(cls, df, data_cls): """ Build QueryCompiler from pandas DataFrame. Parameters ---------- df : pandas.DataFrame The pandas DataFrame to convert from. data_cls : type :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class (or its descendant) to convert to. Returns ------- BaseQueryCompiler QueryCompiler containing data from the pandas DataFrame. """ pass # END To/From Pandas # From Arrow @classmethod @abc.abstractmethod def from_arrow(cls, at, data_cls): """ Build QueryCompiler from Arrow Table. 
def to_numpy(self, **kwargs):  # noqa: PR02
    """
    Convert underlying query compilers data to NumPy array.

    Parameters
    ----------
    dtype : dtype
        The dtype of the resulted array.
    copy : bool
        Whether to ensure that the returned value is not a view on another array.
    na_value : object
        The value to replace missing values with.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    np.ndarray
        The QueryCompiler converted to NumPy array.
    """
    to_numpy_default = DataFrameDefault.register(pandas.DataFrame.to_numpy)
    return to_numpy_default(self, **kwargs)
def do_array_function_implementation(
    self,
    frame: BasePandasDataset,
    func: callable,
    types: tuple,
    args: tuple,
    kwargs: dict,
) -> Union["DataFrame", "Series", Any]:
    """
    Apply the provided NumPy array function to the underlying data.

    This method is called by the ``__array_function__`` dispatcher on BasePandasDataset.

    Unlike other query compiler methods, this function directly operates on the input DataFrame/Series
    to allow for easier argument processing. The default implementation converts `frame` to a NumPy
    array and dispatches to it, but a query compiler sub-class may override this method to provide
    a distributed implementation.

    See NumPy docs: https://numpy.org/neps/nep-0018-array-function-protocol.html#nep18

    Parameters
    ----------
    frame : BasePandasDataset
        The DataFrame or Series on which the array function was called. Its query compiler
        must match ``self``.
    func : callable
        The NumPy function to apply.
    types : tuple
        The types of the args.
    args : tuple
        The args to the func.
    kwargs : dict
        Additional keyword arguments.

    Returns
    -------
    DataFrame | Series | Any
        The result of applying the function to this dataset. By default, it will return
        a NumPy array.
    """
    from modin.pandas.base import BasePandasDataset

    assert (
        self is frame._query_compiler
    ), "__array_function__ called with mismatched query compiler and input frame"

    # Replace each modin type with numpy ndarray, since we convert modin frames to np ndarrays.
    # The protocol describes `types` as a collection, so materialize a tuple
    # rather than handing over a single-use generator.
    new_types = tuple(
        np.ndarray if issubclass(tpe, BasePandasDataset) else tpe for tpe in types
    )
    return frame.__array__().__array_function__(func, new_types, args, kwargs)
@doc_utils.add_refer_to("DataFrame.to_dict")
def dataframe_to_dict(self, orient="dict", into=dict, index=True):  # noqa: PR01
    """
    Convert the DataFrame to a dictionary.

    The data is first converted to a pandas DataFrame with ``self.to_pandas()``
    and the conversion itself is done by ``pandas.DataFrame.to_dict``.

    Returns
    -------
    dict or `into` instance
    """
    # Only ``orient`` is passed positionally: pandas deprecated non-keyword
    # ``into``/``index`` arguments of ``DataFrame.to_dict``.
    return self.to_pandas().to_dict(orient, into=into, index=index)
@doc_utils.add_refer_to("DataFrame.combine")
def combine(self, other, **kwargs):  # noqa: PR02
    """
    Combine `self` with another QueryCompiler column by column using the passed `func`.

    Frames are aligned first if their axes are not equal.

    Parameters
    ----------
    other : BaseQueryCompiler
        Right operand of the binary operation.
    func : callable(pandas.Series, pandas.Series) -> pandas.Series
        Function that receives two ``pandas.Series`` with aligned axes and
        returns a single ``pandas.Series`` holding their combination.
    fill_value : float or None
        Value used to fill missing values once the frames have been aligned.
    overwrite : bool
        If True, columns in `self` that do not exist in `other` will be
        overwritten with NaNs.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        Result of combine.
    """
    # Default implementation: delegate to ``pandas.DataFrame.combine``.
    pandas_combine = BinaryDefault.register(pandas.DataFrame.combine)
    return pandas_combine(self, other=other, **kwargs)
@doc_utils.doc_binary_method(
    operation="equality comparison", sign="==", op_type="series_comparison"
)
def series_eq(self, other, **kwargs):  # noqa: PR02
    # Default implementation: delegate to ``pandas.Series.eq``.
    pandas_series_eq = BinaryDefault.register(pandas.Series.eq)
    # ``squeeze_other`` is taken out of ``kwargs`` so that it is passed exactly once.
    squeeze_other = kwargs.pop("squeeze_other", False)
    return pandas_series_eq(
        self,
        other=other,
        squeeze_self=True,
        squeeze_other=squeeze_other,
        **kwargs,
    )
""" return SeriesDefault.register(pandas.Series.divmod)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="greater than or equal comparison", sign=">=", op_type="comparison" ) def ge(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.ge)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="greater than or equal comparison", sign=">=", op_type="series_comparison", ) def series_ge(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.Series.ge)( self, other=other, squeeze_self=True, squeeze_other=kwargs.pop("squeeze_other", False), **kwargs, ) @doc_utils.doc_binary_method( operation="greater than comparison", sign=">", op_type="comparison" ) def gt(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.gt)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="greater than comparison", sign=">", op_type="series_comparison" ) def series_gt(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.Series.gt)( self, other=other, squeeze_self=True, squeeze_other=kwargs.pop("squeeze_other", False), **kwargs, ) @doc_utils.doc_binary_method( operation="less than or equal comparison", sign="<=", op_type="comparison" ) def le(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.le)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="less than or equal comparison", sign="<=", op_type="series_comparison", ) def series_le(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.Series.le)( self, other=other, squeeze_self=True, squeeze_other=kwargs.pop("squeeze_other", False), **kwargs, ) @doc_utils.doc_binary_method( operation="less than comparison", sign="<", op_type="comparison" ) def lt(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.lt)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="less than", sign="<", 
op_type="series_comparison" ) def series_lt(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.Series.lt)( self, other=other, squeeze_self=True, squeeze_other=kwargs.pop("squeeze_other", False), **kwargs, ) @doc_utils.doc_binary_method(operation="modulo", sign="%") def mod(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.mod)(self, other=other, **kwargs) @doc_utils.doc_binary_method(operation="multiplication", sign="*") def mul(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.mul)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="multiplication", sign="*", self_on_right=True ) def rmul(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rmul)( self, other=other, **kwargs ) @doc_utils.add_refer_to("DataFrame.corr") def corr(self, **kwargs): # noqa: PR02 """ Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- method : {'pearson', 'kendall', 'spearman'} or callable(pandas.Series, pandas.Series) -> pandas.Series Correlation method. min_periods : int Minimum number of observations required per pair of columns to have a valid result. If fewer than `min_periods` non-NA values are present the result will be NA. **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler Correlation matrix. """ return DataFrameDefault.register(pandas.DataFrame.corr)(self, **kwargs) @doc_utils.add_refer_to("Series.corr") def series_corr(self, **kwargs): # noqa: PR01 """ Compute correlation with `other` Series, excluding missing values. The two `Series` objects are not required to be the same length and will be aligned internally before the correlation function is applied. Returns ------- float Correlation with other. 
""" return SeriesDefault.register(pandas.Series.corr)(self, **kwargs) @doc_utils.add_refer_to("DataFrame.corrwith") def corrwith(self, **kwargs): # noqa: PR01 """ Compute pairwise correlation. Returns ------- BaseQueryCompiler """ return DataFrameDefault.register(pandas.DataFrame.corrwith)(self, **kwargs) @doc_utils.add_refer_to("DataFrame.cov") def cov(self, **kwargs): # noqa: PR02 """ Compute pairwise covariance of columns, excluding NA/null values. Parameters ---------- min_periods : int **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler Covariance matrix. """ return DataFrameDefault.register(pandas.DataFrame.cov)(self, **kwargs) def dot(self, other, **kwargs): # noqa: PR02 """ Compute the matrix multiplication of `self` and `other`. Parameters ---------- other : BaseQueryCompiler or NumPy array The other query compiler or NumPy array to matrix multiply with `self`. squeeze_self : boolean If `self` is a one-column query compiler, indicates whether it represents Series object. squeeze_other : boolean If `other` is a one-column query compiler, indicates whether it represents Series object. **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler A new query compiler that contains result of the matrix multiply. 
""" if kwargs.get("squeeze_self", False): applyier = pandas.Series.dot else: applyier = pandas.DataFrame.dot return BinaryDefault.register(applyier)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="not equal comparison", sign="!=", op_type="comparison" ) def ne(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.ne)(self, other=other, **kwargs) @doc_utils.doc_binary_method( operation="not equal comparison", sign="!=", op_type="series_comparison" ) def series_ne(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.Series.ne)( self, other=other, squeeze_self=True, squeeze_other=kwargs.pop("squeeze_other", False), **kwargs, ) @doc_utils.doc_binary_method(operation="exponential power", sign="**") def pow(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.pow)(self, other=other, **kwargs) @doc_utils.doc_binary_method(operation="addition", sign="+", self_on_right=True) def radd(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.radd)( self, other=other, **kwargs ) @doc_utils.add_refer_to("Series.rdivmod") def rdivmod(self, other, **kwargs): """ Return Integer division and modulo of `self` and `other`, element-wise (binary operator rdivmod). Equivalent to `other` divmod `self`, but with support to substitute a fill_value for missing data in either one of the inputs. Parameters ---------- other : BaseQueryCompiler or scalar value **kwargs : dict Other arguments for division. Returns ------- BaseQueryCompiler Compiler representing Series with divisor part of division. BaseQueryCompiler Compiler representing Series with modulo part of division. 
""" return SeriesDefault.register(pandas.Series.rdivmod)( self, other=other, **kwargs ) @doc_utils.doc_binary_method( operation="integer division", sign="//", self_on_right=True ) def rfloordiv(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rfloordiv)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="modulo", sign="%", self_on_right=True) def rmod(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rmod)( self, other=other, **kwargs ) @doc_utils.doc_binary_method( operation="exponential power", sign="**", self_on_right=True ) def rpow(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rpow)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="subtraction", sign="-", self_on_right=True) def rsub(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rsub)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="division", sign="/", self_on_right=True) def rtruediv(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.rtruediv)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="subtraction", sign="-") def sub(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.sub)(self, other=other, **kwargs) @doc_utils.doc_binary_method(operation="division", sign="/") def truediv(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.truediv)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="conjunction", sign="&", op_type="logical") def __and__(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.__and__)( self, other=other, **kwargs ) @doc_utils.doc_binary_method(operation="disjunction", sign="|", op_type="logical") def __or__(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.__or__)( self, other=other, 
# FIXME: query compiler shouldn't care about differences between Frame and Series.
# We should combine `df_update` and `series_update` into one method (Modin issue #3101).
@doc_utils.add_refer_to("DataFrame.update")
def df_update(self, other, **kwargs):  # noqa: PR02
    """
    Replace values of `self` with non-NA values of `other` found at the same positions.

    Frames are aligned first if their axes are not equal.

    Parameters
    ----------
    other : BaseQueryCompiler
        Frame to grab replacement values from.
    join : {"left"}
        Type of join used to align the frames when their axes are not equal
        (note: currently only one type of join is implemented).
    overwrite : bool
        Whether to overwrite every corresponding value of self, or only if it's NAN.
    filter_func : callable(pandas.Series, pandas.Series) -> numpy.ndarray
        Function that takes a column of `self` and returns a boolean mask of
        the values that should be overwritten in the `self` frame.
    errors : {"raise", "ignore"}
        If "raise", will raise a ``ValueError`` if `self` and `other` both contain
        non-NA data in the same place.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler with updated values.
    """
    # Default implementation: delegate to ``pandas.DataFrame.update``, which is
    # registered with ``inplace=True``.
    pandas_update = BinaryDefault.register(pandas.DataFrame.update, inplace=True)
    return pandas_update(self, other=other, **kwargs)
""" if isinstance(lower, BaseQueryCompiler): lower = lower.to_pandas().squeeze(1) if isinstance(upper, BaseQueryCompiler): upper = upper.to_pandas().squeeze(1) return DataFrameDefault.register(pandas.DataFrame.clip)( self, lower=lower, upper=upper, **kwargs ) @doc_utils.add_refer_to("DataFrame.where") def where(self, cond, other, **kwargs): # noqa: PR02 """ Update values of `self` using values from `other` at positions where `cond` is False. Parameters ---------- cond : BaseQueryCompiler Boolean mask. True - keep the self value, False - replace by `other` value. other : BaseQueryCompiler or pandas.Series Object to grab replacement values from. axis : {0, 1} Axis to align frames along if axes of self, `cond` and `other` are not equal. 0 is for index, when 1 is for columns. level : int or label, optional Level of MultiIndex to align frames along if axes of self, `cond` and `other` are not equal. Currently `level` parameter is not implemented, so only None value is acceptable. **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler QueryCompiler with updated data. """ return DataFrameDefault.register(pandas.DataFrame.where)( self, cond=cond, other=other, **kwargs ) @doc_utils.add_refer_to("DataFrame.merge") def merge(self, right, **kwargs): # noqa: PR02 """ Merge QueryCompiler objects using a database-style join. Parameters ---------- right : BaseQueryCompiler QueryCompiler of the right frame to merge with. how : {"left", "right", "outer", "inner", "cross"} on : label or list of such left_on : label or list of such right_on : label or list of such left_index : bool right_index : bool sort : bool suffixes : list-like copy : bool indicator : bool or str validate : str **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler QueryCompiler that contains result of the merge. 
""" return DataFrameDefault.register(pandas.DataFrame.merge)( self, right=right, **kwargs ) @doc_utils.add_refer_to("merge_ordered") def merge_ordered(self, right, **kwargs): # noqa: PR01 """ Perform a merge for ordered data with optional filling/interpolation. Returns ------- BaseQueryCompiler """ return DataFrameDefault.register(pandas.merge_ordered)(self, right, **kwargs) def _get_column_as_pandas_series(self, key): """ Get column data by label as pandas.Series. Parameters ---------- key : Any Column label. Returns ------- pandas.Series """ result = self.getitem_array([key]).to_pandas().squeeze(axis=1) if not isinstance(result, pandas.Series): raise RuntimeError( f"Expected getting column {key} to give " + f"pandas.Series, but instead got {type(result)}" ) return result def merge_asof( self, right: "BaseQueryCompiler", left_on: Optional[IndexLabel] = None, right_on: Optional[IndexLabel] = None, left_index: bool = False, right_index: bool = False, left_by=None, right_by=None, suffixes: Suffixes = ("_x", "_y"), tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", ): # noqa: GL08 self._maybe_warn_on_default(message="`merge_asof`") # Pandas fallbacks for tricky cases: if ( # No idea how this works or why it does what it does; and in fact # there's a Pandas bug suggesting it's wrong: # https://github.com/pandas-dev/pandas/issues/33463 (left_index and right_on is not None) # This is the case where by is a list of columns. If we're copying lots # of columns out of Pandas, maybe not worth trying our path, it's not # clear it's any better: or not (left_by is None or is_scalar(left_by)) or not (right_by is None or is_scalar(right_by)) # The implementation below assumes that the right index is unique # because it uses merge_asof to map each position in the merged # index to the label of the one right row that should be merged # at that row position. 
or not right.index.is_unique ): return self.default_to_pandas( pandas.merge_asof, right, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, left_by=left_by, right_by=right_by, suffixes=suffixes, tolerance=tolerance, allow_exact_matches=allow_exact_matches, direction=direction, ) if left_on is None: left_column = self.index else: left_column = self._get_column_as_pandas_series(left_on) if right_on is None: right_column = right.index else: right_column = right._get_column_as_pandas_series(right_on) left_pandas_limited = {"on": left_column} right_pandas_limited = {"on": right_column, "right_labels": right.index} extra_kwargs = {} # extra arguments to Pandas merge_asof if left_by is not None or right_by is not None: extra_kwargs["by"] = "by" left_pandas_limited["by"] = self._get_column_as_pandas_series(left_by) right_pandas_limited["by"] = right._get_column_as_pandas_series(right_by) # 1. Construct Pandas DataFrames with just the 'on' and optional 'by' # columns, and the index as another column. left_pandas_limited = pandas.DataFrame(left_pandas_limited, index=self.index) right_pandas_limited = pandas.DataFrame(right_pandas_limited) # 2. Use Pandas' merge_asof to figure out how to map labels on left to # labels on the right. merged = pandas.merge_asof( left_pandas_limited, right_pandas_limited, on="on", direction=direction, allow_exact_matches=allow_exact_matches, tolerance=tolerance, **extra_kwargs, ) # Now merged["right_labels"] shows which labels from right map to left's index. # 3. Re-index right using the merged["right_labels"]; at this point right # should be same length and (semantically) same order as left: right_subset = right.reindex( axis=0, labels=pandas.Index(merged["right_labels"]) ) if not right_index: right_subset = right_subset.drop(columns=[right_on]) if right_by is not None and left_by == right_by: right_subset = right_subset.drop(columns=[right_by]) right_subset.index = self.index # 4. 
Merge left and the new shrunken right: result = self.merge( right_subset, left_index=True, right_index=True, suffixes=suffixes, how="left", ) # 5. Clean up to match Pandas output: if left_on is not None and right_index: result = result.insert( # In theory this could use get_indexer_for(), but that causes an error: list(result.columns).index(left_on + suffixes[0]), left_on, result.getitem_array([left_on + suffixes[0]]), ) if not left_index and not right_index: result = result.reset_index(drop=True) return result @doc_utils.add_refer_to("DataFrame.join") def join(self, right, **kwargs): # noqa: PR02 """ Join columns of another QueryCompiler. Parameters ---------- right : BaseQueryCompiler QueryCompiler of the right frame to join with. on : label or list of such how : {"left", "right", "outer", "inner"} lsuffix : str rsuffix : str sort : bool **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler QueryCompiler that contains result of the join. """ return DataFrameDefault.register(pandas.DataFrame.join)(self, right, **kwargs) # END Abstract inter-data operations # Abstract Transpose def transpose(self, *args, **kwargs): # noqa: PR02 """ Transpose this QueryCompiler. Parameters ---------- copy : bool Whether to copy the data after transposing. *args : iterable Serves the compatibility purpose. Does not affect the result. **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler Transposed new QueryCompiler. """ return DataFrameDefault.register(pandas.DataFrame.transpose)( self, *args, **kwargs ) def columnarize(self): """ Transpose this QueryCompiler if it has a single row but multiple columns. This method should be called for QueryCompilers representing a Series object, i.e. ``self.is_series_like()`` should be True. Returns ------- BaseQueryCompiler Transposed new QueryCompiler or self. 
""" if self._shape_hint == "column": return self result = self if len(self.columns) != 1 or ( len(self.index) == 1 and self.index[0] == MODIN_UNNAMED_SERIES_LABEL ): result = self.transpose() result._shape_hint = "column" return result def is_series_like(self): """ Check whether this QueryCompiler can represent ``modin.pandas.Series`` object. Returns ------- bool Return True if QueryCompiler has a single column or row, False otherwise. """ return len(self.columns) == 1 or len(self.index) == 1 # END Abstract Transpose # Abstract reindex/reset_index (may shuffle data) @doc_utils.add_refer_to("DataFrame.reindex") def reindex(self, axis, labels, **kwargs): # noqa: PR02 """ Align QueryCompiler data with a new index along specified axis. Parameters ---------- axis : {0, 1} Axis to align labels along. 0 is for index, 1 is for columns. labels : list-like Index-labels to align with. method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} Method to use for filling holes in reindexed frame. fill_value : scalar Value to use for missing values in the resulted frame. limit : int tolerance : int **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- BaseQueryCompiler QueryCompiler with aligned axis. """ return DataFrameDefault.register(pandas.DataFrame.reindex)( self, axis=axis, labels=labels, **kwargs ) @doc_utils.add_refer_to("DataFrame.reset_index") def reset_index(self, **kwargs): # noqa: PR02 """ Reset the index, or a level of it. Parameters ---------- drop : bool Whether to drop the reset index or insert it at the beginning of the frame. level : int or label, optional Level to remove from index. Removes all levels by default. col_level : int or label If the columns have multiple levels, determines which level the labels are inserted into. col_fill : label If the columns have multiple levels, determines how the other levels are named. **kwargs : dict Serves the compatibility purpose. Does not affect the result. 
def set_index_from_columns(
    self, keys: List[Hashable], drop: bool = True, append: bool = False
):
    """
    Build new row labels out of a list of columns.

    Parameters
    ----------
    keys : list of hashable
        The list of column names that will become the new index.
    drop : bool, default: True
        Whether or not to drop the columns provided in the `keys` argument.
    append : bool, default: False
        Whether or not to add the columns in `keys` as new levels appended to the
        existing index.

    Returns
    -------
    BaseQueryCompiler
        A new QueryCompiler with updated index.
    """
    # Default implementation: delegate to ``pandas.DataFrame.set_index``.
    pandas_set_index = DataFrameDefault.register(pandas.DataFrame.set_index)
    return pandas_set_index(self, keys=keys, drop=drop, append=append)
@doc_utils.doc_reduce_agg(
    method="product",
    refer_to="prod",
    extra_params=["**kwargs"],
    params="axis : {0, 1}",
)
def prod(self, **kwargs):  # noqa: PR02
    # ``method`` above is the human-readable name of the reduction handed to the
    # documentation decorator; ``prod`` computes a product, not a "production".
    #
    # Default implementation: delegate to ``pandas.DataFrame.prod``.
    return DataFrameDefault.register(pandas.DataFrame.prod)(self, **kwargs)
def conj(self, **kwargs):
    """
    Compute the complex conjugate of every element of self.

    Parameters
    ----------
    **kwargs : dict

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler with conjugate applied element-wise.

    Notes
    -----
    Please refer to ``numpy.conj`` for parameters description.
    """

    def conj(df, *args, **kwargs):
        # Extra arguments are accepted for signature compatibility only;
        # they are not forwarded to ``np.conj``.
        conjugated = np.conj(df)
        return pandas.DataFrame(conjugated)

    conj_via_pandas = DataFrameDefault.register(conj)
    return conj_via_pandas(self, **kwargs)
def isin(self, values, ignore_indices=False, **kwargs):  # noqa: PR02
    """
    Test every element of `self` for membership in `values`.

    Parameters
    ----------
    values : list-like, modin.pandas.Series, modin.pandas.DataFrame or dict
        Values to check elements of self in.
    ignore_indices : bool, default: False
        Whether to execute ``isin()`` only on an intersection of indices.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        Boolean mask for self of whether an element at the corresponding
        position is contained in `values`.
    """
    values_is_qc = isinstance(values, type(self))
    if values_is_qc and ignore_indices:
        # Pandas logic is that it ignores indexing if 'values' is a 1D object
        values = values.to_pandas().squeeze(axis=1)

    # One-column ("column" shape hint) data goes through the Series
    # implementation, everything else through the DataFrame one.
    handler = (
        SeriesDefault.register(pandas.Series.isin)
        if self._shape_hint == "column"
        else DataFrameDefault.register(pandas.DataFrame.isin)
    )
    return handler(self, values, **kwargs)
@doc_utils.add_refer_to("DataFrame.round")
def round(self, **kwargs):  # noqa: PR02
    """
    Round each numeric value to the specified number of decimals.

    Parameters
    ----------
    decimals : int or list-like
        Number of decimals to round each column to.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler with rounded values.
    """
    round_via_pandas = DataFrameDefault.register(pandas.DataFrame.round)
    return round_via_pandas(self, **kwargs)
@doc_utils.add_one_column_warning
# FIXME: adding refer-to note will create two instances of the "Notes" section,
# this breaks numpydoc style rules and also crashes the doc-style checker script.
# For now manually added the refer-to message.
# @doc_utils.add_refer_to("Series.view")
def series_view(self, **kwargs):  # noqa: PR02
    """
    Reinterpret the underlying data using a new dtype.

    Parameters
    ----------
    dtype : dtype
        Data type to reinterpret underlying data with.
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler of the same data in memory, with reinterpreted values.

    Notes
    -----
    - Be aware that if this method falls back to pandas, the newly created
      QueryCompiler will be a copy of the original data.
    - Please refer to ``modin.pandas.Series.view`` for more information
      about parameters and output format.
    """
    view_via_pandas = SeriesDefault.register(pandas.Series.view)
    return view_via_pandas(self, **kwargs)
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("to_timedelta")
def to_timedelta(self, unit="ns", errors="raise"):  # noqa: PR02
    """
    Convert the underlying values to timedelta.

    Parameters
    ----------
    unit : str, default: "ns"
        Denotes the unit of the arg for numeric arg. Defaults to "ns".
    errors : {"ignore", "raise", "coerce"}, default: "raise"

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler with converted to timedelta values.
    """
    converter = SeriesDefault.register(pandas.to_timedelta)
    return converter(self, unit=unit, errors=errors)
def astype(self, col_dtypes, errors: str = "raise"):  # noqa: PR02
    """
    Cast columns to the given dtypes.

    Parameters
    ----------
    col_dtypes : dict or str
        Map for column names and new dtypes.
    errors : {'raise', 'ignore'}, default: 'raise'
        Control raising of exceptions on invalid data for provided dtype.

        - raise : allow exceptions to be raised
        - ignore : suppress exceptions. On error return original object.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler with updated dtypes.
    """
    # `col_dtypes` is handed to pandas under its own keyword name, `dtype`.
    caster = DataFrameDefault.register(pandas.DataFrame.astype)
    return caster(self, dtype=col_dtypes, errors=errors)
def convert_dtypes(
    self,
    infer_objects: bool = True,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
    dtype_backend: DtypeBackend = "numpy_nullable",
):
    """
    Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.

    Parameters
    ----------
    infer_objects : bool, default: True
        Whether object dtypes should be converted to the best possible types.
    convert_string : bool, default: True
        Whether object dtypes should be converted to ``pd.StringDtype()``.
    convert_integer : bool, default: True
        Whether, if possible, conversion should be done to integer extension types.
    convert_boolean : bool, default: True
        Whether object dtypes should be converted to ``pd.BooleanDtype()``.
    convert_floating : bool, default: True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be given to integer dtypes
        if the floats can be faithfully casted to integers.
    dtype_backend : {"numpy_nullable", "pyarrow"}, default: "numpy_nullable"
        Which dtype_backend to use, e.g. whether a DataFrame should use nullable
        dtypes for all dtypes that have a nullable implementation when
        "numpy_nullable" is set, PyArrow is used for all dtypes if "pyarrow"
        is set.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler with updated dtypes.
    """
    # Every option is forwarded to pandas under the same keyword name.
    options = {
        "infer_objects": infer_objects,
        "convert_string": convert_string,
        "convert_integer": convert_integer,
        "convert_boolean": convert_boolean,
        "convert_floating": convert_floating,
        "dtype_backend": dtype_backend,
    }
    converter = DataFrameDefault.register(pandas.DataFrame.convert_dtypes)
    return converter(self, **options)
def first_valid_index(self):
    """
    Return index label of first non-NaN/NULL value.

    Returns
    -------
    scalar
    """
    find_first_valid = DataFrameDefault.register(pandas.DataFrame.first_valid_index)
    result_qc = find_first_valid(self)
    # The registered call yields an object exposing ``to_pandas()``;
    # materialize it and squeeze it down to the label itself.
    return result_qc.to_pandas().squeeze()
@doc_utils.add_refer_to("DataFrame.memory_usage")
def memory_usage(self, **kwargs):  # noqa: PR02
    """
    Return the memory usage of each column in bytes.

    Parameters
    ----------
    index : bool
    deep : bool
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        One-column QueryCompiler with index labels of `self`, where each row
        contains the memory usage for the corresponding column.
    """
    usage_via_pandas = DataFrameDefault.register(pandas.DataFrame.memory_usage)
    return usage_via_pandas(self, **kwargs)
@doc_utils.doc_reduce_agg(
    method="standard deviation",
    refer_to="std",
    extra_params=["skipna", "ddof", "**kwargs"],
)
def std(self, **kwargs):  # noqa: PR02
    # All keyword arguments are passed straight to ``pandas.DataFrame.std``.
    std_via_pandas = DataFrameDefault.register(pandas.DataFrame.std)
    return std_via_pandas(self, **kwargs)
@doc_utils.doc_cum_agg(method="sum", refer_to="cumsum")
def cumsum(self, fold_axis, **kwargs):  # noqa: PR02
    # `fold_axis` is accepted but not used by this implementation; only
    # `**kwargs` reach ``pandas.DataFrame.cumsum``.
    cumsum_via_pandas = DataFrameDefault.register(pandas.DataFrame.cumsum)
    return cumsum_via_pandas(self, **kwargs)
@doc_utils.add_refer_to("DataFrame.nlargest")
def nlargest(self, n=5, columns=None, keep="first"):
    """
    Return the first `n` rows ordered by `columns` in descending order.

    Parameters
    ----------
    n : int, default: 5
    columns : list of labels, optional
        Column labels to order by.
        (note: this parameter can be omitted only for a single-column query compilers
        representing Series object, otherwise `columns` has to be specified).
    keep : {"first", "last", "all"}, default: "first"

    Returns
    -------
    BaseQueryCompiler
    """
    if columns is not None:
        frame_nlargest = DataFrameDefault.register(pandas.DataFrame.nlargest)
        return frame_nlargest(self, n=n, columns=columns, keep=keep)

    # No columns given: use the Series implementation, which takes no `columns`.
    series_nlargest = SeriesDefault.register(pandas.Series.nlargest)
    return series_nlargest(self, n=n, keep=keep)
@doc_utils.add_refer_to("DataFrame.eval")
def eval(self, expr, **kwargs):
    """
    Evaluate a string expression against the QueryCompiler columns.

    Parameters
    ----------
    expr : str
    **kwargs : dict

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler containing the result of evaluation.
    """
    evaluate = DataFrameDefault.register(pandas.DataFrame.eval)
    return evaluate(self, expr=expr, **kwargs)
@doc_utils.add_refer_to("DataFrame.rank")
def rank(self, **kwargs):  # noqa: PR02
    """
    Compute numerical rank along the specified axis.

    By default, equal values are assigned a rank that is the average of the ranks
    of those values, this behavior can be changed via `method` parameter.

    Parameters
    ----------
    axis : {0, 1}
    method : {"average", "min", "max", "first", "dense"}
    numeric_only : bool
    na_option : {"keep", "top", "bottom"}
    ascending : bool
    pct : bool
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler of the same shape as `self`, where each element is the
        numerical rank of the corresponding value along row or column.
    """
    rank_via_pandas = DataFrameDefault.register(pandas.DataFrame.rank)
    return rank_via_pandas(self, **kwargs)
@doc_utils.add_refer_to("DataFrame.sort_values")
def sort_rows_by_column_values(
    self, columns, ascending=True, **kwargs
):  # noqa: PR02
    """
    Reorder the rows based on the lexicographic order of the given columns.

    Parameters
    ----------
    columns : label or list of labels
        The column or columns to sort by.
    ascending : bool, default: True
        Sort in ascending order (True) or descending order (False).
    kind : {"quicksort", "mergesort", "heapsort"}
    na_position : {"first", "last"}
    ignore_index : bool
    key : callable(pandas.Index) -> pandas.Index, optional
    **kwargs : dict
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler that contains result of the sort.
    """
    # Avoid index/column name collisions by renaming and restoring after sorting
    index_renaming = None
    if is_scalar(columns):
        columns = [columns]
    if any(name in columns for name in self.index.names):
        index_renaming = self.index.names
        self.index = self.index.set_names([None] * len(self.index.names))
    try:
        new_query_compiler = DataFrameDefault.register(pandas.DataFrame.sort_values)(
            self, by=columns, axis=0, ascending=ascending, **kwargs
        )
    finally:
        if index_renaming is not None:
            # The renaming above was applied to `self` in place; undo it so the
            # caller's query compiler keeps its index names, even if sorting fails.
            self.index = self.index.set_names(index_renaming)
    if index_renaming is not None:
        new_query_compiler.index = new_query_compiler.index.set_names(
            index_renaming
        )
    return new_query_compiler
def getitem_row_array(self, key):
    """
    Select rows by their numeric positions.

    Parameters
    ----------
    key : list-like
        Numeric indices of the rows to pick.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler that contains specified rows.
    """

    def get_row(df, key):
        # Purely positional selection through ``iloc``.
        return df.iloc[key]

    row_getter = DataFrameDefault.register(get_row)
    return row_getter(self, key=key)
""" def inserter(df, loc, column, value): if isinstance(value, pandas.DataFrame): value = value.squeeze(axis=1) df.insert(loc, column, value) return df return DataFrameDefault.register(inserter, inplace=True)( self, loc=loc, column=column, value=value ) # END Abstract insert # __setitem__ methods def setitem_bool(self, row_loc, col_loc, item): """ Set an item to the given location based on `row_loc` and `col_loc`. Parameters ---------- row_loc : BaseQueryCompiler Query Compiler holding a Series of booleans. col_loc : label Column label in `self`. item : scalar An item to be set. Returns ------- BaseQueryCompiler New QueryCompiler with the inserted item. Notes ----- Currently, this method is only used to set a scalar to the given location. """ def _set_item(df, row_loc, col_loc, item): df.loc[row_loc.squeeze(axis=1), col_loc] = item return df return DataFrameDefault.register(_set_item)( self, row_loc=row_loc, col_loc=col_loc, item=item ) # END __setitem__ methods # Abstract drop def drop(self, index=None, columns=None, errors: str = "raise"): """ Drop specified rows or columns. Parameters ---------- index : list of labels, optional Labels of rows to drop. columns : list of labels, optional Labels of columns to drop. errors : str, default: "raise" If 'ignore', suppress error and only existing labels are dropped. Returns ------- BaseQueryCompiler New QueryCompiler with removed data. """ if index is None and columns is None: return self else: return DataFrameDefault.register(pandas.DataFrame.drop)( self, index=index, columns=columns, errors=errors ) # END drop # UDF (apply and agg) methods # There is a wide range of behaviors that are supported, so a lot of the # logic can get a bit convoluted. def apply(self, func, axis, raw=False, result_type=None, *args, **kwargs): """ Apply passed function across given axis. Parameters ---------- func : callable(pandas.Series) -> scalar, str, list or dict of such The function to apply to each column or row. 
axis : {0, 1}
            Target axis to apply the function along.
            0 is for index, 1 is for columns.
        raw : bool, default: False
            Whether to pass a high-level Series object (False) or a raw representation
            of the data (True).
        result_type : {"expand", "reduce", "broadcast", None}, default: None
            Determines how to treat list-like return type of the `func` (works only if
            a single function was passed):

            - "expand": expand list-like result into columns.
            - "reduce": keep result into a single cell (opposite of "expand").
            - "broadcast": broadcast result to original data shape (overwrite the existing column/row with the function result).
            - None: use "expand" strategy if Series is returned, "reduce" otherwise.
        *args : iterable
            Positional arguments to pass to `func`.
        **kwargs : dict
            Keyword arguments to pass to `func`.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler that contains the results of execution and is built by
            the following rules:

            - Index of the specified axis contains: the names of the passed functions if multiple
              functions are passed, otherwise: indices of the `func` result if "expand" strategy
              is used, indices of the original frame if "broadcast" strategy is used, a single
              label `MODIN_UNNAMED_SERIES_LABEL` if "reduce" strategy is used.
            - Labels of the opposite axis are preserved.
            - Each element is the result of execution of `func` against
              corresponding row/column.
        """
        # NOTE(review): `*args` is unpacked as extra positional arguments right
        # after `self`, even though it is written after the keywords. If the
        # registered wrapper forwards positionals straight into
        # `pandas.DataFrame.apply`, a non-empty `args` could collide with
        # `func=` - confirm against `DataFrameDefault.register`.
        return DataFrameDefault.register(pandas.DataFrame.apply)(
            self,
            func=func,
            axis=axis,
            raw=raw,
            result_type=result_type,
            *args,
            **kwargs,
        )

    def apply_on_series(self, func, *args, **kwargs):
        """
        Apply passed function on underlying Series.

        The query compiler must be series-like; this is enforced with an
        ``assert`` on ``self.is_series_like()``.

        Parameters
        ----------
        func : callable(pandas.Series) -> scalar, str, list or dict of such
            The function to apply to each row.
        *args : iterable
            Positional arguments to pass to `func`.
        **kwargs : dict
            Keyword arguments to pass to `func`.

        Returns
        -------
        BaseQueryCompiler
            Result of ``pandas.Series.apply`` as produced through ``SeriesDefault``.
        """
        # The check is an `assert`, so it is skipped when Python runs with `-O`.
        assert self.is_series_like()

        return SeriesDefault.register(pandas.Series.apply)(
            self,
            func=func,
            *args,
            **kwargs,
        )

    def explode(self, column):
        """
        Explode the given columns.

        Parameters
        ----------
        column : Union[Hashable, Sequence[Hashable]]
            The columns to explode.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler that contains the results of execution. For each row
            in the input QueryCompiler, if the selected columns each contain M
            items, there will be M rows created by exploding the columns.
        """
        # `column` is passed positionally to `pandas.DataFrame.explode`.
        return DataFrameDefault.register(pandas.DataFrame.explode)(self, column)

    # END UDF

    # Manual Partitioning methods (e.g. merge, groupby)
    # These methods require some sort of manual partitioning due to their
    # nature. They require certain data to exist on the same partition, and
    # after the shuffle, there should be only a local map required.

    # FIXME: `map_args` and `reduce_args` leaked there from `PandasQueryCompiler.groupby_*`,
    # pandas storage format implements groupby via TreeReduce approach, but for other storage formats these
    # parameters make no sense, they shouldn't be present in a base class.
@doc_utils.doc_groupby_method( action="count non-null values", result="number of non-null values", refer_to="count", ) def groupby_count( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.count)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="check whether any element is True", result="boolean of whether there is any element which is True", refer_to="any", ) def groupby_any( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.any)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the index of the minimum value", result="index of minimum value", refer_to="idxmin", ) def groupby_idxmin( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.idxmin)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the index of the maximum value", result="index of maximum value", refer_to="idxmax", ) def groupby_idxmax( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.idxmax)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the minimum value", result="minimum value", refer_to="min" ) def groupby_min( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.min)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, 
drop=drop, ) @doc_utils.doc_groupby_method(result="product", refer_to="prod") def groupby_prod( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.prod)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the maximum value", result="maximum value", refer_to="max" ) def groupby_max( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.max)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="check whether all elements are True", result="boolean of whether all elements are True", refer_to="all", ) def groupby_all( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.all)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method(result="sum", refer_to="sum") def groupby_sum( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.sum)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the number of elements", result="number of elements", refer_to="size", ) def groupby_size( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): result = GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.size)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, method="size", ) if not groupby_kwargs.get("as_index", False): # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name result.columns = 
result.columns[:-1].append(pandas.Index(["size"]))

        return result

    @doc_utils.add_refer_to("GroupBy.rolling")
    def groupby_rolling(
        self,
        by,
        agg_func,
        axis,
        groupby_kwargs,
        rolling_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
    ):
        """
        Group QueryCompiler data and apply passed aggregation function to a rolling window in each group.

        Parameters
        ----------
        by : BaseQueryCompiler, column or index label, Grouper or list of such
            Object that determine groups.
        agg_func : str, dict or callable(Series | DataFrame) -> scalar | Series | DataFrame
            Function to apply to the GroupBy object. A string is resolved as a method
            name on the rolling window; anything else must be callable.
        axis : {0, 1}
            Axis to group and apply aggregation function along.
            0 is for index, when 1 is for columns.
        groupby_kwargs : dict
            GroupBy parameters as expected by ``modin.pandas.DataFrame.groupby`` signature.
        rolling_kwargs : dict
            Parameters to build a rolling window as expected by ``modin.pandas.window.RollingGroupby`` signature.
        agg_args : list-like
            Positional arguments to pass to the `agg_func`.
        agg_kwargs : dict
            Key arguments to pass to the `agg_func`.
        drop : bool, default: False
            If `by` is a QueryCompiler indicates whether or not by-data came
            from the `self`.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler containing the result of groupby aggregation.
        """
        if isinstance(agg_func, str):
            # Keep the method name under a separate local, because `agg_func`
            # is rebound to the wrapper function right below.
            str_func = agg_func

            def agg_func(window, *args, **kwargs):
                return getattr(window, str_func)(*args, **kwargs)

        else:
            # Non-string aggregations must be callable (the check is skipped under `-O`).
            assert callable(agg_func)

        # The lambda builds the rolling window per group and hands it to `agg_func`;
        # the actual grouping is delegated to `groupby_agg` with `how="direct"`.
        return self.groupby_agg(
            by=by,
            agg_func=lambda grp, *args, **kwargs: agg_func(
                grp.rolling(**rolling_kwargs), *args, **kwargs
            ),
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            how="direct",
            drop=drop,
        )

    @doc_utils.add_refer_to("GroupBy.aggregate")
    def groupby_agg(
        self,
        by,
        agg_func,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        how="axis_wise",
        drop=False,
        series_groupby=False,
    ):
        """
        Group QueryCompiler data and apply passed aggregation function.

        Parameters
        ----------
        by : BaseQueryCompiler, column or index label, Grouper or list of such
            Object that determine groups.
        agg_func : str, dict or callable(Series | DataFrame) -> scalar | Series | DataFrame
            Function to apply to the GroupBy object.
        axis : {0, 1}
            Axis to group and apply aggregation function along.
            0 is for index, when 1 is for columns.
        groupby_kwargs : dict
            GroupBy parameters as expected by ``modin.pandas.DataFrame.groupby`` signature.
        agg_args : list-like
            Positional arguments to pass to the `agg_func`.
        agg_kwargs : dict
            Key arguments to pass to the `agg_func`.
        how : {'axis_wise', 'group_wise', 'transform'}, default: 'axis_wise'
            How to apply passed `agg_func`:

            - 'axis_wise': apply the function against each row/column.
            - 'group_wise': apply the function against every group.
            - 'transform': apply the function against every group and broadcast
              the result to the original Query Compiler shape.

            The value is handed to ``get_aggregation_method`` of the selected
            defaulter. ``groupby_rolling`` in this class also passes ``'direct'``,
            which is not described here - its meaning is defined by that method.
        drop : bool, default: False
            If `by` is a QueryCompiler indicates whether or not by-data came
            from the `self`.
        series_groupby : bool, default: False
            Whether we should treat `self` as Series when performing groupby.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler containing the result of groupby aggregation.
        """
        # A single-column QC `by` is reduced either to its column label (when the
        # data came from `self`, i.e. `drop` is True) or to a materialized,
        # squeezed pandas object.
        if isinstance(by, type(self)) and len(by.columns) == 1:
            by = by.columns[0] if drop else by.to_pandas().squeeze()
        # converting QC 'by' to a list of column labels only if this 'by' comes from the self (if drop is True)
        elif drop and isinstance(by, type(self)):
            by = list(by.columns)

        # Pick the Series or DataFrame flavour of the default-to-pandas groupby.
        defaulter = SeriesGroupByDefault if series_groupby else GroupByDefault
        return defaulter.register(defaulter.get_aggregation_method(how))(
            self,
            by=by,
            agg_func=agg_func,
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
        )

    @doc_utils.doc_groupby_method(
        action="compute the mean value", result="mean value", refer_to="mean"
    )
    def groupby_mean(
        self,
        by,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
    ):
        # Thin wrapper: delegates to `groupby_agg` with the "mean" aggregation.
        return self.groupby_agg(
            by=by,
            agg_func="mean",
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
        )

    @doc_utils.doc_groupby_method(
        action="compute unbiased skew", result="unbiased skew", refer_to="skew"
    )
    def groupby_skew(
        self,
        by,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
    ):
        if axis == 1:
            # To avoid `ValueError: Operation skew does not support axis=1` due to the
            # difference in the behavior of `groupby(...).skew(axis=1)` and
            # `groupby(...).agg("skew", axis=1)`.
return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.skew)( self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) return self.groupby_agg( by=by, agg_func="skew", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute cumulative count", result="count of all the previous values", refer_to="cumcount", ) def groupby_cumcount( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cumcount", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute cumulative sum", result="sum of all the previous values", refer_to="cumsum", ) def groupby_cumsum( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cumsum", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get cumulative maximum", result="maximum of all the previous values", refer_to="cummax", ) def groupby_cummax( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cummax", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get cumulative minimum", result="minimum of all the previous values", refer_to="cummin", ) def groupby_cummin( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cummin", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get cumulative production", result="production of all the previous values", refer_to="cumprod", ) def groupby_cumprod( self, by, axis, groupby_kwargs, 
agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cumprod", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute standard deviation", result="standard deviation", refer_to="std" ) def groupby_std( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="std", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute standard error", result="standard error", refer_to="sem" ) def groupby_sem( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="sem", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute numerical rank", result="numerical rank", refer_to="rank" ) def groupby_rank( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="rank", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute variance", result="variance", refer_to="var" ) def groupby_var( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="var", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute correlation", result="correlation", refer_to="corr" ) def groupby_corr( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="corr", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute covariance", result="covariance", refer_to="cov" ) def groupby_cov( self, by, axis, 
groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="cov", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the number of unique values", result="number of unique values", refer_to="nunique", ) def groupby_nunique( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="nunique", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get the median value", result="median value", refer_to="median" ) def groupby_median( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="median", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="compute specified quantile", result="quantile value", refer_to="quantile", ) def groupby_quantile( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="quantile", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="fill NaN values", result="`fill_value` if it was NaN, original value otherwise", refer_to="fillna", ) def groupby_fillna( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="fillna", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) def groupby_diff( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False ): # noqa: GL08 return self.groupby_agg( by=by, agg_func="diff", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) def groupby_pct_change( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False ): # noqa: GL08 return 
self.groupby_agg( by=by, agg_func="pct_change", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get data types", result="data type", refer_to="dtypes" ) def groupby_dtypes( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="dtypes", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="construct DataFrame from group with provided name", result="DataFrame for given group", refer_to="get_group", ) def groupby_get_group( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="get_group", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="shift data with the specified settings", result="shifted value", refer_to="shift", ) def groupby_shift( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="shift", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get first value in group", result="first value", refer_to="first", ) def groupby_first( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="first", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get last value in group", result="last value", refer_to="last", ) def groupby_last( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="last", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get first n values of a group", result="first n 
values of a group", refer_to="head", ) def groupby_head( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="head", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get last n values in group", result="last n values", refer_to="tail", ) def groupby_tail( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="tail", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get nth value in group", result="nth value", refer_to="nth", ) def groupby_nth( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="nth", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get group number of each value", result="group number of each value", refer_to="ngroup", ) def groupby_ngroup( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="ngroup", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) @doc_utils.doc_groupby_method( action="get n largest values in group", result="n largest values", refer_to="nlargest", ) def groupby_nlargest( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="nlargest", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, series_groupby=True, ) @doc_utils.doc_groupby_method( action="get n nsmallest values in group", result="n nsmallest values", refer_to="nsmallest", ) def groupby_nsmallest( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, agg_func="nsmallest", axis=axis, 
groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
            series_groupby=True,
        )

    @doc_utils.doc_groupby_method(
        action="get unique values in group",
        result="unique values",
        refer_to="unique",
    )
    def groupby_unique(
        self,
        by,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        drop=False,
    ):
        # Always routed through the Series flavour (`series_groupby=True`).
        return self.groupby_agg(
            by=by,
            agg_func="unique",
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=drop,
            series_groupby=True,
        )

    def groupby_ohlc(
        self,
        by,
        axis,
        groupby_kwargs,
        agg_args,
        agg_kwargs,
        is_df,
    ):  # noqa: GL08
        # Unlike its siblings this method takes `is_df` instead of `drop`.
        # Series-like input goes through `groupby_agg` with `series_groupby=True`
        # (and therefore its default `drop=False`).
        if not is_df:
            return self.groupby_agg(
                by=by,
                agg_func="ohlc",
                axis=axis,
                groupby_kwargs=groupby_kwargs,
                agg_args=agg_args,
                agg_kwargs=agg_kwargs,
                series_groupby=True,
            )
        # DataFrame input defaults to `DataFrameGroupBy.ohlc` with `drop` hard-coded to True.
        return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.ohlc)(
            self,
            by=by,
            axis=axis,
            groupby_kwargs=groupby_kwargs,
            agg_args=agg_args,
            agg_kwargs=agg_kwargs,
            drop=True,
        )

    # END Manual Partitioning methods

    @doc_utils.add_refer_to("DataFrame.unstack")
    def unstack(self, level, fill_value):
        """
        Pivot a level of the (necessarily hierarchical) index labels.

        Parameters
        ----------
        level : int or label
        fill_value : scalar or dict

        Returns
        -------
        BaseQueryCompiler
        """
        return DataFrameDefault.register(pandas.DataFrame.unstack)(
            self, level=level, fill_value=fill_value
        )

    @doc_utils.add_refer_to("wide_to_long")
    def wide_to_long(self, **kwargs):  # noqa: PR01
        """
        Unpivot a DataFrame from wide to long format.

        Returns
        -------
        BaseQueryCompiler
        """
        # `pandas.wide_to_long` is a module-level function, not a DataFrame method;
        # all keyword arguments are forwarded as given.
        return DataFrameDefault.register(pandas.wide_to_long)(self, **kwargs)

    @doc_utils.add_refer_to("DataFrame.pivot")
    def pivot(self, index, columns, values):
        """
        Produce pivot table based on column values.

        Parameters
        ----------
        index : label or list of such, pandas.Index, optional
        columns : label or list of such
        values : label or list of such, optional

        Returns
        -------
        BaseQueryCompiler
            New QueryCompiler containing pivot table.
        """
        return DataFrameDefault.register(pandas.DataFrame.pivot)(
            self, index=index, columns=columns, values=values
        )

    @doc_utils.add_refer_to("DataFrame.pivot_table")
    def pivot_table(
        self,
        index,
        values,
        columns,
        aggfunc,
        fill_value,
        margins,
        dropna,
        margins_name,
        observed,
        sort,
    ):
        """
        Create a spreadsheet-style pivot table from underlying data.

        Parameters
        ----------
        index : label, pandas.Grouper, array or list of such
        values : label, optional
        columns : column, pandas.Grouper, array or list of such
        aggfunc : callable(pandas.Series) -> scalar, dict of list of such
        fill_value : scalar, optional
        margins : bool
        dropna : bool
        margins_name : str
        observed : bool
        sort : bool

        Returns
        -------
        BaseQueryCompiler
        """
        # Every parameter is forwarded by keyword to `pandas.DataFrame.pivot_table`.
        return DataFrameDefault.register(pandas.DataFrame.pivot_table)(
            self,
            index=index,
            values=values,
            columns=columns,
            aggfunc=aggfunc,
            fill_value=fill_value,
            margins=margins,
            dropna=dropna,
            margins_name=margins_name,
            observed=observed,
            sort=sort,
        )

    @doc_utils.add_refer_to("get_dummies")
    def get_dummies(self, columns, **kwargs):  # noqa: PR02
        """
        Convert categorical variables to dummy variables for certain columns.

        Parameters
        ----------
        columns : label or list of such
            Columns to convert.
        prefix : str or list of such
        prefix_sep : str
        dummy_na : bool
        drop_first : bool
        dtype : dtype
        **kwargs : dict
            Serves the compatibility purpose. Does not affect the result.

        Returns
        -------
        BaseQueryCompiler
            New QueryCompiler with categorical variables converted to dummy.
        """

        # Local adapter: `pandas.get_dummies` is a module-level function that takes
        # the frame as its first argument. It shadows the method name in this scope.
        def get_dummies(df, columns, **kwargs):
            return pandas.get_dummies(df, columns=columns, **kwargs)

        return DataFrameDefault.register(get_dummies)(self, columns=columns, **kwargs)

    @doc_utils.add_one_column_warning
    @doc_utils.add_refer_to("Series.repeat")
    def repeat(self, repeats):
        """
        Repeat each element of one-column QueryCompiler given number of times.

        Parameters
        ----------
        repeats : int or array of ints
            The number of repetitions for each element. This should be a
            non-negative integer.
Repeating 0 times will return an empty QueryCompiler.

        Returns
        -------
        BaseQueryCompiler
            New QueryCompiler with repeated elements.
        """
        return SeriesDefault.register(pandas.Series.repeat)(self, repeats=repeats)

    @doc_utils.add_refer_to("cut")
    def cut(
        self,
        bins,
        **kwargs,
    ):
        """
        Bin values into discrete intervals.

        Parameters
        ----------
        bins : int, array of ints, or IntervalIndex
            The criteria to bin by.
        **kwargs : dict
            The keyword arguments to pass through.

        Returns
        -------
        BaseQueryCompiler or np.ndarray or list[np.ndarray]
            Returns the result of pd.cut.
        """

        def squeeze_and_cut(df, *args, **kwargs):
            # We need this function to ensure we squeeze our internal
            # representation (a dataframe) to a Series.
            series = df.squeeze(axis=1)
            return pandas.cut(series, *args, **kwargs)

        # We use `default_to_pandas` here since the type and number of
        # results can change depending on the input arguments.
        return self.default_to_pandas(squeeze_and_cut, bins, **kwargs)

    # Indexing

    # Axis labels are exposed as properties built from the `_get_axis` /
    # `_set_axis` factories (0 is for index, 1 is for columns).
    index = property(_get_axis(0), _set_axis(0))
    columns = property(_get_axis(1), _set_axis(1))

    def get_axis(self, axis):
        """
        Return index labels of the specified axis.

        Parameters
        ----------
        axis : {0, 1}
            Axis to return labels on.
            0 is for index, when 1 is for columns.

        Returns
        -------
        pandas.Index
        """
        # Any value other than 0 selects the columns.
        return self.index if axis == 0 else self.columns

    def get_axis_len(self, axis: Literal[0, 1]) -> int:
        """
        Return the length of the specified axis.

        A query compiler may choose to override this method if it has a more
        efficient way of computing the length of an axis without materializing it.

        Parameters
        ----------
        axis : {0, 1}
            Axis to return labels on.

        Returns
        -------
        int
        """
        return len(self.get_axis(axis))

    def take_2d_labels(
        self,
        index,
        columns,
    ):
        """
        Take the given labels.

        Parameters
        ----------
        index : slice, scalar, list-like, or BaseQueryCompiler
            Labels of rows to grab.
        columns : slice, scalar, list-like, or BaseQueryCompiler
            Labels of columns to grab.

        Returns
        -------
        BaseQueryCompiler
            Subset of this QueryCompiler.
        """
        row_lookup, col_lookup = self.get_positions_from_labels(index, columns)
        # A slice coming back from the lookup must be the full-axis `slice(None)`;
        # it is replaced by None before the positional take.
        if isinstance(row_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=row_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}",
            )
            row_lookup = None
        if isinstance(col_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=col_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}",
            )
            col_lookup = None
        return self.take_2d_positional(row_lookup, col_lookup)

    def get_positions_from_labels(self, row_loc, col_loc):
        """
        Compute index and column positions from their respective locators.

        Inputs to this method are arguments that the pandas user could pass to loc.
        This function will compute the corresponding index and column positions
        that the user could equivalently pass to iloc.

        Parameters
        ----------
        row_loc : scalar, slice, list, array or tuple
            Row locator.
        col_loc : scalar, slice, list, array or tuple
            Columns locator.

        Returns
        -------
        row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
            List of index labels.
        col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
            List of columns labels.

        Raises
        ------
        KeyError
            If a plain (non-slice, non-boolean, non-MultiIndex) locator
            contains labels that are missing from the axis.

        Notes
        -----
        The two lookups are returned as a two-element list ``[row_lookup, col_lookup]``.

        Usage of `slice(None)` as a resulting lookup is a hack to pass information about
        full-axis grab without computing actual indices that triggers lazy computations.
        Ideally, this API should get rid of using slices as indexers and either use a
        common ``Indexer`` object or range and ``np.ndarray`` only.
        """
        # Imported locally rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from modin.pandas.indexing import (
            boolean_mask_to_numeric,
            is_boolean_array,
            is_list_like,
            is_range_like,
        )

        lookups = []
        # axis 0 handles `row_loc`, axis 1 handles `col_loc`.
        for axis, axis_loc in enumerate((row_loc, col_loc)):
            # A scalar label is wrapped into a one-element array so it follows
            # the same path as list-like locators.
            if is_scalar(axis_loc):
                axis_loc = np.array([axis_loc])
            if isinstance(axis_loc, pandas.RangeIndex):
                # A RangeIndex locator is used as the lookup without translation.
                axis_lookup = axis_loc
            elif isinstance(axis_loc, slice) or is_range_like(axis_loc):
                if isinstance(axis_loc, slice) and axis_loc == slice(None):
                    # Full-axis grab: keep the slice and never touch the axis labels.
                    axis_lookup = axis_loc
                else:
                    axis_labels = self.get_axis(axis)
                    # `slice_indexer` returns a fully-defined numeric slice for a non-fully-defined labels-based slice
                    # RangeIndex and range use a semi-open interval, while
                    # slice_indexer uses a closed interval. Subtract 1 step from the
                    # end of the interval to get the equivalent closed interval.
                    if axis_loc.stop is None or not is_number(axis_loc.stop):
                        slice_stop = axis_loc.stop
                    else:
                        slice_stop = axis_loc.stop - (
                            0 if axis_loc.step is None else axis_loc.step
                        )
                    axis_lookup = axis_labels.slice_indexer(
                        axis_loc.start,
                        slice_stop,
                        axis_loc.step,
                    )
                    # Converting negative indices to their actual positions:
                    axis_lookup = pandas.RangeIndex(
                        start=(
                            axis_lookup.start
                            if axis_lookup.start >= 0
                            else axis_lookup.start + len(axis_labels)
                        ),
                        stop=(
                            axis_lookup.stop
                            if axis_lookup.stop >= 0
                            else axis_lookup.stop + len(axis_labels)
                        ),
                        step=axis_lookup.step,
                    )
            elif self.has_multiindex(axis):
                # `Index.get_locs` raises an IndexError by itself if missing labels were provided,
                # we don't have to do missing-check for the received `axis_lookup`.
                if isinstance(axis_loc, pandas.MultiIndex):
                    axis_lookup = self.get_axis(axis).get_indexer_for(axis_loc)
                else:
                    axis_lookup = self.get_axis(axis).get_locs(axis_loc)
            elif is_boolean_array(axis_loc):
                axis_lookup = boolean_mask_to_numeric(axis_loc)
            else:
                axis_labels = self.get_axis(axis)
                if is_list_like(axis_loc) and not isinstance(
                    axis_loc, (np.ndarray, pandas.Index)
                ):
                    # `Index.get_indexer_for` works much faster with numpy arrays than with python lists,
                    # so although we lose some time here on converting to numpy, `Index.get_indexer_for`
                    # speedup covers the loss that we gain here.
                    axis_loc = np.array(axis_loc, dtype=axis_labels.dtype)
                axis_lookup = axis_labels.get_indexer_for(axis_loc)
                # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether
                # there are any -1 in the received indexer to raise a KeyError here.
                missing_mask = axis_lookup == -1
                if missing_mask.any():
                    missing_labels = (
                        axis_loc[missing_mask]
                        if is_list_like(axis_loc)
                        # If `axis_loc` is not a list-like then we can't select certain
                        # labels that are missing and so printing the whole indexer
                        else axis_loc
                    )
                    raise KeyError(missing_labels)

            # Non-range pandas indexes are unwrapped to their underlying values.
            if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
                axis_lookup = axis_lookup.values

            lookups.append(axis_lookup)
        return lookups

    def take_2d_positional(self, index=None, columns=None):
        """
        Index QueryCompiler with passed keys.

        Parameters
        ----------
        index : list-like of ints, optional
            Positional indices of rows to grab.
        columns : list-like of ints, optional
            Positional indices of columns to grab.

        Returns
        -------
        BaseQueryCompiler
            New masked QueryCompiler.
        """
        # A missing indexer means "take the whole axis".
        index = slice(None) if index is None else index
        columns = slice(None) if columns is None else columns

        # The indexers are captured by closure rather than passed as arguments.
        def applyer(df):
            return df.iloc[index, columns]

        return DataFrameDefault.register(applyer)(self)

    def insert_item(self, axis, loc, value, how="inner", replace=False):
        """
        Insert rows/columns defined by `value` at the specified position.

        If frames are not aligned along specified axis, perform frames alignment first.

        Parameters
        ----------
        axis : {0, 1}
            Axis to insert along. 0 means insert rows, when 1 means insert columns.
        loc : int
            Position to insert `value`. A position outside ``[0, len(axis))``
            appends `value` at the end.
        value : BaseQueryCompiler
            Rows/columns to insert.
        how : {"inner", "outer", "left", "right"}, default: "inner"
            Type of join that will be used if frames are not aligned.
        replace : bool, default: False
            Whether to insert item after column/row at `loc-th` position or to replace
            it by `value`.

        Returns
        -------
        BaseQueryCompiler
            New QueryCompiler with inserted values.
        """
        assert isinstance(value, type(self))

        def mask(idx):
            # Selecting every position is a no-op, so `self` is reused as is.
            if len(idx) == len(self.get_axis(axis)):
                return self
            return (
                self.getitem_column_array(idx, numeric=True)
                if axis
                else self.getitem_row_array(idx)
            )

        if 0 <= loc < len(self.get_axis(axis)):
            # Split `self` around `loc`, then concatenate: head + value + tail.
            first_mask = mask(list(range(loc)))
            # When replacing, the item currently at `loc` is left out of the tail.
            second_mask_loc = loc + 1 if replace else loc
            second_mask = mask(list(range(second_mask_loc, len(self.get_axis(axis)))))
            return first_mask.concat(axis, [value, second_mask], join=how, sort=False)
        else:
            return self.concat(axis, [value], join=how, sort=False)

    def setitem(self, axis, key, value):
        """
        Set the row/column defined by `key` to the `value` provided.

        Parameters
        ----------
        axis : {0, 1}
            Axis to set `value` along. 0 means set row, 1 means set column.
        key : label
            Row/column label to set `value` in.
        value : BaseQueryCompiler, list-like or scalar
            Define new row/column value.

        Returns
        -------
        BaseQueryCompiler
            New QueryCompiler with updated `key` value.
        """

        # Local helper; it shadows the method name inside this scope.
        def setitem(df, axis, key, value):
            # A DataFrame value assigned under a scalar key is squeezed first.
            if is_scalar(key) and isinstance(value, pandas.DataFrame):
                value = value.squeeze()
            # NOTE(review): the docstring says axis 0 sets a row, yet a falsy axis
            # takes the `df[key] = value` branch and a truthy axis takes
            # `df.loc[key] = value` - confirm which reading callers rely on.
            if not axis:
                df[key] = value
            else:
                df.loc[key] = value
            return df

        return DataFrameDefault.register(setitem)(self, axis=axis, key=key, value=value)

    def write_items(
        self, row_numeric_index, col_numeric_index, item, need_columns_reindex=True
    ):
        """
        Update QueryCompiler elements at the specified positions by passed values.
In contrast to ``setitem`` this method allows to do 2D assignments. Parameters ---------- row_numeric_index : list of ints Row positions to write value. col_numeric_index : list of ints Column positions to write value. item : Any Values to write. If not a scalar will be broadcasted according to `row_numeric_index` and `col_numeric_index`. need_columns_reindex : bool, default: True In the case of assigning columns to a dataframe (broadcasting is part of the flow), reindexing is not needed. Returns ------- BaseQueryCompiler New QueryCompiler with updated values. """ # We have to keep this import away from the module level to avoid circular import from modin.pandas.utils import broadcast_item, is_scalar if not isinstance(row_numeric_index, slice): row_numeric_index = list(row_numeric_index) if not isinstance(col_numeric_index, slice): col_numeric_index = list(col_numeric_index) def write_items(df, broadcasted_items): if isinstance(df.iloc[row_numeric_index, col_numeric_index], pandas.Series): broadcasted_items = broadcasted_items.squeeze() df.iloc[row_numeric_index, col_numeric_index] = broadcasted_items return df if not is_scalar(item): broadcasted_item, _, _, _ = broadcast_item( self, row_numeric_index, col_numeric_index, item, need_columns_reindex=need_columns_reindex, sort_lookups_and_item=False, ) else: broadcasted_item = item return DataFrameDefault.register(write_items)( self, broadcasted_items=broadcasted_item ) # END Abstract methods for QueryCompiler @cached_property def __constructor__(self) -> type[Self]: """ Get query compiler constructor. By default, constructor method will invoke an init. Returns ------- callable """ return type(self) # __delitem__ # This will change the shape of the resulting data. def delitem(self, key): """ Drop `key` column. Parameters ---------- key : label Column name to drop. Returns ------- BaseQueryCompiler New QueryCompiler without `key` column. 
""" return self.drop(columns=[key]) # END __delitem__ def has_multiindex(self, axis=0): """ Check if specified axis is indexed by MultiIndex. Parameters ---------- axis : {0, 1}, default: 0 The axis to check (0 - index, 1 - columns). Returns ------- bool True if index at specified axis is MultiIndex and False otherwise. """ if axis == 0: return isinstance(self.index, pandas.MultiIndex) assert axis == 1 return isinstance(self.columns, pandas.MultiIndex) @property def frame_has_materialized_dtypes(self) -> bool: """ Check if the underlying dataframe has materialized dtypes. Returns ------- bool """ return self._modin_frame.has_materialized_dtypes @property def frame_has_materialized_columns(self) -> bool: """ Check if the underlying dataframe has materialized columns. Returns ------- bool """ return self._modin_frame.has_materialized_columns @property def frame_has_materialized_index(self) -> bool: """ Check if the underlying dataframe has materialized index. Returns ------- bool """ return self._modin_frame.has_materialized_index def set_frame_dtypes_cache(self, dtypes): """ Set dtypes cache for the underlying dataframe frame. Parameters ---------- dtypes : pandas.Series, ModinDtypes, callable or None """ self._modin_frame.set_dtypes_cache(dtypes) def set_frame_index_cache(self, index): """ Set index cache for underlying dataframe. Parameters ---------- index : sequence, callable or None """ self._modin_frame.set_index_cache(index) def set_frame_columns_cache(self, index): """ Set columns cache for underlying dataframe. Parameters ---------- index : sequence, callable or None """ self._modin_frame.set_columns_cache(index) @property def frame_has_index_cache(self): """ Check if the index cache exists for underlying dataframe. Returns ------- bool """ return self._modin_frame.has_index_cache @property def frame_has_columns_cache(self): """ Check if the columns cache exists for underlying dataframe. 
Returns ------- bool """ return self._modin_frame.has_columns_cache @property def frame_has_dtypes_cache(self) -> bool: """ Check if the dtypes cache exists for the underlying dataframe. Returns ------- bool """ return self._modin_frame.has_dtypes_cache def get_index_name(self, axis=0): """ Get index name of specified axis. Parameters ---------- axis : {0, 1}, default: 0 Axis to get index name on. Returns ------- hashable Index name, None for MultiIndex. """ return self.get_axis(axis).name def set_index_name(self, name, axis=0): """ Set index name for the specified axis. Parameters ---------- name : hashable New index name. axis : {0, 1}, default: 0 Axis to set name along. """ self.get_axis(axis).name = name def get_index_names(self, axis=0): """ Get index names of specified axis. Parameters ---------- axis : {0, 1}, default: 0 Axis to get index names on. Returns ------- list Index names. """ return self.get_axis(axis).names def set_index_names(self, names, axis=0): """ Set index names for the specified axis. Parameters ---------- names : list New index names. axis : {0, 1}, default: 0 Axis to set names along. """ self.get_axis(axis).names = names def get_dtypes_set(self): """ Get a set of dtypes that are in this query compiler. Returns ------- set """ return set(self.dtypes.values) # DateTime methods def between_time(self, **kwargs): # noqa: PR01 """ Select values between particular times of the day (e.g., 9:00-9:30 AM). By setting start_time to be later than end_time, you can get the times that are not between the two times. Returns ------- BaseQueryCompiler """ return DataFrameDefault.register(pandas.DataFrame.between_time)(self, **kwargs) def shift( self, periods, freq, axis, fill_value, ): # noqa: GL08 return DataFrameDefault.register(pandas.DataFrame.shift)( self, periods, freq, axis, fill_value ) def tz_convert( self, tz, axis=0, level=None, copy=True, ): """ Convert tz-aware axis to target time zone. 
Parameters ---------- tz : str or tzinfo object or None Target time zone. Passing None will convert to UTC and remove the timezone information. axis : int, default: 0 The axis to localize. level : int, str, default: None If axis is a MultiIndex, convert a specific level. Otherwise must be None. copy : bool, default: True Also make a copy of the underlying data. Returns ------- BaseQueryCompiler A new query compiler with the converted axis. """ if level is not None: new_labels = ( pandas.Series(index=self.get_axis(axis)) .tz_convert(tz, level=level) .index ) else: new_labels = self.get_axis(axis).tz_convert(tz) obj = self.copy() if copy else self if axis == 0: obj.index = new_labels else: obj.columns = new_labels return obj def tz_localize( self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" ): """ Localize tz-naive index of a Series or DataFrame to target time zone. Parameters ---------- tz : tzstr or tzinfo or None Time zone to localize. Passing None will remove the time zone information and preserve local time. axis : int, default: 0 The axis to localize. level : int, str, default: None If axis is a MultiIndex, localize a specific level. Otherwise must be None. copy : bool, default: True Also make a copy of the underlying data. ambiguous : str, bool-ndarray, NaT, default: "raise" Behaviour on ambiguous times. nonexistent : str, default: "raise" What to do with nonexistent times. Returns ------- BaseQueryCompiler A new query compiler with the localized axis. 
""" new_labels = ( pandas.Series(index=self.get_axis(axis)) .tz_localize( tz, axis=axis, level=level, copy=False, ambiguous=ambiguous, nonexistent=nonexistent, ) .index ) obj = self.copy() if copy else self if axis == 0: obj.index = new_labels else: obj.columns = new_labels return obj @doc_utils.doc_dt_round(refer_to="ceil") def dt_ceil(self, freq, ambiguous="raise", nonexistent="raise"): return DateTimeDefault.register(pandas.Series.dt.ceil)( self, freq, ambiguous, nonexistent ) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.components") def dt_components(self): """ Spread each date-time value into its components (days, hours, minutes...). Returns ------- BaseQueryCompiler """ return DateTimeDefault.register(pandas.Series.dt.components)(self) @doc_utils.doc_dt_timestamp( prop="the date without timezone information", refer_to="date" ) def dt_date(self): return DateTimeDefault.register(pandas.Series.dt.date)(self) @doc_utils.doc_dt_timestamp(prop="day component", refer_to="day") def dt_day(self): return DateTimeDefault.register(pandas.Series.dt.day)(self) @doc_utils.doc_dt_timestamp( prop="day name", refer_to="day_name", params="locale : str, optional" ) def dt_day_name(self, locale=None): return DateTimeDefault.register(pandas.Series.dt.day_name)(self, locale) @doc_utils.doc_dt_timestamp(prop="integer day of week", refer_to="dayofweek") # FIXME: `dt_dayofweek` is an alias for `dt_weekday`, one of them should # be removed (Modin issue #3107). 
def dt_dayofweek(self): return DateTimeDefault.register(pandas.Series.dt.dayofweek)(self) @doc_utils.doc_dt_timestamp(prop="day of year", refer_to="dayofyear") def dt_dayofyear(self): return DateTimeDefault.register(pandas.Series.dt.dayofyear)(self) @doc_utils.doc_dt_interval(prop="days", refer_to="days") def dt_days(self): return DateTimeDefault.register(pandas.Series.dt.days)(self) @doc_utils.doc_dt_timestamp( prop="number of days in month", refer_to="days_in_month" ) # FIXME: `dt_days_in_month` is an alias for `dt_daysinmonth`, one of them should # be removed (Modin issue #3107). def dt_days_in_month(self): return DateTimeDefault.register(pandas.Series.dt.days_in_month)(self) @doc_utils.doc_dt_timestamp(prop="number of days in month", refer_to="daysinmonth") def dt_daysinmonth(self): return DateTimeDefault.register(pandas.Series.dt.daysinmonth)(self) @doc_utils.doc_dt_period(prop="the timestamp of end time", refer_to="end_time") def dt_end_time(self): return DateTimeDefault.register(pandas.Series.dt.end_time)(self) @doc_utils.doc_dt_round(refer_to="floor") def dt_floor(self, freq, ambiguous="raise", nonexistent="raise"): return DateTimeDefault.register(pandas.Series.dt.floor)( self, freq, ambiguous, nonexistent ) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.freq") def dt_freq(self): """ Get the time frequency of the underlying time-series data. Returns ------- BaseQueryCompiler QueryCompiler containing a single value, the frequency of the data. 
""" return DateTimeDefault.register(pandas.Series.dt.freq)(self) @doc_utils.add_refer_to("Series.dt.unit") def dt_unit(self): # noqa: RT01 return DateTimeDefault.register(pandas.Series.dt.unit)(self) @doc_utils.add_refer_to("Series.dt.as_unit") def dt_as_unit(self, *args, **kwargs): # noqa: PR01, RT01 return DateTimeDefault.register(pandas.Series.dt.as_unit)(self, *args, **kwargs) @doc_utils.doc_dt_timestamp( prop="Calculate year, week, and day according to the ISO 8601 standard.", refer_to="isocalendar", ) def dt_isocalendar(self): return DateTimeDefault.register(pandas.Series.dt.isocalendar)(self) @doc_utils.doc_dt_timestamp(prop="hour", refer_to="hour") def dt_hour(self): return DateTimeDefault.register(pandas.Series.dt.hour)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether corresponding year is leap", refer_to="is_leap_year", ) def dt_is_leap_year(self): return DateTimeDefault.register(pandas.Series.dt.is_leap_year)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the last day of the month", refer_to="is_month_end", ) def dt_is_month_end(self): return DateTimeDefault.register(pandas.Series.dt.is_month_end)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the first day of the month", refer_to="is_month_start", ) def dt_is_month_start(self): return DateTimeDefault.register(pandas.Series.dt.is_month_start)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the last day of the quarter", refer_to="is_quarter_end", ) def dt_is_quarter_end(self): return DateTimeDefault.register(pandas.Series.dt.is_quarter_end)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the first day of the quarter", refer_to="is_quarter_start", ) def dt_is_quarter_start(self): return DateTimeDefault.register(pandas.Series.dt.is_quarter_start)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the last day of the year", refer_to="is_year_end", ) def 
dt_is_year_end(self): return DateTimeDefault.register(pandas.Series.dt.is_year_end)(self) @doc_utils.doc_dt_timestamp( prop="the boolean of whether the date is the first day of the year", refer_to="is_year_start", ) def dt_is_year_start(self): return DateTimeDefault.register(pandas.Series.dt.is_year_start)(self) @doc_utils.doc_dt_timestamp(prop="microseconds component", refer_to="microsecond") def dt_microsecond(self): return DateTimeDefault.register(pandas.Series.dt.microsecond)(self) @doc_utils.doc_dt_interval(prop="microseconds component", refer_to="microseconds") def dt_microseconds(self): return DateTimeDefault.register(pandas.Series.dt.microseconds)(self) @doc_utils.doc_dt_timestamp(prop="minute component", refer_to="minute") def dt_minute(self): return DateTimeDefault.register(pandas.Series.dt.minute)(self) @doc_utils.doc_dt_timestamp(prop="month component", refer_to="month") def dt_month(self): return DateTimeDefault.register(pandas.Series.dt.month)(self) @doc_utils.doc_dt_timestamp( prop="the month name", refer_to="month name", params="locale : str, optional" ) def dt_month_name(self, locale=None): return DateTimeDefault.register(pandas.Series.dt.month_name)(self, locale) @doc_utils.doc_dt_timestamp(prop="nanoseconds component", refer_to="nanosecond") def dt_nanosecond(self): return DateTimeDefault.register(pandas.Series.dt.nanosecond)(self) @doc_utils.doc_dt_interval(prop="nanoseconds component", refer_to="nanoseconds") def dt_nanoseconds(self): return DateTimeDefault.register(pandas.Series.dt.nanoseconds)(self) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.normalize") def dt_normalize(self): """ Set the time component of each date-time value to midnight. Returns ------- BaseQueryCompiler New QueryCompiler containing date-time values with midnight time. 
""" return DateTimeDefault.register(pandas.Series.dt.normalize)(self) @doc_utils.doc_dt_timestamp(prop="quarter component", refer_to="quarter") def dt_quarter(self): return DateTimeDefault.register(pandas.Series.dt.quarter)(self) @doc_utils.doc_dt_period(prop="the fiscal year", refer_to="qyear") def dt_qyear(self): return DateTimeDefault.register(pandas.Series.dt.qyear)(self) @doc_utils.doc_dt_round(refer_to="round") def dt_round(self, freq, ambiguous="raise", nonexistent="raise"): return DateTimeDefault.register(pandas.Series.dt.round)( self, freq, ambiguous, nonexistent ) @doc_utils.doc_dt_timestamp(prop="seconds component", refer_to="second") def dt_second(self): return DateTimeDefault.register(pandas.Series.dt.second)(self) @doc_utils.doc_dt_interval(prop="seconds component", refer_to="seconds") def dt_seconds(self): return DateTimeDefault.register(pandas.Series.dt.seconds)(self) @doc_utils.doc_dt_period(prop="the timestamp of start time", refer_to="start_time") def dt_start_time(self): return DateTimeDefault.register(pandas.Series.dt.start_time)(self) @doc_utils.add_refer_to("Series.dt.strftime") def dt_strftime(self, date_format): """ Format underlying date-time data using specified format. Parameters ---------- date_format : str Returns ------- BaseQueryCompiler New QueryCompiler containing formatted date-time values. """ return DateTimeDefault.register(pandas.Series.dt.strftime)(self, date_format) @doc_utils.doc_dt_timestamp(prop="time component", refer_to="time") def dt_time(self): return DateTimeDefault.register(pandas.Series.dt.time)(self) @doc_utils.doc_dt_timestamp( prop="time component with timezone information", refer_to="timetz" ) def dt_timetz(self): return DateTimeDefault.register(pandas.Series.dt.timetz)(self) @doc_utils.add_refer_to("Series.dt.asfreq") def dt_asfreq(self, freq=None, how: str = "E"): """ Convert the PeriodArray to the specified frequency `freq`. 
Equivalent to applying pandas.Period.asfreq() with the given arguments to each Period in this PeriodArray. Parameters ---------- freq : str, optional A frequency. how : str {'E', 'S'}, default: 'E' Whether the elements should be aligned to the end or start within pa period. * 'E', "END", or "FINISH" for end, * 'S', "START", or "BEGIN" for start. January 31st ("END") vs. January 1st ("START") for example. Returns ------- BaseQueryCompiler New QueryCompiler containing period data. """ return DateTimeDefault.register(pandas.Series.dt.asfreq)(self, freq, how) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.to_period") def dt_to_period(self, freq=None): """ Convert underlying data to the period at a particular frequency. Parameters ---------- freq : str, optional Returns ------- BaseQueryCompiler New QueryCompiler containing period data. """ return DateTimeDefault.register(pandas.Series.dt.to_period)(self, freq) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.to_pydatetime") def dt_to_pydatetime(self): """ Convert underlying data to array of python native ``datetime``. Returns ------- BaseQueryCompiler New QueryCompiler containing 1D array of ``datetime`` objects. """ return DateTimeDefault.register(pandas.Series.dt.to_pydatetime)(self) # FIXME: there are no references to this method, we should either remove it # or add a call reference at the DataFrame level (Modin issue #3103). @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.to_pytimedelta") def dt_to_pytimedelta(self): """ Convert underlying data to array of python native ``datetime.timedelta``. Returns ------- BaseQueryCompiler New QueryCompiler containing 1D array of ``datetime.timedelta``. 
""" return DateTimeDefault.register(pandas.Series.dt.to_pytimedelta)(self) @doc_utils.doc_dt_period( prop="the timestamp representation", refer_to="to_timestamp" ) def dt_to_timestamp(self): return DateTimeDefault.register(pandas.Series.dt.to_timestamp)(self) @doc_utils.doc_dt_interval(prop="duration in seconds", refer_to="total_seconds") def dt_total_seconds(self): return DateTimeDefault.register(pandas.Series.dt.total_seconds)(self) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.tz") def dt_tz(self): """ Get the time-zone of the underlying time-series data. Returns ------- BaseQueryCompiler QueryCompiler containing a single value, time-zone of the data. """ return DateTimeDefault.register(pandas.Series.dt.tz)(self) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.tz_convert") def dt_tz_convert(self, tz): """ Convert time-series data to the specified time zone. Parameters ---------- tz : str, pytz.timezone Returns ------- BaseQueryCompiler New QueryCompiler containing values with converted time zone. """ return DateTimeDefault.register(pandas.Series.dt.tz_convert)(self, tz) @doc_utils.add_one_column_warning @doc_utils.add_refer_to("Series.dt.tz_localize") def dt_tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): """ Localize tz-naive to tz-aware. Parameters ---------- tz : str, pytz.timezone, optional ambiguous : {"raise", "inner", "NaT"} or bool mask, default: "raise" nonexistent : {"raise", "shift_forward", "shift_backward, "NaT"} or pandas.timedelta, default: "raise" Returns ------- BaseQueryCompiler New QueryCompiler containing values with localized time zone. 
""" return DateTimeDefault.register(pandas.Series.dt.tz_localize)( self, tz, ambiguous, nonexistent ) @doc_utils.doc_dt_timestamp(prop="integer day of week", refer_to="weekday") def dt_weekday(self): return DateTimeDefault.register(pandas.Series.dt.weekday)(self) @doc_utils.doc_dt_timestamp(prop="year component", refer_to="year") def dt_year(self): return DateTimeDefault.register(pandas.Series.dt.year)(self) # End of DateTime methods def first(self, offset: pandas.DateOffset): """ Select initial periods of time series data based on a date offset. When having a query compiler with dates as index, this function can select the first few rows based on a date offset. Parameters ---------- offset : pandas.DateOffset The offset length of the data to select. Returns ------- BaseQueryCompiler New compiler containing the selected data. """ return DataFrameDefault.register(pandas.DataFrame.first)(self, offset) def last(self, offset: pandas.DateOffset): """ Select final periods of time series data based on a date offset. For a query compiler with a sorted DatetimeIndex, this function selects the last few rows based on a date offset. Parameters ---------- offset : pandas.DateOffset The offset length of the data to select. Returns ------- BaseQueryCompiler New compiler containing the selected data. """ return DataFrameDefault.register(pandas.DataFrame.last)(self, offset) # Resample methods # FIXME: # 1. Query Compiler shouldn't care about differences between Series and DataFrame # so `resample_agg_df` and `resample_agg_ser` should be combined (Modin issue #3104). # 2. In DataFrame API `Resampler.aggregate` is an alias for `Resampler.apply` # we should remove one of these methods: `resample_agg_*` or `resample_app_*` (Modin issue #3107). 
@doc_utils.doc_resample_agg(
    action="apply passed aggregation function",
    params="func : str, dict, callable(pandas.Series) -> scalar, or list of such",
    output="function names",
    refer_to="agg",
)
def resample_agg_df(self, resample_kwargs, func, *args, **kwargs):
    # Wrap `Resampler.aggregate` with `ResampleDefault.register`, then invoke
    # the wrapper on this query compiler with the caller's arguments.
    aggregate_impl = ResampleDefault.register(
        pandas.core.resample.Resampler.aggregate
    )
    return aggregate_impl(self, resample_kwargs, func, *args, **kwargs)


@doc_utils.add_deprecation_warning(replacement_method="resample_agg_df")
@doc_utils.doc_resample_agg(
    action="apply passed aggregation function in a one-column query compiler",
    params="func : str, dict, callable(pandas.Series) -> scalar, or list of such",
    output="function names",
    refer_to="agg",
)
def resample_agg_ser(self, resample_kwargs, func, *args, **kwargs):
    # Same target as `resample_agg_df`, but registered with `squeeze_self=True`.
    squeezed_aggregate_impl = ResampleDefault.register(
        pandas.core.resample.Resampler.aggregate, squeeze_self=True
    )
    return squeezed_aggregate_impl(self, resample_kwargs, func, *args, **kwargs)


@doc_utils.add_deprecation_warning(replacement_method="resample_agg_df")
@doc_utils.doc_resample_agg(
    action="apply passed aggregation function",
    params="func : str, dict, callable(pandas.Series) -> scalar, or list of such",
    output="function names",
    refer_to="apply",
)
def resample_app_df(self, resample_kwargs, func, *args, **kwargs):
    # Wrap `Resampler.apply` and invoke the wrapper on this query compiler.
    apply_impl = ResampleDefault.register(pandas.core.resample.Resampler.apply)
    return apply_impl(self, resample_kwargs, func, *args, **kwargs)


@doc_utils.add_deprecation_warning(replacement_method="resample_agg_df")
@doc_utils.doc_resample_agg(
    action="apply passed aggregation function in a one-column query compiler",
    params="func : str, dict, callable(pandas.Series) -> scalar, or list of such",
    output="function names",
    refer_to="apply",
)
def resample_app_ser(self, resample_kwargs, func, *args, **kwargs):
    # Same target as `resample_app_df`, but registered with `squeeze_self=True`.
    squeezed_apply_impl = ResampleDefault.register(
        pandas.core.resample.Resampler.apply, squeeze_self=True
    )
    return squeezed_apply_impl(self, resample_kwargs, func, *args, **kwargs)


def resample_asfreq(self, resample_kwargs, fill_value):
    """
    Resample time-series data and take the values at the new frequency.

    The data is grouped into intervals along the time-series row/column
    using the specified frequency, and the values at the new frequency
    are returned.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature.
    fill_value : scalar
        Passed through to the registered ``Resampler.asfreq`` wrapper.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler containing values at the specified frequency.
    """
    asfreq_impl = ResampleDefault.register(pandas.core.resample.Resampler.asfreq)
    return asfreq_impl(self, resample_kwargs, fill_value)


@doc_utils.doc_resample_fillna(method="back-fill", refer_to="bfill")
def resample_bfill(self, resample_kwargs, limit):
    # Wrap `Resampler.bfill` and invoke the wrapper on this query compiler.
    bfill_impl = ResampleDefault.register(pandas.core.resample.Resampler.bfill)
    return bfill_impl(self, resample_kwargs, limit)


@doc_utils.doc_resample_reduce(
    result="number of non-NA values", refer_to="count", compatibility_params=False
)
def resample_count(self, resample_kwargs):
    # Wrap `Resampler.count`; only the resample parameters are forwarded.
    count_impl = ResampleDefault.register(pandas.core.resample.Resampler.count)
    return count_impl(self, resample_kwargs)


@doc_utils.doc_resample_fillna(method="forward-fill", refer_to="ffill")
def resample_ffill(self, resample_kwargs, limit):
    # Wrap `Resampler.ffill` and invoke the wrapper on this query compiler.
    ffill_impl = ResampleDefault.register(pandas.core.resample.Resampler.ffill)
    return ffill_impl(self, resample_kwargs, limit)


# FIXME: we should combine all resample fillna methods into `resample_fillna`
# (Modin issue #3107)
@doc_utils.doc_resample_fillna(
    method="specified", refer_to="fillna", params="method : str"
)
def resample_fillna(self, resample_kwargs, method, limit):
    # Wrap `Resampler.fillna`; `method` and `limit` are forwarded positionally.
    fillna_impl = ResampleDefault.register(pandas.core.resample.Resampler.fillna)
    return fillna_impl(self, resample_kwargs, method, limit)


@doc_utils.doc_resample_reduce(result="first element", refer_to="first")
def resample_first(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.first` and invoke the wrapper on this query compiler.
    first_impl = ResampleDefault.register(pandas.core.resample.Resampler.first)
    return first_impl(self, resample_kwargs, *args, **kwargs)


# FIXME: This function takes Modin DataFrame via `obj` parameter,
# we should avoid leaking of the high-level objects to the query compiler level.
# (Modin issue #3106)
def resample_get_group(self, resample_kwargs, name, obj):
    """
    Resample time-series data and return the specified group.

    The data is grouped into intervals along the time-series row/column
    using the specified frequency, and the values of the specified group
    are returned.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature.
    name : object
        Passed through to the registered ``Resampler.get_group`` wrapper.
    obj : modin.pandas.DataFrame, optional
        Passed through to the registered ``Resampler.get_group`` wrapper.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler containing the values from the specified group.
    """
    get_group_impl = ResampleDefault.register(
        pandas.core.resample.Resampler.get_group
    )
    return get_group_impl(self, resample_kwargs, name, obj)


# NOTE(review): the line breaks inside ``params`` below follow the numpydoc
# layout used by this file's docstrings -- confirm the exact whitespace of
# this literal against the upstream source.
@doc_utils.doc_resample_fillna(
    method="specified interpolation",
    refer_to="interpolate",
    params="""
    method : str
    axis : {0, 1}
    limit : int
    inplace : {False}
        This parameter serves the compatibility purpose. Always has to be False.
    limit_direction : {"forward", "backward", "both"}
    limit_area : {None, "inside", "outside"}
    downcast : str, optional
    **kwargs : dict
    """,
    overwrite_template_params=True,
)
def resample_interpolate(
    self,
    resample_kwargs,
    method,
    axis,
    limit,
    inplace,
    limit_direction,
    limit_area,
    downcast,
    **kwargs,
):
    # Wrap `Resampler.interpolate`; `method` is forwarded positionally and
    # every other option by keyword.
    interpolate_impl = ResampleDefault.register(
        pandas.core.resample.Resampler.interpolate
    )
    return interpolate_impl(
        self,
        resample_kwargs,
        method,
        axis=axis,
        limit=limit,
        inplace=inplace,
        limit_direction=limit_direction,
        limit_area=limit_area,
        downcast=downcast,
        **kwargs,
    )


@doc_utils.doc_resample_reduce(result="last element", refer_to="last")
def resample_last(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.last` and invoke the wrapper on this query compiler.
    last_impl = ResampleDefault.register(pandas.core.resample.Resampler.last)
    return last_impl(self, resample_kwargs, *args, **kwargs)


@doc_utils.doc_resample_reduce(result="maximum value", refer_to="max")
def resample_max(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.max` and invoke the wrapper on this query compiler.
    max_impl = ResampleDefault.register(pandas.core.resample.Resampler.max)
    return max_impl(self, resample_kwargs, *args, **kwargs)


@doc_utils.doc_resample_reduce(result="mean value", refer_to="mean")
def resample_mean(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.mean` and invoke the wrapper on this query compiler.
    mean_impl = ResampleDefault.register(pandas.core.resample.Resampler.mean)
    return mean_impl(self, resample_kwargs, *args, **kwargs)


@doc_utils.doc_resample_reduce(result="median value", refer_to="median")
def resample_median(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.median` and invoke the wrapper on this query compiler.
    median_impl = ResampleDefault.register(pandas.core.resample.Resampler.median)
    return median_impl(self, resample_kwargs, *args, **kwargs)


@doc_utils.doc_resample_reduce(result="minimum value", refer_to="min")
def resample_min(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.min` and invoke the wrapper on this query compiler.
    min_impl = ResampleDefault.register(pandas.core.resample.Resampler.min)
    return min_impl(self, resample_kwargs, *args, **kwargs)


@doc_utils.doc_resample_fillna(method="'nearest'", refer_to="nearest")
def resample_nearest(self, resample_kwargs, limit):
    # Wrap `Resampler.nearest`; `limit` is forwarded positionally.
    nearest_impl = ResampleDefault.register(pandas.core.resample.Resampler.nearest)
    return nearest_impl(self, resample_kwargs, limit)


@doc_utils.doc_resample_reduce(result="number of unique values", refer_to="nunique")
def resample_nunique(self, resample_kwargs, *args, **kwargs):
    # Wrap `Resampler.nunique` and invoke the wrapper on this query compiler.
    nunique_impl = ResampleDefault.register(pandas.core.resample.Resampler.nunique)
    return nunique_impl(self, resample_kwargs, *args, **kwargs)


# FIXME: Query Compiler shouldn't care about differences between Series and DataFrame
# so `resample_ohlc_df` and `resample_ohlc_ser` should be combined (Modin issue #3104).
@doc_utils.doc_resample_agg(
    action="compute open, high, low and close values",
    output="labels of columns containing computed values",
    refer_to="ohlc",
)
def resample_ohlc_df(self, resample_kwargs, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.ohlc)(
        self, resample_kwargs, *args, **kwargs
    )


@doc_utils.doc_resample_agg(
    action="compute open, high, low and close values",
    output="labels of columns containing computed values",
    refer_to="ohlc",
)
def resample_ohlc_ser(self, resample_kwargs, *args, **kwargs):
    # Same pandas method as `resample_ohlc_df`; the only difference is that
    # `squeeze_self=True` is passed when registering the default implementation.
    return ResampleDefault.register(
        pandas.core.resample.Resampler.ohlc, squeeze_self=True
    )(self, resample_kwargs, *args, **kwargs)


# FIXME: This method requires us to build a high-level resampler object,
# which we shouldn't do at the query compiler. We need to move this to the front.
# (Modin issue #3105)
@doc_utils.add_refer_to("Resampler.pipe")
def resample_pipe(self, resample_kwargs, func, *args, **kwargs):
    """
    Resample time-series data and apply aggregation on it.

    Group data into intervals by time-series row/column with
    a specified frequency, build equivalent ``pandas.Resampler`` object
    and apply passed function to it.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature.
    func : callable(pandas.Resampler) -> object or tuple(callable, str)
    *args : iterable
        Positional arguments to pass to function.
    **kwargs : dict
        Keyword arguments to pass to function.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler containing the result of passed function.
    """
    return ResampleDefault.register(pandas.core.resample.Resampler.pipe)(
        self, resample_kwargs, func, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="product",
    params="min_count : int",
    refer_to="prod",
)
def resample_prod(self, resample_kwargs, min_count, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.prod)(
        self, resample_kwargs, min_count, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="quantile", params="q : float", refer_to="quantile"
)
def resample_quantile(self, resample_kwargs, q, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.quantile)(
        self, resample_kwargs, q, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="standard error of the mean",
    refer_to="sem",
)
def resample_sem(self, resample_kwargs, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.sem)(
        self, resample_kwargs, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="number of elements in a group", refer_to="size"
)
def resample_size(self, resample_kwargs, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.size)(
        self, resample_kwargs, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="standard deviation", params="ddof : int", refer_to="std"
)
def resample_std(self, resample_kwargs, ddof, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.std)(
        self, resample_kwargs, ddof, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="sum",
    params="min_count : int",
    refer_to="sum",
)
def resample_sum(self, resample_kwargs, min_count, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.sum)(
        self, resample_kwargs, min_count, *args, **kwargs
    )


def resample_transform(self, resample_kwargs, arg, *args, **kwargs):
    """
    Resample time-series data and apply aggregation on it.

    Group data into intervals by time-series row/column with
    a specified frequency and call passed function on each group.
    In contrast to ``resample_app_df`` apply function to the whole group,
    instead of a single axis.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature.
    arg : callable(pandas.DataFrame) -> pandas.Series
    *args : iterable
        Positional arguments to pass to function.
    **kwargs : dict
        Keyword arguments to pass to function.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler containing the result of passed function.
    """
    return ResampleDefault.register(pandas.core.resample.Resampler.transform)(
        self, resample_kwargs, arg, *args, **kwargs
    )


@doc_utils.doc_resample_reduce(
    result="variance", params="ddof : int", refer_to="var"
)
def resample_var(self, resample_kwargs, ddof, *args, **kwargs):
    return ResampleDefault.register(pandas.core.resample.Resampler.var)(
        self, resample_kwargs, ddof, *args, **kwargs
    )


# End of Resample methods

# Str methods


@doc_utils.doc_str_method(refer_to="capitalize", params="")
def str_capitalize(self):
    return StrDefault.register(pandas.Series.str.capitalize)(self)


@doc_utils.doc_str_method(
    refer_to="center",
    params="""
    width : int
    fillchar : str, default: ' '""",
)
def str_center(self, width, fillchar=" "):
    return StrDefault.register(pandas.Series.str.center)(self, width, fillchar)


@doc_utils.doc_str_method(
    refer_to="contains",
    params="""
    pat : str
    case : bool, default: True
    flags : int, default: 0
    na : object, default: None
    regex : bool, default: True""",
)
def str_contains(self, pat, case=True, flags=0, na=None, regex=True):
    return StrDefault.register(pandas.Series.str.contains)(
        self, pat, case, flags, na, regex
    )


@doc_utils.doc_str_method(
    refer_to="count",
    params="""
    pat : str
    flags : int, default: 0""",
)
def str_count(self, pat, flags=0):
    return StrDefault.register(pandas.Series.str.count)(self, pat, flags)


@doc_utils.doc_str_method(
    refer_to="endswith",
    params="""
    pat : str
    na : object, default: None""",
)
def str_endswith(self, pat, na=None):
    return StrDefault.register(pandas.Series.str.endswith)(self, pat, na)


@doc_utils.doc_str_method(
    refer_to="find",
    params="""
    sub : str
    start : int, default: 0
    end : int, optional""",
)
def str_find(self, sub, start=0, end=None):
    return StrDefault.register(pandas.Series.str.find)(self, sub, start, end)


@doc_utils.doc_str_method(
    refer_to="findall",
    params="""
    pat : str
    flags : int, default: 0""",
)
def str_findall(self, pat, flags=0):
    return StrDefault.register(pandas.Series.str.findall)(self, pat, flags)


@doc_utils.doc_str_method(
    refer_to="fullmatch",
    params="""
    pat : str
    case : bool, default: True
    flags : int, default: 0
    na : object, default: None""",
)
def str_fullmatch(self, pat, case=True, flags=0, na=None):
    return StrDefault.register(pandas.Series.str.fullmatch)(
        self, pat, case, flags, na
    )


@doc_utils.doc_str_method(refer_to="get", params="i : int")
def str_get(self, i):
    return StrDefault.register(pandas.Series.str.get)(self, i)


@doc_utils.doc_str_method(refer_to="get_dummies", params="sep : str")
def str_get_dummies(self, sep):
    return StrDefault.register(pandas.Series.str.get_dummies)(self, sep)


@doc_utils.doc_str_method(
    refer_to="index",
    params="""
    sub : str
    start : int, default: 0
    end : int, optional""",
)
def str_index(self, sub, start=0, end=None):
    return StrDefault.register(pandas.Series.str.index)(self, sub, start, end)


@doc_utils.doc_str_method(refer_to="isalnum", params="")
def str_isalnum(self):
    return StrDefault.register(pandas.Series.str.isalnum)(self)


@doc_utils.doc_str_method(refer_to="isalpha", params="")
def str_isalpha(self):
    return StrDefault.register(pandas.Series.str.isalpha)(self)


@doc_utils.doc_str_method(refer_to="isdecimal", params="")
def str_isdecimal(self):
    return StrDefault.register(pandas.Series.str.isdecimal)(self)


@doc_utils.doc_str_method(refer_to="isdigit", params="")
def str_isdigit(self):
    return StrDefault.register(pandas.Series.str.isdigit)(self)


@doc_utils.doc_str_method(refer_to="islower", params="")
def str_islower(self):
    return StrDefault.register(pandas.Series.str.islower)(self)


@doc_utils.doc_str_method(refer_to="isnumeric", params="")
def str_isnumeric(self):
    return StrDefault.register(pandas.Series.str.isnumeric)(self)


@doc_utils.doc_str_method(refer_to="isspace", params="")
def str_isspace(self):
    return StrDefault.register(pandas.Series.str.isspace)(self)


@doc_utils.doc_str_method(refer_to="istitle", params="")
def str_istitle(self):
    return StrDefault.register(pandas.Series.str.istitle)(self)


@doc_utils.doc_str_method(refer_to="isupper", params="")
def str_isupper(self):
    return StrDefault.register(pandas.Series.str.isupper)(self)


@doc_utils.doc_str_method(refer_to="join", params="sep : str")
def str_join(self, sep):
    return StrDefault.register(pandas.Series.str.join)(self, sep)


@doc_utils.doc_str_method(refer_to="len", params="")
def str_len(self):
    return StrDefault.register(pandas.Series.str.len)(self)


@doc_utils.doc_str_method(
    refer_to="ljust",
    params="""
    width : int
    fillchar : str, default: ' '""",
)
def str_ljust(self, width, fillchar=" "):
    return StrDefault.register(pandas.Series.str.ljust)(self, width, fillchar)


@doc_utils.doc_str_method(refer_to="lower", params="")
def str_lower(self):
    return StrDefault.register(pandas.Series.str.lower)(self)


@doc_utils.doc_str_method(refer_to="lstrip", params="to_strip : str, optional")
def str_lstrip(self, to_strip=None):
    return StrDefault.register(pandas.Series.str.lstrip)(self, to_strip)


@doc_utils.doc_str_method(
    refer_to="match",
    params="""
    pat : str
    case : bool, default: True
    flags : int, default: 0
    na : object, default: None""",
)
def str_match(self, pat, case=True, flags=0, na=None):
    return StrDefault.register(pandas.Series.str.match)(self, pat, case, flags, na)


@doc_utils.doc_str_method(
    refer_to="extract",
    params="""
    pat : str
    flags : int, default: 0
    expand : bool, default: True""",
)
def str_extract(self, pat, flags=0, expand=True):
    return StrDefault.register(pandas.Series.str.extract)(self, pat, flags, expand)


@doc_utils.doc_str_method(
    refer_to="extractall",
    params="""
    pat : str
    flags : int, default: 0""",
)
def str_extractall(self, pat, flags=0):
    return StrDefault.register(pandas.Series.str.extractall)(self, pat, flags)


@doc_utils.doc_str_method(
    refer_to="normalize", params="form : {'NFC', 'NFKC', 'NFD', 'NFKD'}"
)
def str_normalize(self, form):
    return StrDefault.register(pandas.Series.str.normalize)(self, form)


@doc_utils.doc_str_method(
    refer_to="pad",
    params="""
    width : int
    side : {'left', 'right', 'both'}, default: 'left'
    fillchar : str, default: ' '""",
)
def str_pad(self, width, side="left", fillchar=" "):
    return StrDefault.register(pandas.Series.str.pad)(self, width, side, fillchar)


@doc_utils.doc_str_method(
    refer_to="partition",
    params="""
    sep : str, default: ' '
    expand : bool, default: True""",
)
def str_partition(self, sep=" ", expand=True):
    return StrDefault.register(pandas.Series.str.partition)(self, sep, expand)


@doc_utils.doc_str_method(refer_to="removeprefix", params="prefix : str")
def str_removeprefix(self, prefix):
    return StrDefault.register(pandas.Series.str.removeprefix)(self, prefix)


@doc_utils.doc_str_method(refer_to="removesuffix", params="suffix : str")
def str_removesuffix(self, suffix):
    return StrDefault.register(pandas.Series.str.removesuffix)(self, suffix)


@doc_utils.doc_str_method(refer_to="repeat", params="repeats : int")
def str_repeat(self, repeats):
    return StrDefault.register(pandas.Series.str.repeat)(self, repeats)


@doc_utils.doc_str_method(
    refer_to="replace",
    params="""
    pat : str
    repl : str or callable
    n : int, default: -1
    case : bool, optional
    flags : int, default: 0
    regex : bool, default: None""",
)
def str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
    return StrDefault.register(pandas.Series.str.replace)(
        self, pat, repl, n, case, flags, regex
    )


@doc_utils.doc_str_method(
    refer_to="rfind",
    params="""
    sub : str
    start : int, default: 0
    end : int, optional""",
)
def str_rfind(self, sub, start=0, end=None):
    return StrDefault.register(pandas.Series.str.rfind)(self, sub, start, end)


@doc_utils.doc_str_method(
    refer_to="rindex",
    params="""
    sub : str
    start : int, default: 0
    end : int, optional""",
)
def str_rindex(self, sub, start=0, end=None):
    return StrDefault.register(pandas.Series.str.rindex)(self, sub, start, end)


@doc_utils.doc_str_method(
    refer_to="rjust",
    params="""
    width : int
    fillchar : str, default: ' '""",
)
def str_rjust(self, width, fillchar=" "):
    return StrDefault.register(pandas.Series.str.rjust)(self, width, fillchar)


@doc_utils.doc_str_method(
    refer_to="rpartition",
    params="""
    sep : str, default: ' '
    expand : bool, default: True""",
)
def str_rpartition(self, sep=" ", expand=True):
    return StrDefault.register(pandas.Series.str.rpartition)(self, sep, expand)


@doc_utils.doc_str_method(
    refer_to="rsplit",
    params="""
    pat : str, optional
    n : int, default: -1
    expand : bool, default: False""",
)
def str_rsplit(self, pat=None, *, n=-1, expand=False):
    return StrDefault.register(pandas.Series.str.rsplit)(
        self, pat, n=n, expand=expand
    )


@doc_utils.doc_str_method(refer_to="rstrip", params="to_strip : str, optional")
def str_rstrip(self, to_strip=None):
    return StrDefault.register(pandas.Series.str.rstrip)(self, to_strip)


@doc_utils.doc_str_method(
    refer_to="slice",
    params="""
    start : int, optional
    stop : int, optional
    step : int, optional""",
)
def str_slice(self, start=None, stop=None, step=None):
    return StrDefault.register(pandas.Series.str.slice)(self, start, stop, step)


@doc_utils.doc_str_method(
    refer_to="slice_replace",
    params="""
    start : int, optional
    stop : int, optional
    repl : str or callable, optional""",
)
def str_slice_replace(self, start=None, stop=None, repl=None):
    return StrDefault.register(pandas.Series.str.slice_replace)(
        self, start, stop, repl
    )


@doc_utils.doc_str_method(
    refer_to="split",
    params="""
    pat : str, optional
    n : int, default: -1
    expand : bool, default: False
    regex : bool, default: None""",
)
def str_split(self, pat=None, *, n=-1, expand=False, regex=None):
    return StrDefault.register(pandas.Series.str.split)(
        self, pat, n=n, expand=expand, regex=regex
    )


@doc_utils.doc_str_method(
    refer_to="startswith",
    params="""
    pat : str
    na : object, default: None""",
)
def str_startswith(self, pat, na=None):
    return StrDefault.register(pandas.Series.str.startswith)(self, pat, na)


@doc_utils.doc_str_method(refer_to="strip", params="to_strip : str, optional")
def str_strip(self, to_strip=None):
    return StrDefault.register(pandas.Series.str.strip)(self, to_strip)


@doc_utils.doc_str_method(refer_to="swapcase", params="")
def str_swapcase(self):
    return StrDefault.register(pandas.Series.str.swapcase)(self)


@doc_utils.doc_str_method(refer_to="title", params="")
def str_title(self):
    return StrDefault.register(pandas.Series.str.title)(self)


@doc_utils.doc_str_method(refer_to="translate", params="table : dict")
def str_translate(self, table):
    return StrDefault.register(pandas.Series.str.translate)(self, table)


@doc_utils.doc_str_method(refer_to="upper", params="")
def str_upper(self):
    return StrDefault.register(pandas.Series.str.upper)(self)


@doc_utils.doc_str_method(
    refer_to="wrap",
    params="""
    width : int
    **kwargs : dict""",
)
def str_wrap(self, width, **kwargs):
    return StrDefault.register(pandas.Series.str.wrap)(self, width, **kwargs)


@doc_utils.doc_str_method(refer_to="zfill", params="width : int")
def str_zfill(self, width):
    return StrDefault.register(pandas.Series.str.zfill)(self, width)


@doc_utils.doc_str_method(refer_to="__getitem__", params="key : object")
def str___getitem__(self, key):
    return StrDefault.register(pandas.Series.str.__getitem__)(self, key)


@doc_utils.doc_str_method(
    refer_to="encode",
    params="""
    encoding : str
    errors : str, default: 'strict'""",
)
def str_encode(self, encoding, errors="strict"):
    # The default for `errors` matches the value the parameter list above
    # has always advertised; existing two-argument callers are unaffected.
    return StrDefault.register(pandas.Series.str.encode)(self, encoding, errors)


@doc_utils.doc_str_method(
    refer_to="decode",
    params="""
    encoding : str
    errors : str, default: 'strict'
    dtype : str or dtype, optional""",
)
def str_decode(self, encoding, errors="strict", dtype=None):
    # Defaults added to agree with the documented parameter list
    # ('strict' / optional); arguments are still forwarded positionally.
    return StrDefault.register(pandas.Series.str.decode)(
        self, encoding, errors, dtype
    )


@doc_utils.doc_str_method(
    refer_to="cat",
    params="""
    others : Series, Index, DataFrame, np.ndarray or list-like
    sep : str, default: ''
    na_rep : str or None, default: None
    join : {'left', 'right', 'outer', 'inner'}, default: 'left'""",
)
def str_cat(self, others, sep=None, na_rep=None, join="left"):
    return StrDefault.register(pandas.Series.str.cat)(
        self, others, sep, na_rep, join
    )


@doc_utils.doc_str_method(
    refer_to="casefold",
    params="",
)
def str_casefold(self):
    return StrDefault.register(pandas.Series.str.casefold)(self)


# End of Str methods

# Rolling methods

# FIXME: most of the rolling/window methods take *args and **kwargs parameters
# which are only needed for the compatibility with numpy, this behavior is inherited
# from the API level, we should get rid of it (Modin issue #3108).


@doc_utils.doc_window_method(
    window_cls_name="Rolling",
    result="the result of passed functions",
    action="apply specified functions",
    refer_to="aggregate",
    params="""
    func : str, dict, callable(pandas.Series) -> scalar, or list of such
    *args : iterable
    **kwargs : dict""",
    build_rules="udf_aggregation",
)
def rolling_aggregate(self, fold_axis, rolling_kwargs, func, *args, **kwargs):
    # `fold_axis` is accepted but not forwarded to the default implementation.
    return RollingDefault.register(pandas.core.window.rolling.Rolling.aggregate)(
        self, rolling_kwargs, func, *args, **kwargs
    )


# FIXME: at the query compiler method `rolling_apply` is an alias for `rolling_aggregate`,
# one of these should be removed (Modin issue #3107).
@doc_utils.add_deprecation_warning(replacement_method="rolling_aggregate") @doc_utils.doc_window_method( window_cls_name="Rolling", result="the result of passed function", action="apply specified function", refer_to="apply", params=""" func : callable(pandas.Series) -> scalar raw : bool, default: False engine : None, default: None This parameters serves the compatibility purpose. Always has to be None. engine_kwargs : None, default: None This parameters serves the compatibility purpose. Always has to be None. args : tuple, optional kwargs : dict, optional""", build_rules="udf_aggregation", ) def rolling_apply( self, fold_axis, rolling_kwargs, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None, ): return RollingDefault.register(pandas.core.window.rolling.Rolling.apply)( self, rolling_kwargs, func, raw, engine, engine_kwargs, args, kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="correlation", refer_to="corr", params=""" other : modin.pandas.Series, modin.pandas.DataFrame, list-like, optional pairwise : bool, optional *args : iterable **kwargs : dict""", ) def rolling_corr( self, fold_axis, rolling_kwargs, other=None, pairwise=None, *args, **kwargs ): return RollingDefault.register(pandas.core.window.rolling.Rolling.corr)( self, rolling_kwargs, other, pairwise, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="number of non-NA values", refer_to="count" ) def rolling_count(self, fold_axis, rolling_kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.count)( self, rolling_kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="covariance", refer_to="cov", params=""" other : modin.pandas.Series, modin.pandas.DataFrame, list-like, optional pairwise : bool, optional ddof : int, default: 1 **kwargs : dict""", ) def rolling_cov( self, fold_axis, rolling_kwargs, other=None, pairwise=None, ddof=1, **kwargs ): return 
RollingDefault.register(pandas.core.window.rolling.Rolling.cov)( self, rolling_kwargs, other, pairwise, ddof, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="unbiased kurtosis", refer_to="kurt", params="**kwargs : dict", ) def rolling_kurt(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.kurt)( self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="maximum value", refer_to="max", params=""" *args : iterable **kwargs : dict""", ) def rolling_max(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.max)( self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="mean value", refer_to="mean", params=""" *args : iterable **kwargs : dict""", ) def rolling_mean(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.mean)( self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="median value", refer_to="median", params="**kwargs : dict", ) def rolling_median(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.median)( self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="minimum value", refer_to="min", params=""" *args : iterable **kwargs : dict""", ) def rolling_min(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.min)( self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="quantile", refer_to="quantile", params=""" quantile : float interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, default: 'linear' **kwargs : dict""", ) def rolling_quantile( self, fold_axis, rolling_kwargs, quantile, 
interpolation="linear", **kwargs ): return RollingDefault.register(pandas.core.window.rolling.Rolling.quantile)( self, rolling_kwargs, quantile, interpolation, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="unbiased skewness", refer_to="skew", params="**kwargs : dict", ) def rolling_skew(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.skew)( self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="standard deviation", refer_to="std", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def rolling_std(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.std)( self, rolling_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="sum", refer_to="sum", params=""" *args : iterable **kwargs : dict""", ) def rolling_sum(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.sum)( self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="sem", refer_to="sem", params=""" *args : iterable **kwargs : dict""", ) def rolling_sem(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.sem)( self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="variance", refer_to="var", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def rolling_var(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.var)( self, rolling_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="rank", refer_to="rank", params=""" method : {'average', 'min', 'max'}, default: 'average' ascending : bool, 
default: True pct : bool, default: False numeric_only : bool, default: False *args : iterable **kwargs : dict""", ) def rolling_rank( self, fold_axis, rolling_kwargs, method="average", ascending=True, pct=False, numeric_only=False, *args, **kwargs, ): return RollingDefault.register(pandas.core.window.rolling.Rolling.rank)( self, rolling_kwargs, method=method, ascending=ascending, pct=pct, numeric_only=numeric_only, *args, **kwargs, ) # End of Rolling methods # Begin Expanding methods @doc_utils.doc_window_method( window_cls_name="Expanding", result="the result of passed functions", action="apply specified functions", refer_to="aggregate", win_type="expanding window", params=""" func : str, dict, callable(pandas.Series) -> scalar, or list of such *args : iterable **kwargs : dict""", build_rules="udf_aggregation", ) def expanding_aggregate(self, fold_axis, expanding_args, func, *args, **kwargs): return ExpandingDefault.register( pandas.core.window.expanding.Expanding.aggregate )(self, expanding_args, func, *args, **kwargs) @doc_utils.doc_window_method( window_cls_name="Expanding", result="sum", refer_to="sum", win_type="expanding window", params=""" *args : iterable **kwargs : dict""", ) def expanding_sum(self, fold_axis, expanding_args, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.sum)( self, expanding_args, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="minimum value", refer_to="min", win_type="expanding window", params=""" *args : iterable **kwargs : dict""", ) def expanding_min(self, fold_axis, expanding_args, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.min)( self, expanding_args, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="maximum value", refer_to="max", win_type="expanding window", params=""" *args : iterable **kwargs : dict""", ) def expanding_max(self, fold_axis, expanding_args, *args, 
**kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.max)( self, expanding_args, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="mean value", refer_to="mean", win_type="expanding window", params=""" *args : iterable **kwargs : dict""", ) def expanding_mean(self, fold_axis, expanding_args, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.mean)( self, expanding_args, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="median", refer_to="median", win_type="expanding window", params=""" numeric_only : bool, default: False engine : Optional[str], default: None engine_kwargs : Optional[dict], default: None **kwargs : dict""", ) def expanding_median( self, fold_axis, expanding_args, numeric_only=False, engine=None, engine_kwargs=None, **kwargs, ): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.median)( self, expanding_args, numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, **kwargs, ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="variance", refer_to="var", win_type="expanding window", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def expanding_var(self, fold_axis, expanding_args, ddof=1, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.var)( self, expanding_args, ddof=ddof, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="standard deviation", refer_to="std", win_type="expanding window", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def expanding_std(self, fold_axis, expanding_args, ddof=1, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.std)( self, expanding_args, ddof=ddof, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="correlation", refer_to="corr", 
win_type="expanding window", params=""" squeeze_self : bool squeeze_other : bool other : pandas.Series or pandas.DataFrame, default: None pairwise : bool | None, default: None ddof : int, default: 1 numeric_only : bool, default: False **kwargs : dict""", ) def expanding_corr( self, fold_axis, expanding_args, squeeze_self, squeeze_other, other=None, pairwise=None, ddof=1, numeric_only=False, **kwargs, ): other_for_default = ( other if other is None else ( other.to_pandas().squeeze(axis=1) if squeeze_other else other.to_pandas() ) ) return ExpandingDefault.register( pandas.core.window.expanding.Expanding.corr, squeeze_self=squeeze_self, )( self, expanding_args, other=other_for_default, pairwise=pairwise, ddof=ddof, numeric_only=numeric_only, **kwargs, ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="sample covariance", refer_to="cov", win_type="expanding window", params=""" squeeze_self : bool squeeze_other : bool other : pandas.Series or pandas.DataFrame, default: None pairwise : bool | None, default: None ddof : int, default: 1 numeric_only : bool, default: False **kwargs : dict""", ) def expanding_cov( self, fold_axis, expanding_args, squeeze_self, squeeze_other, other=None, pairwise=None, ddof=1, numeric_only=False, **kwargs, ): other_for_default = ( other if other is None else ( other.to_pandas().squeeze(axis=1) if squeeze_other else other.to_pandas() ) ) return ExpandingDefault.register( pandas.core.window.expanding.Expanding.cov, squeeze_self=squeeze_self, )( self, expanding_args, other=other_for_default, pairwise=pairwise, ddof=ddof, numeric_only=numeric_only, **kwargs, ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="standard deviation", refer_to="std", win_type="expanding window", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def expanding_count(self, fold_axis, expanding_args, ddof=1, *args, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.count)( self, 
expanding_args, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="quantile", refer_to="quantile", win_type="expanding window", params=""" quantile : float interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, default: 'linear' **kwargs : dict""", ) def expanding_quantile( self, fold_axis, expanding_args, quantile, interpolation, **kwargs ): return ExpandingDefault.register( pandas.core.window.expanding.Expanding.quantile )(self, expanding_args, quantile, interpolation, **kwargs) @doc_utils.doc_window_method( window_cls_name="Expanding", result="unbiased standard error mean", refer_to="std", win_type="expanding window", params=""" ddof : int, default: 1 numeric_only : bool, default: False *args : iterable **kwargs : dict""", ) def expanding_sem( self, fold_axis, expanding_args, ddof=1, numeric_only=False, *args, **kwargs ): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.sem)( self, expanding_args, ddof=ddof, numeric_only=numeric_only, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="unbiased skewness", refer_to="skew", win_type="expanding window", params=""" numeric_only : bool, default: False **kwargs : dict""", ) def expanding_skew(self, fold_axis, expanding_args, numeric_only=False, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.skew)( self, expanding_args, numeric_only=numeric_only, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="Fisher’s definition of kurtosis without bias", refer_to="kurt", win_type="expanding window", params=""" numeric_only : bool, default: False **kwargs : dict""", ) def expanding_kurt(self, fold_axis, expanding_args, numeric_only=False, **kwargs): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.kurt)( self, expanding_args, numeric_only=numeric_only, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Expanding", result="rank", 
refer_to="rank", win_type="expanding window", params=""" method : {'average', 'min', 'max'}, default: 'average' ascending : bool, default: True pct : bool, default: False numeric_only : bool, default: False *args : iterable **kwargs : dict""", ) def expanding_rank( self, fold_axis, expanding_args, method="average", ascending=True, pct=False, numeric_only=False, *args, **kwargs, ): return ExpandingDefault.register(pandas.core.window.expanding.Expanding.rank)( self, expanding_args, method=method, ascending=ascending, pct=pct, numeric_only=numeric_only, *args, **kwargs, ) # End of Expanding methods # Window methods @doc_utils.doc_window_method( window_cls_name="Rolling", win_type="window of the specified type", result="mean", refer_to="mean", params=""" *args : iterable **kwargs : dict""", ) def window_mean(self, fold_axis, window_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.mean)( self, window_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", win_type="window of the specified type", result="standard deviation", refer_to="std", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def window_std(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.std)( self, window_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", win_type="window of the specified type", result="sum", refer_to="sum", params=""" *args : iterable **kwargs : dict""", ) def window_sum(self, fold_axis, window_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.sum)( self, window_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", win_type="window of the specified type", result="variance", refer_to="var", params=""" ddof : int, default: 1 *args : iterable **kwargs : dict""", ) def window_var(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs): return 
@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.cat.codes")
def cat_codes(self):
    """
    Replace the categorical values with their integer category codes.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler holding the integer codes of the underlying
        categories.
    """
    # Default-to-pandas implementation built around ``Series.cat.codes``.
    codes_via_pandas = CatDefault.register(pandas.Series.cat.codes)
    return codes_via_pandas(self)
@doc_utils.add_refer_to("DataFrame.compare")
def compare(self, other, align_axis, keep_shape, keep_equal, result_names):
    """
    Compare this QueryCompiler with another one and show where they differ.

    Parameters
    ----------
    other : BaseQueryCompiler
        Query compiler to compare with. Have to be the same shape and the same
        labeling as `self`.
    align_axis : {0, 1}
    keep_shape : bool
    keep_equal : bool
    result_names : tuple

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler containing the differences between `self` and passed
        query compiler.
    """
    # Every option is forwarded by keyword to ``pandas.DataFrame.compare``.
    compare_options = {
        "other": other,
        "align_axis": align_axis,
        "keep_shape": keep_shape,
        "keep_equal": keep_equal,
        "result_names": result_names,
    }
    pandas_compare = DataFrameDefault.register(pandas.DataFrame.compare)
    return pandas_compare(self, **compare_options)
def repartition(self, axis=None):
    """
    Repartitioning QueryCompiler objects to get ideal partitions inside.

    Allows to improve performance where the query compiler can't improve
    yet by doing implicit repartitioning.

    Parameters
    ----------
    axis : {0, 1, None}, optional
        The axis along which the repartitioning occurs.
        `None` is used for repartitioning along both axes.

    Returns
    -------
    BaseQueryCompiler
        The repartitioned BaseQueryCompiler.
    """
    axes = [0, 1] if axis is None else [axis]

    new_query_compiler = self
    for _ax in axes:
        # Always take the metadata caches from the frame that is being
        # repartitioned in *this* pass. When both axes are processed, the
        # frame produced by the first pass has a different partitioning than
        # `self`, so partition lengths copied from `self` would be stale.
        frame = new_query_compiler._modin_frame
        new_query_compiler = new_query_compiler.__constructor__(
            frame.apply_full_axis(
                _ax,
                lambda df: df,
                # Partition lengths are only reusable for the axis that is
                # NOT being re-split in this pass.
                new_index=frame.copy_index_cache(copy_lengths=_ax == 1),
                new_columns=frame.copy_columns_cache(copy_lengths=_ax == 0),
                dtypes=frame.copy_dtypes_cache(),
                keep_partitioning=False,
                sync_labels=False,
            )
        )
    return new_query_compiler
def all_switchable_backends() -> list[str]:
    """
    List the active backends that automatic switching is allowed to target.

    Returns
    -------
    list
        A list of valid backends.
    """
    # Disable automatically switching to these engines for now, because
    # 1) _get_prepared_factory_for_backend() currently calls
    #    _initialize_engine(), which starts up the ray/dask/unidist
    #    processes
    # 2) we can't decide to switch to unidist in the middle of execution.
    never_switch_to = ("Ray", "Unidist", "Dask")
    return [
        backend
        for backend in Backend.get_active_backends()
        if backend not in never_switch_to
    ]
class BackendCostCalculator:
    """
    Calculate which Backend should be used for an operation.

    Given a set of QueryCompilers containing various data, determine
    which query compiler's backend would minimize the cost of casting
    or coercion. Use the aggregate sum of coercion to determine overall
    cost.

    Parameters
    ----------
    operation_arguments : MappingProxyType[str, Any]
        Mapping from operation argument names to their values.
    api_cls_name : str or None
        Representing the class name of the function being called.
    operation : str
        The operation being performed.
    query_compilers : list of BaseQueryCompiler
        The query compiler arguments of the operation.
    preop_switch : bool
        True if the operation is a pre-operation switch point.
    """

    def __init__(
        self,
        *,
        operation_arguments: MappingProxyType[str, Any],
        api_cls_name: Optional[str],
        operation: str,
        query_compilers: list[BaseQueryCompiler],
        preop_switch: bool,
    ):
        # Imported lazily, inside the constructor, as in the original code.
        from modin.core.execution.dispatching.factories.dispatcher import (
            FactoryDispatcher,
        )

        self._result_backend = None
        self._api_cls_name = api_cls_name
        self._op = operation
        self._operation_arguments = operation_arguments
        self._backend_data = {}
        # Shallow copy so later mutation of the caller's list does not affect us.
        self._qc_list: list[BaseQueryCompiler] = query_compilers[:]

        def register_backend(backend: str) -> None:
            # Create the aggregation entry for `backend` unless it already exists.
            if backend not in self._backend_data:
                self._backend_data[backend] = AggregatedBackendData(
                    backend,
                    FactoryDispatcher._get_prepared_factory_for_backend(
                        backend=backend
                    ).io_cls.query_compiler_cls,
                )

        for query_compiler in query_compilers:
            # If a QC's backend was not configured as active, we need to create an entry for it here.
            register_backend(query_compiler.get_backend())

        if preop_switch and BackendJoinConsiderAllBackends.get():
            # Initialize backend data for any backends not found among query compiler arguments.
            # Because we default to the first query compiler's backend if no cost information is available,
            # this initialization must occur after iterating over query compiler arguments to ensure
            # correct ordering in dictionary arguments.
            for backend in all_switchable_backends():
                register_backend(backend)

    def calculate(self) -> str:
        """
        Calculate which query compiler we should cast to.

        Switching calculation is performed as follows:
        - For every registered query compiler in qc_list, with backend `backend_from`, compute
          `self_cost = qc_from.stay_cost(...)` and add it to the total cost for `backend_from`.
        - For every valid target `backend_to`, compute `qc_from.move_to_cost(qc_cls_to, ...)`. If it
          returns None, instead compute `qc_cls_to.move_to_me_cost(qc_from, ...)`. Add the result
          to the cost for `backend_to`.
        At a high level, the cost for choosing a particular backend is the sum of
          (all stay costs for data already on that backend)
          + (cost of moving all other query compilers to this backend)

        If the operation is a registered pre-operation switch point (and
        ``BackendJoinConsiderAllBackends`` is enabled), then the list of target backends
        is every backend returned by ``all_switchable_backends()``. Otherwise, only
        backends found among the arguments are considered.
        Post-operation switch points are not yet supported.

        If the arguments contain no query compilers for a particular backend, then there are no stay
        costs. In this scenario, we expect the move_to cost for this backend to outweigh the corresponding
        stay costs for each query compiler's original backend.

        If no argument QCs have cost information for each other (that is, move_to_cost and move_to_me_cost
        returns None), then we attempt to move all data to the backend of the first QC.

        We considered a few alternative algorithms for switching calculation:

        1. Instead of considering all active backends, consider only backends found among input QCs.
        This was used in the calculator's original implementation, as we figured transfer cost to
        unrelated backends would outweigh any possible gains in computation speed. However, certain
        pathological cases that significantly changed the size of input or output data (e.g. cross join)
        would create situations where transferring data after the computation became prohibitively
        expensive, so we chose to allow switching to unrelated backends.
        Additionally, the original implementation had a bug where stay_cost was only computed for the
        _first_ query compiler of each backend, thus under-reporting the cost of computation for any
        backend with multiple QCs present. In practice this very rarely affected the chosen result.
        2. Compute stay/move costs only once for each backend pair, but force QCs to consider other
        arguments when calculating.
        This approach is the most robust and accurate for cases like cross join, where a product of
        transfer costs between backends is more reflective of cost than size. This approach requires
        more work in the query compiler, as each QC must be aware of when multiple QC arguments are
        passed and adjust the cost computation accordingly. It is also unclear how often this would
        make a meaningful difference compared to the summation approach.

        Returns
        -------
        str
            A string representing a backend.

        Raises
        ------
        ValueError
            Raises ValueError when no query compilers were registered, or when
            there is no backend to choose from. Note that exceeding ``max_cost``
            on every backend does *not* raise: in that case the cheapest backend
            is picked while ignoring ``max_cost``.
        """
        # The decision is computed once and then served from the cache.
        if self._result_backend is not None:
            return self._result_backend
        if not self._qc_list:
            raise ValueError("No query compilers registered")
        if len(self._qc_list) == 1:
            # Nothing to arbitrate between: stay where the only argument lives.
            return self._qc_list[0].get_backend()

        # See docstring for explanation of switching decision algorithm.
        for qc_from in self._qc_list:
            # Add self cost for the current query compiler
            self_cost = qc_from.stay_cost(
                self._api_cls_name, self._op, self._operation_arguments
            )
            backend_from = qc_from.get_backend()
            if self_cost is not None:
                self._add_cost_data(backend_from, self_cost)

            for backend_to, agg_data_to in self._backend_data.items():
                if backend_to == backend_from:
                    continue
                qc_cls_to = agg_data_to.qc_cls
                cost = qc_from.move_to_cost(
                    qc_cls_to,
                    self._api_cls_name,
                    self._op,
                    self._operation_arguments,
                )
                if cost is None:
                    # We have some information asymmetry in query compilers,
                    # qc_from does not know about qc_to types so we instead
                    # ask the same question but of qc_to.
                    cost = qc_cls_to.move_to_me_cost(
                        qc_from,
                        self._api_cls_name,
                        self._op,
                        self._operation_arguments,
                    )
                if cost is not None:
                    self._add_cost_data(backend_to, cost)

        def get_min_cost_backend(skip_exceeds_max_cost=True) -> Optional[str]:
            # First backend (in insertion order) with the strictly smallest
            # aggregated cost; None if every candidate was skipped.
            result = None
            min_value = None
            for k, v in self._backend_data.items():
                if skip_exceeds_max_cost and v.cost > v.max_cost:
                    continue
                if min_value is None or min_value > v.cost:
                    min_value = v.cost
                    result = k
            return result

        # Get the best backend, skipping backends where we may exceed
        # the total cost
        self._result_backend = get_min_cost_backend(skip_exceeds_max_cost=True)

        # If we still do not have a backend, pick the best backend while
        # ignoring max_cost
        if self._result_backend is None:
            self._result_backend = get_min_cost_backend(skip_exceeds_max_cost=False)

        # This should not happen
        if self._result_backend is None:
            raise ValueError("No backends are available to calculate costs.")

        if len(self._backend_data) > 1:
            get_logger().info(
                f"BackendCostCalculator results for {'pd' if self._api_cls_name is None else self._api_cls_name}.{self._op}: {self._calc_result_log(self._result_backend)}"
            )
            # Does not need to be secure, should not use system entropy
            metrics_group = "%04x" % random.randrange(16**4)
            for qc in self._qc_list:
                max_shape = qc._max_shape()
                backend = qc.get_backend()
                emit_metric(
                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.rows",
                    max_shape[0],
                )
                emit_metric(
                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.cols",
                    max_shape[1],
                )
            for k, v in self._backend_data.items():
                emit_metric(
                    f"hybrid.merge.candidate.{k}.group.{metrics_group}.cost", v.cost
                )
            emit_metric(
                f"hybrid.merge.decision.{self._result_backend}.group.{metrics_group}",
                1,
            )

        return self._result_backend

    def _add_cost_data(self, backend, cost):
        """
        Add the cost data to the calculator.

        Parameters
        ----------
        backend : str
            String representing the backend for this engine.
        cost : int
            Cost to add to the aggregated cost of `backend`. It is checked with
            ``QCCoercionCost.validate_coersion_cost`` before being accumulated.
        """
        # We can assume that if we call this method, backend
        # exists in the backend_data map
        QCCoercionCost.validate_coersion_cost(cost)
        self._backend_data[backend].cost += cost

    def _calc_result_log(self, selected_backend: str) -> str:
        """
        Create a string summary of the backend costs.

        The format is
        [*|][backend name]:[cost]/[max_cost],...
        where '*' indicates this was the selected backend
        and [cost]/[max_cost] represents the aggregated
        cost of moving to that backend over the maximum
        cost allowed on that backend.

        Parameters
        ----------
        selected_backend : str
            String representing the backend selected by the calculator.

        Returns
        -------
        str
            String representation of calculator state.
        """
        # Compare by value: two equal backend names are not guaranteed to be
        # the same string object, so an identity check could miss the marker.
        return ", ".join(
            f"{'*'+k if k == selected_backend else k}:{v.cost}/{v.max_cost}"
            for k, v in self._backend_data.items()
        )
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module represents the query compiler level for the pandas storage format.""" from .query_compiler import PandasQueryCompiler __all__ = ["PandasQueryCompiler"] ================================================ FILE: modin/core/storage_formats/pandas/aggregations.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
class CorrCovBuilder:
    """Responsible for building pandas query compiler's methods computing correlation and covariance matrices."""

    class Method(Enum):
        """Enum specifying what method to use (either CORR for correlation or COV for covariance)."""

        CORR = 1
        COV = 2

    @classmethod
    def build_corr_method(
        cls,
    ) -> Callable[[PandasQueryCompiler, str, int, bool], PandasQueryCompiler]:
        """
        Build a query compiler method computing the correlation matrix.

        Returns
        -------
        callable(qc: PandasQueryCompiler, method: str, min_periods: int, numeric_only: bool) -> PandasQueryCompiler
            A callable matching the ``BaseQueryCompiler.corr`` signature and computing the correlation matrix.
        """

        def corr_method(
            qc: PandasQueryCompiler,
            method: str,
            min_periods: int = 1,
            numeric_only: bool = True,
        ) -> PandasQueryCompiler:
            # Further implementation is designed for the default pandas backend (numpy)
            if method != "pearson" or qc.get_pandas_backend() == "pyarrow":
                # Imported lazily: the query compiler module imports this one.
                from .query_compiler import PandasQueryCompiler

                # Anchor the parent lookup on the class this method is built
                # for. `super(type(qc), qc)` would resolve back to this very
                # function for an instance of any subclass and recurse forever.
                return super(PandasQueryCompiler, qc).corr(
                    method=method, min_periods=min_periods, numeric_only=numeric_only
                )

            # Pre-compute the result's metadata when it is cheaply available,
            # otherwise leave it to be computed from the result itself.
            if not numeric_only and qc.frame_has_materialized_columns:
                # The correlation matrix is labeled by the columns on both axes.
                new_index, new_columns = (
                    qc._modin_frame.copy_columns_cache(),
                    qc._modin_frame.copy_columns_cache(),
                )
                new_dtypes = pandas.Series(
                    np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
                    index=new_columns,
                )
            elif numeric_only and qc.frame_has_materialized_dtypes:
                old_dtypes = qc.dtypes

                # Only numeric columns take part in the result.
                new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
                new_index = new_columns.copy()
                new_dtypes = pandas.Series(
                    np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
                    index=new_columns,
                )
            else:
                new_index, new_columns, new_dtypes = None, None, None

            map_kernel, reduce_kernel = cls._build_map_reduce_methods(
                min_periods, method=cls.Method.CORR, numeric_only=numeric_only
            )

            reduced = qc._modin_frame.apply_full_axis(axis=1, func=map_kernel)
            # The 'reduced' dataset has the shape either (num_cols, num_cols + 3) for a non-NaN case
            # or (num_cols, num_cols * 4) for a NaN case, so it's acceptable to call `.combine_and_apply()`
            # here as the number of cols is usually quite small
            result = reduced.combine_and_apply(
                func=reduce_kernel,
                new_index=new_index,
                new_columns=new_columns,
                new_dtypes=new_dtypes,
            )
            return qc.__constructor__(result)

        return corr_method

    @classmethod
    def build_cov_method(
        cls,
    ) -> Callable[[PandasQueryCompiler, int, int], PandasQueryCompiler]:
        """
        Build a query compiler method computing the covariance matrix.

        Returns
        -------
        callable(qc: PandasQueryCompiler, min_periods: int, ddof: int) -> PandasQueryCompiler
            A callable matching the ``BaseQueryCompiler.cov`` signature and computing the covariance matrix.

        Raises
        ------
        NotImplementedError
            Always, covariance is not implemented yet.
        """
        raise NotImplementedError("Computing covariance is not yet implemented.")

    @classmethod
    def _build_map_reduce_methods(
        cls, min_periods: int, method: Method, numeric_only: bool
    ) -> Tuple[
        Callable[[pandas.DataFrame], pandas.DataFrame],
        Callable[[pandas.DataFrame], pandas.DataFrame],
    ]:
        """
        Build MapReduce kernels for the specified corr/cov method.

        Parameters
        ----------
        min_periods : int
            The parameter to pass to the reduce method.
        method : CorrCovBuilder.Method
            Whether the kernels compute correlation or covariance.
        numeric_only : bool
            Whether to only include numeric types.

        Returns
        -------
        Tuple[Callable(pandas.DataFrame) -> pandas.DataFrame, Callable(pandas.DataFrame) -> pandas.DataFrame]
            A tuple holding the Map (at the first position) and the Reduce (at the second position)
            kernels computing correlation/covariance matrix.

        Raises
        ------
        NotImplementedError
            If `method` is ``CorrCovBuilder.Method.COV``.
        """
        if method == cls.Method.COV:
            raise NotImplementedError("Computing covariance is not yet implemented.")

        return lambda df: _CorrCovKernels.map(
            df, numeric_only
        ), lambda df: _CorrCovKernels.reduce(df, min_periods, method)
Returns ------- pandas.DataFrame A MultiIndex columned DataFrame holding the described aggregation results for this specifix partition under the following keys: ``["mul", "sum", "pow2_sum", "count"]`` """ if numeric_only: df = df.select_dtypes(include="number") # It's more convenient to use a NumPy array here as it appears to perform # much faster in for-loops which this kernel function has plenty of raw_df = df.values.T try: nan_mask = np.isnan(raw_df) except TypeError as e: # Pandas raises ValueError on unsupported types, so casting # the exception to a proper type raise ValueError("Unsupported types with 'numeric_only=False'") from e has_nans = nan_mask.sum() != 0 if has_nans: if not raw_df.flags.writeable: # making a copy if the buffer is read-only raw_df = raw_df.copy() # Replacing all NaNs with zeros so we can use much # faster `np.sum()` instead of slow `np.nansum()` np.putmask(raw_df, nan_mask, values=0) cols = df.columns # Here we compute a sum of pairwise multiplications between all columns # result: # col1: [sum(col1 * col2), sum(col1 * col3), ... sum(col1 * colN)] # col2: [sum(col2 * col3), sum(col2 * col4), ... sum(col2 * colN)] # ... sum_of_pairwise_mul = pandas.DataFrame( np.dot(raw_df, raw_df.T), index=cols, columns=cols, copy=False ) if has_nans: sums, sums_of_squares, count = cls._compute_nan_aggs(raw_df, cols, nan_mask) else: sums, sums_of_squares, count = cls._compute_non_nan_aggs(df) aggregations = pandas.concat( [sum_of_pairwise_mul, sums, sums_of_squares, count], copy=False, axis=1, keys=["mul", "sum", "pow2_sum", "count"], ) return aggregations @staticmethod def _compute_non_nan_aggs( df: pandas.DataFrame, ) -> Tuple[pandas.Series, pandas.Series, pandas.Series]: """ Compute sums, sums of square and the number of observations for a partition assuming there are no NaN values in it. Parameters ---------- df : pandas.DataFrame Partition to compute the aggregations for. 
@staticmethod
def _compute_nan_aggs(
    raw_df: np.ndarray, cols: pandas.Index, nan_mask: np.ndarray
) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
    """
    Compute pairwise sums, sums of squares and observation counts for a partition with NaNs.

    Parameters
    ----------
    raw_df : np.ndarray
        Raw values of the partition to compute the aggregations for.
    cols : pandas.Index
        Columns of the partition.
    nan_mask : np.ndarray[bool]
        Boolean mask showing positions of NaN values in the `raw_df`.

    Returns
    -------
    Tuple[sums: pandas.DataFrame, sums_of_squares: pandas.DataFrame, count: pandas.DataFrame]
        A tuple storing DataFrames where each of them holds the result for one of the
        described aggregations.
    """
    # With NaNs present a single sum/square sum/count per column is not enough:
    # every column has to be aggregated once per *other* column, skipping the
    # positions where that other column is NaN. For a frame like
    #   col1: 1, 2  , 3  , 4
    #   col2: 2, NaN, 3  , 4
    #   col3: 4, 5  , NaN, 7
    # 'col1' needs one sum that excludes the NaN positions of 'col2' (1 + 3 + 4)
    # and another one that excludes the NaN positions of 'col3' (1 + 2 + 4).
    # The result is a matrix of pairwise aggregations:
    #   sums[col1, col2] = sum(col1[i] for i in non_NA_indices_of_col2)
    #   sums[col2, col1] = sum(col2[i] for i in non_NA_indices_of_col1)
    # Note that sums[col1, col2] != sums[col2, col1]
    n_observations = nan_mask.shape[1]

    def labeled(values):
        # One value per column of the partition, labeled with `cols`.
        return pandas.Series(values, index=cols, copy=False)

    pairwise_sums, pairwise_squares, pairwise_counts = {}, {}, {}

    # TODO: is it possible to get rid of this for-loop somehow?
    for position, label in enumerate(cols):
        # Stretch the current column to the shape of the whole frame (one copy
        # per column of the frame) and zero out the positions where the
        # corresponding other column had a NaN:
        #   col1: 1, 2, 3, 4              col1: 1, 2, 3, 4
        #   col1: 1, 2, 3, 4  putmask() > col1: 1, 0, 3, 4
        #   col1: 1, 2, 3, 4              col1: 1, 2, 0, 4
        # The NaNs of the column itself were already replaced by zeroes by the
        # caller before this function is reached.
        stretched = np.resize(raw_df[position], raw_df.shape)
        np.putmask(stretched, nan_mask, values=0)

        # An observation is unusable for a pair if either side is NaN there.
        missing_in_pair = nan_mask | nan_mask[position]

        pairwise_sums[label] = labeled(np.sum(stretched, axis=1))
        pairwise_squares[label] = labeled(np.sum(stretched**2, axis=1))
        pairwise_counts[label] = labeled(
            n_observations - np.count_nonzero(missing_in_pair, axis=1)
        )

    sums_frame = pandas.concat(pairwise_sums, axis=1, copy=False)
    squares_frame = pandas.concat(pairwise_squares, axis=1, copy=False)
    counts_frame = pandas.concat(pairwise_counts, axis=1, copy=False)
    return sums_frame, squares_frame, counts_frame
min_periods : int Minimum number of observations required per pair of columns to have a valid result. method : CorrCovBuilder.Method Whether to build a correlation or a covariance matrix. Returns ------- pandas.DataFrame Either correlation or covariance matrix. """ if method == CorrCovBuilder.Method.COV: raise NotImplementedError("Computing covariance is not yet implemented.") # The `df` here accumulates the aggregation results retrieved from each row partition # and combined together along the rows axis, so the `df` looks something like this: # mul sums pow2_sums # a . . . # b . . . <--- part1 result # c . . . # --------------------------- # a . . . # b . . . <--- part2 result # c . . . # --------------------------- # ... # So to get the total result we have to group on the index and sum the values total_agg = df.groupby(level=0).sum() total_agg = cls._maybe_combine_nan_and_non_nan_aggs(total_agg) sum_of_pairwise_mul = total_agg["mul"] sums = total_agg["sum"] sums_of_squares = total_agg["pow2_sum"] count = total_agg["count"] cols = sum_of_pairwise_mul.columns # If there are NaNs in the original dataframe, then we have computed a matrix # of sums/square sums/counts at the Map phase, meaning that we now have multiple # columns in `sums`. 
@staticmethod
def _maybe_combine_nan_and_non_nan_aggs(
    total_agg: pandas.DataFrame,
) -> pandas.DataFrame:
    """
    Pair the aggregation results of partitions having and not having NaN values if needed.

    If `total_agg` holds both the per-pair aggregations and the single-column
    aggregations labeled with ``MODIN_UNNAMED_SERIES_LABEL``, the single-column
    values are added onto the per-pair ones. Otherwise `total_agg` is returned
    untouched.

    Parameters
    ----------
    total_agg : pandas.DataFrame
        A dataframe holding aggregations computed for each partition
        concatenated along the rows axis.

    Returns
    -------
    pandas.DataFrame
        DataFrame with aligned results. This is either `total_agg` itself or,
        when its buffer is read-only, a modified copy of it.
    """
    # Here we try to align the results between partitions that had and didn't have NaNs.
    # At the result of the Map phase, partitions with and without NaNs would produce
    # different results:
    #   - Partitions with NaNs produce a matrix of pairwise sums/square sums/counts
    #   - And parts without NaNs produce regular one-column sums/square sums/counts
    #
    # As the result, `total_agg` will be something like this:
    #     mul  |  sum   pow2_sum  count  |    sum       pow2_sum      count
    #    a  b  | a  b    a  b     a  b   | __reduced__ __reduced__ __reduced__
    # a  .  .  | .  .    .  .     .  .   |     .           .           .
    # b  .  .  | .  .    .  .     .  .   |     .           .           .
    # ---------|-------------------------|----------------------------------------
    #           ^-- these are results     ^-- and these are results for
    #           for partitions that       partitions that didn't have NaNs
    #           had NaNs
    # So, to get an actual total result of these aggregations, we have to additionally
    # sum the results from non-NaN and NaN partitions.
    #
    # Here we sample the 'sum' columns to check whether we had mixed NaNs and
    # non-NaNs partitions, if it's not the case we can skip the described step:
    nsums = total_agg.columns.get_locs(["sum"])
    if not (
        len(nsums) > 1 and ("sum", MODIN_UNNAMED_SERIES_LABEL) in total_agg.columns
    ):
        return total_agg

    cols = total_agg.columns

    # Finding column positions for aggregational columns
    # NOTE(review): the `|` presumably relies on `get_loc` returning boolean
    # masks for these keys here - confirm against the pandas MultiIndex docs.
    all_agg_idxs = np.where(
        cols.get_loc("sum") | cols.get_loc("pow2_sum") | cols.get_loc("count")
    )[0]
    # Finding column positions for aggregational columns that store
    # results of non-NaN partitions
    non_na_agg_idxs = cols.get_indexer_for(
        pandas.Index(
            [
                ("sum", MODIN_UNNAMED_SERIES_LABEL),
                ("pow2_sum", MODIN_UNNAMED_SERIES_LABEL),
                ("count", MODIN_UNNAMED_SERIES_LABEL),
            ]
        )
    )
    # Finding column positions for aggregational columns that store
    # results of NaN partitions by deducting non-NaN indices from all indices
    na_agg_idxs = np.setdiff1d(all_agg_idxs, non_na_agg_idxs, assume_unique=True)

    # Using `.values` here so we can ignore the indices (it's really hard
    # to arrange them for pandas to properly perform the summation)
    parts_with_nans = total_agg.values[:, na_agg_idxs]
    parts_without_nans = (
        total_agg.values[:, non_na_agg_idxs]
        # Before doing the summation we have to align the shapes
        # Imagine that we have 'parts_with_nans' like:
        #    sum   pow2_sum  count
        #   a  b    a  b     a  b
        # a 1  2    3  4     5  6
        # b 1  2    3  4     5  6
        #
        # And the 'parts_without_nans' like:
        #    sum  pow2_sum  count
        # a   1      3        5
        # b   2      4        6
        #
        # Here we want to sum them in an order so the digit matches (1 + 1), (2 + 2), ...
        # For that we first have to repeat the values in 'parts_without_nans':
        # parts_without_nans.repeat(parts_with_nans.shape[0]):
        #    sum  pow2_sum  count
        # a   1      3        5
        # b   1      3        5
        # a   2      4        6
        # b   2      4        6
        #
        # And then reshape it using the "Fortran" order:
        # parts_without_nans.reshape(parts_with_nans.shape, order="F"):
        #    sum   pow2_sum  count
        #   a  b    a  b     a  b
        # a 1  2    3  4     5  6
        # b 1  2    3  4     5  6
        # After that the shapes & orders are aligned and we can perform the summation
        .repeat(repeats=len(parts_with_nans), axis=0).reshape(
            parts_with_nans.shape, order="F"
        )
    )
    replace_values = parts_with_nans + parts_without_nans
    if not total_agg.values.flags.writeable:
        # making a copy if the buffer is read-only as
        # we will need to modify `total_agg` inplace
        total_agg = total_agg.copy()
    # NOTE(review): this assumes that writing through `.values` reaches the
    # frame's own data - confirm for the pandas versions/modes in use.
    total_agg.values[:, na_agg_idxs] = replace_values

    return total_agg
""" res = pandas.DataFrame(index=cols, columns=cols, dtype="float") nan_mask = count < min_periods for col in cols: top = ( sum_of_pairwise_mul.loc[col] - sums.loc[col] * means[col] - means.loc[col] * sums[col] + count.loc[col] * means.loc[col] * means[col] ) down = std.loc[col] * std[col] res.loc[col, :] = top / down res[nan_mask] = np.nan return res @staticmethod def _build_corr_table_non_nan( sum_of_pairwise_mul: pandas.DataFrame, means: pandas.Series, sums: pandas.Series, count: int, std: pandas.Series, cols: pandas.Index, ) -> pandas.DataFrame: """ Build correlation matrix for a DataFrame that didn't have NaN values in it. Parameters ---------- sum_of_pairwise_mul : pandas.DataFrame means : pandas.Series sums : pandas.Series count : int std : pandas.Series cols : pandas.Index Returns ------- pandas.DataFrame Correlation matrix. """ res = pandas.DataFrame(index=cols, columns=cols, dtype="float") for col in cols: top = ( sum_of_pairwise_mul.loc[col] - sums.loc[col] * means - means.loc[col] * sums + count * means.loc[col] * means ) down = std.loc[col] * std res.loc[col, :] = top / down return res ================================================ FILE: modin/core/storage_formats/pandas/groupby.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
# See the License for the specific language
# governing permissions and limitations under the License.

"""Contains implementations for GroupbyReduce functions."""

import numpy as np
import pandas
from pandas.core.dtypes.cast import find_common_type

from modin.config import RangePartitioning
from modin.core.dataframe.algebra import GroupByReduce
from modin.error_message import ErrorMessage
from modin.utils import hashable


class GroupbyReduceImpl:
    """Provide TreeReduce implementations for certain groupby aggregations."""

    @classmethod
    def get_impl(cls, agg_name):
        """
        Get TreeReduce implementations for the specified `agg_name`.

        Parameters
        ----------
        agg_name : hashable

        Returns
        -------
        (map_fn: Union[callable, str], reduce_fn: Union[callable, str], default2pandas_fn: callable)

        Raises
        ------
        KeyError
            If `agg_name` is not a key of ``_groupby_reduce_impls``.
        """
        try:
            return cls._groupby_reduce_impls[agg_name]
        except KeyError:
            raise KeyError(f"Have no implementation for {agg_name}.")

    @classmethod
    def has_impl_for(cls, agg_func):
        """
        Check whether the class has TreeReduce implementation for the specified `agg_func`.

        Parameters
        ----------
        agg_func : hashable or dict
            For a dict, every function found by ``walk_aggregation_dict`` has to be
            implemented for the result to be ``True``.

        Returns
        -------
        bool
        """
        if hashable(agg_func):
            return agg_func in cls._groupby_reduce_impls
        # Anything that is neither hashable nor a dict is reported as unsupported
        if not isinstance(agg_func, dict):
            return False

        # We have to keep this import away from the module level to avoid circular import
        from modin.pandas.utils import walk_aggregation_dict

        for _, func, _, _ in walk_aggregation_dict(agg_func):
            if func not in cls._groupby_reduce_impls:
                return False

        return True

    @classmethod
    def build_qc_method(cls, agg_name, finalizer_fn=None):
        """
        Build a TreeReduce implemented query compiler method for the specified groupby aggregation.

        Parameters
        ----------
        agg_name : hashable
        finalizer_fn : callable(pandas.DataFrame) -> pandas.DataFrame, default: None
            A callable to execute at the end a groupby kernel against groupby result.

        Returns
        -------
        callable
            Function that takes query compiler and executes GroupBy aggregation
            with TreeReduce algorithm.
        """
        map_fn, reduce_fn, d2p_fn = cls.get_impl(agg_name)
        map_reduce_method = GroupByReduce.register(
            map_fn, reduce_fn, default_to_pandas_func=d2p_fn, finalizer_fn=finalizer_fn
        )

        def method(query_compiler, *args, **kwargs):
            # When range-partitioning is enabled it is tried first; any
            # ``NotImplementedError`` raised on that path is turned into a warning
            # and the call falls through to the TreeReduce implementation below.
            if RangePartitioning.get():
                try:
                    if finalizer_fn is not None:
                        raise NotImplementedError(
                            "Range-partitioning groupby is not implemented yet when a finalizing function is specified."
                        )
                    return query_compiler._groupby_shuffle(
                        *args, agg_func=agg_name, **kwargs
                    )
                except NotImplementedError as e:
                    ErrorMessage.warn(
                        f"Can't use range-partitioning groupby implementation because of: {e}"
                        + "\nFalling back to a TreeReduce implementation."
                    )
            return map_reduce_method(query_compiler, *args, **kwargs)

        return method

    @staticmethod
    def _build_skew_impl():
        """
        Build TreeReduce implementation for 'skew' groupby aggregation.

        Returns
        -------
        (map_fn: callable, reduce_fn: callable, default2pandas_fn: callable)
        """

        def skew_map(dfgb, *args, **kwargs):
            # Map phase: per group compute the count, the sum, the sum of squares and
            # the sum of cubes; the four results are concatenated along columns under
            # an extra column level named ``GroupByReduce.ID_LEVEL_NAME``.
            if dfgb._selection is not None:
                data_to_agg = dfgb._selected_obj
            else:
                cols_to_agg = dfgb.obj.columns.difference(dfgb.exclusions)
                data_to_agg = dfgb.obj[cols_to_agg]

            df_pow2 = data_to_agg**2
            df_pow3 = data_to_agg**3

            return pandas.concat(
                [
                    dfgb.count(*args, **kwargs),
                    dfgb.sum(*args, **kwargs),
                    df_pow2.groupby(dfgb.grouper).sum(*args, **kwargs),
                    df_pow3.groupby(dfgb.grouper).sum(*args, **kwargs),
                ],
                copy=False,
                axis=1,
                keys=["count", "sum", "pow2_sum", "pow3_sum"],
                names=[GroupByReduce.ID_LEVEL_NAME],
            )

        def skew_reduce(dfgb, *args, **kwargs):
            # Reduce phase: sum the partial aggregates of the map phase per group
            # and derive the skew from the totals.
            df = dfgb.sum(*args, **kwargs)
            if df.empty:
                # Nothing to compute, only drop the extra column level added at the map phase
                return df.droplevel(GroupByReduce.ID_LEVEL_NAME, axis=1)

            count = df["count"]
            s = df["sum"]
            s2 = df["pow2_sum"]
            s3 = df["pow3_sum"]

            # mean = sum(x) / count
            m = s / count

            # m2 = sum( (x - m)^ 2) = sum(x^2 - 2*x*m + m^2)
            m2 = s2 - 2 * m * s + count * (m**2)

            # m3 = sum( (x - m)^ 3) = sum(x^3 - 3*x^2*m + 3*x*m^2 - m^3)
            m3 = s3 - 3 * m * s2 + 3 * s * (m**2) - count * (m**3)

            # The equation for the 'skew' was taken directly from pandas:
            # https://github.com/pandas-dev/pandas/blob/8dab54d6573f7186ff0c3b6364d5e4dd635ff3e7/pandas/core/nanops.py#L1226
            with np.errstate(invalid="ignore", divide="ignore"):
                skew_res = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)

            # Setting dummy values for invalid results in accordance with pandas
            skew_res[m2 == 0] = 0
            skew_res[count < 3] = np.nan
            return skew_res

        GroupByReduce.register_implementation(skew_map, skew_reduce)
        return (
            skew_map,
            skew_reduce,
            lambda grp, *args, **kwargs: grp.skew(*args, **kwargs),
        )

    @staticmethod
    def _build_mean_impl():
        """
        Build TreeReduce implementation for 'mean' groupby aggregation.

        Returns
        -------
        (map_fn: callable, reduce_fn: callable, default2pandas_fn: callable)
        """

        def mean_map(dfgb, **kwargs):
            # Map phase: per-group sums and counts concatenated along columns under
            # an extra column level named ``GroupByReduce.ID_LEVEL_NAME``.
            # Note that `kwargs` are only forwarded to ``sum``, not to ``count``.
            return pandas.concat(
                [dfgb.sum(**kwargs), dfgb.count()],
                axis=1,
                copy=False,
                keys=["sum", "count"],
                names=[GroupByReduce.ID_LEVEL_NAME],
            )

        def mean_reduce(dfgb, **kwargs):
            """
            Compute mean value in each group using sums/counts values within reduce phase.

            Parameters
            ----------
            dfgb : pandas.DataFrameGroupBy
                GroupBy object for column-partition.
            **kwargs : dict
                Additional keyword parameters to be passed in ``pandas.DataFrameGroupBy.sum``.

            Returns
            -------
            pandas.DataFrame
                A pandas Dataframe with mean values in each column of each group.
            """
            sums_counts_df = dfgb.sum(**kwargs)
            if sums_counts_df.empty:
                # Nothing to divide, only drop the extra column level added at the map phase
                return sums_counts_df.droplevel(GroupByReduce.ID_LEVEL_NAME, axis=1)

            sum_df = sums_counts_df["sum"]
            count_df = sums_counts_df["count"]

            return sum_df / count_df

        GroupByReduce.register_implementation(mean_map, mean_reduce)

        return (
            mean_map,
            mean_reduce,
            lambda grp, *args, **kwargs: grp.mean(*args, **kwargs),
        )


# Maps an aggregation name to a ``(map_fn, reduce_fn, default2pandas_fn)`` triple.
# The table is assigned after the class body because the 'mean' and 'skew' entries
# are produced by calling static methods of the class itself.
GroupbyReduceImpl._groupby_reduce_impls = {
    "all": ("all", "all", lambda grp, *args, **kwargs: grp.all(*args, **kwargs)),
    "any": ("any", "any", lambda grp, *args, **kwargs: grp.any(*args, **kwargs)),
    "count": ("count", "sum", lambda grp, *args, **kwargs: grp.count(*args, **kwargs)),
    "max": ("max", "max", lambda grp, *args, **kwargs: grp.max(*args, **kwargs)),
    "mean": GroupbyReduceImpl._build_mean_impl(),
    "min": ("min", "min", lambda grp, *args, **kwargs: grp.min(*args, **kwargs)),
    "prod": ("prod", "prod", lambda grp, *args, **kwargs: grp.prod(*args, **kwargs)),
    "size": ("size", "sum", lambda grp, *args, **kwargs: grp.size(*args, **kwargs)),
    "skew": GroupbyReduceImpl._build_skew_impl(),
    "sum": ("sum", "sum", lambda grp, *args, **kwargs: grp.sum(*args, **kwargs)),
}


class PivotTableImpl:
    """Provide MapReduce, Range-Partitioning and Full-Column implementations for 'pivot_table()'."""

    @classmethod
    def map_reduce_impl(
        cls, qc, unique_keys, drop_column_level, pivot_kwargs
    ):  # noqa: PR01
        """Compute 'pivot_table()' using MapReduce implementation."""
        if pivot_kwargs["margins"]:
            raise NotImplementedError(
                "MapReduce 'pivot_table' implementation doesn't support 'margins=True' parameter"
            )

        index, columns, values = (
            pivot_kwargs["index"],
            pivot_kwargs["columns"],
            pivot_kwargs["values"],
        )
        aggfunc = pivot_kwargs["aggfunc"]

        if not GroupbyReduceImpl.has_impl_for(aggfunc):
            raise NotImplementedError(
                "MapReduce 'pivot_table' implementation only supports 'aggfuncs' that are implemented in 'GroupbyReduceImpl'"
            )

        if len(set(index).intersection(columns)) > 0:
            raise NotImplementedError(
                "MapReduce 'pivot_table' implementation doesn't support intersections of 'index' and 'columns'"
            )

        to_group, keys_columns = cls._separate_data_from_grouper(
            qc, values, unique_keys
        )
        # `columns` keys are only unstacked when `index` keys are present as well
        to_unstack = columns if index else None

        # The groupby result is converted to a pivot table by the finalizer
        result = GroupbyReduceImpl.build_qc_method(
            aggfunc,
            finalizer_fn=lambda df: cls._pivot_table_from_groupby(
                df,
                pivot_kwargs["dropna"],
                drop_column_level,
                to_unstack,
                pivot_kwargs["fill_value"],
            ),
        )(
            to_group,
            by=keys_columns,
            axis=0,
            groupby_kwargs={
                "observed": pivot_kwargs["observed"],
                "sort": pivot_kwargs["sort"],
            },
            agg_args=(),
            agg_kwargs={},
            drop=True,
        )

        if to_unstack is None:
            result = result.transpose()
        return result

    @classmethod
    def full_axis_impl(
        cls, qc, unique_keys, drop_column_level, pivot_kwargs
    ):  # noqa: PR01
        """Compute 'pivot_table()' using full-column-axis implementation."""
        index, columns, values = (
            pivot_kwargs["index"],
            pivot_kwargs["columns"],
            pivot_kwargs["values"],
        )

        to_group, keys_columns = cls._separate_data_from_grouper(
            qc, values, unique_keys
        )

        def applyier(df, other):  # pragma: no cover
            """
            Build pivot table for a single partition.

            Parameters
            ----------
            df : pandas.DataFrame
                Partition of the self frame.
            other : pandas.DataFrame
                Broadcasted partition that contains `value` columns of the self frame.

            Returns
            -------
            pandas.DataFrame
                Pivot table for this particular partition.
            """
            concated = pandas.concat([df, other], axis=1, copy=False)
            # to reduce peak memory consumption
            del df, other
            result = pandas.pivot_table(
                concated,
                **pivot_kwargs,
            )
            # to reduce peak memory consumption
            del concated
            # if only one value is specified, removing level that maps
            # columns from `values` to the actual values
            if drop_column_level is not None:
                result = result.droplevel(drop_column_level, axis=1)

            # in that case Pandas transposes the result of `pivot_table`,
            # transposing it back to be consistent with column axis values along
            # different partitions
            if len(index) == 0 and len(columns) > 0:
                common_type = find_common_type(result.dtypes.tolist())
                # TODO: remove find_common_type+astype after pandas fix the following issue
                # transpose loses dtypes: https://github.com/pandas-dev/pandas/issues/43337
                result = result.transpose().astype(common_type, copy=False)

            return result

        result = qc.__constructor__(
            to_group._modin_frame.broadcast_apply_full_axis(
                axis=0, func=applyier, other=keys_columns._modin_frame
            )
        )

        # transposing the result again, to be consistent with Pandas result
        if len(index) == 0 and len(columns) > 0:
            result = result.transpose()

        return result

    @classmethod
    def range_partition_impl(
        cls, qc, unique_keys, drop_column_level, pivot_kwargs
    ):  # noqa: PR01
        """Compute 'pivot_table()' using Range-Partitioning implementation."""
        if pivot_kwargs["margins"]:
            raise NotImplementedError(
                "Range-partitioning 'pivot_table' implementation doesn't support 'margins=True' parameter"
            )

        index, columns, values = (
            pivot_kwargs["index"],
            pivot_kwargs["columns"],
            pivot_kwargs["values"],
        )

        if len(set(index).intersection(columns)) > 0:
            raise NotImplementedError(
                "Range-partitioning 'pivot_table' implementation doesn't support intersections of 'index' and 'columns'"
            )

        # Narrow the frame to the key and value columns only
        if values is not None:
            to_take = list(np.unique(list(index) + list(columns) + list(values)))
            qc = qc.getitem_column_array(to_take, ignore_order=True)

        # `columns` keys are only unstacked when `index` keys are present as well
        to_unstack = columns if index else None

        groupby_result = qc._groupby_shuffle(
            by=list(unique_keys),
            agg_func=pivot_kwargs["aggfunc"],
            axis=0,
            groupby_kwargs={
                "observed": pivot_kwargs["observed"],
                "sort": pivot_kwargs["sort"],
            },
            agg_args=(),
            agg_kwargs={},
            drop=True,
        )

        # the length of 'groupby_result' is typically really small here,
        # so it's okay to call full-column function
        result = groupby_result._modin_frame.apply_full_axis(
            axis=0,
            func=lambda df: cls._pivot_table_from_groupby(
                df,
                pivot_kwargs["dropna"],
                drop_column_level,
                to_unstack,
                pivot_kwargs["fill_value"],
                # FIXME: Range-partitioning impl has a problem with the resulting order in case of multiple grouping keys,
                # so passing 'sort=True' explicitly in this case
                # https://github.com/modin-project/modin/issues/6875
                sort=pivot_kwargs["sort"] if len(unique_keys) > 1 else False,
            ),
        )

        if to_unstack is None:
            result = result.transpose()

        return qc.__constructor__(result)

    @staticmethod
    def _pivot_table_from_groupby(
        df, dropna, drop_column_level, to_unstack, fill_value, sort=False
    ):
        """
        Convert group by aggregation result to a pivot table.

        Parameters
        ----------
        df : pandas.DataFrame
            Group by aggregation result.
        dropna : bool
            Whether to drop the columns consisting of NaN values only.
        drop_column_level : int or None
            An extra columns level to drop.
        to_unstack : list of labels or None
            Group by keys to pass to ``.unstack()``. Represents the `columns`
            parameter of ``.pivot_table()``.
        fill_value : scalar or None
            Value to replace NaN values with. ``None`` means not to fill anything.
        sort : bool, default: False
            Whether to sort the result along index.

        Returns
        -------
        pandas.DataFrame
        """
        # Unstacking only makes sense if there is more than one index level to take from
        if df.index.nlevels > 1 and to_unstack is not None:
            df = df.unstack(level=to_unstack)
        if drop_column_level is not None:
            df = df.droplevel(drop_column_level, axis=1)
        if dropna:
            df = df.dropna(axis=1, how="all")
        if fill_value is not None:
            df = df.fillna(fill_value, downcast="infer")
        if sort:
            df = df.sort_index(axis=0)
        return df

    @staticmethod
    def _separate_data_from_grouper(qc, values, unique_keys):
        """
        Split `qc` for key columns to group by and values to aggregate.

        Parameters
        ----------
        qc : PandasQueryCompiler
        values : list of labels or None
            List of columns to aggregate. ``None`` means all columns except 'unique_keys'.
        unique_keys : list of labels
            List of key columns to group by.

        Returns
        -------
        to_aggregate : PandasQueryCompiler
        keys_to_group : PandasQueryCompiler
        """
        if values is None:
            to_aggregate = qc.drop(columns=unique_keys)
        else:
            to_aggregate = qc.getitem_column_array(np.unique(values), ignore_order=True)

        keys_to_group = qc.getitem_column_array(unique_keys, ignore_order=True)

        return to_aggregate, keys_to_group


================================================
FILE: modin/core/storage_formats/pandas/merge.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Contains implementations for Merge/Join.""" from __future__ import annotations from typing import TYPE_CHECKING, Optional import pandas from pandas.core.dtypes.common import is_list_like from pandas.errors import MergeError from modin.config import MinRowPartitionSize, NPartitions from modin.core.dataframe.base.dataframe.utils import join_columns from modin.core.dataframe.pandas.metadata import ModinDtypes from .utils import merge_partitioning if TYPE_CHECKING: from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler # TODO: add methods for 'join' here class MergeImpl: """Provide implementations for merge/join.""" @classmethod def range_partitioning_merge(cls, left, right, kwargs): """ Execute merge using range-partitioning implementation. Parameters ---------- left : PandasQueryCompiler right : PandasQueryCompiler kwargs : dict Keyword arguments for ``pandas.merge()`` function. Returns ------- PandasQueryCompiler """ if ( kwargs.get("left_index", False) or kwargs.get("right_index", False) or kwargs.get("left_on", None) is not None or kwargs.get("left_on", None) is not None or kwargs.get("how", "left") not in ("left", "inner") ): raise NotImplementedError( f"The passed parameters are not yet supported by range-partitioning merge: {kwargs=}" ) on = kwargs.get("on", None) if on is not None and not isinstance(on, list): on = [on] if on is None or len(on) > 1: raise NotImplementedError( f"Merging on multiple columns is not yet supported by range-partitioning merge: {on=}" ) if any(col not in left.columns or col not in right.columns for col in on): raise NotImplementedError( "Merging on an index level is not yet supported by range-partitioning merge." 
) def func(left, right): return left.merge(right, **kwargs) new_columns, new_dtypes = cls._compute_result_metadata( left, right, on, left_on=None, right_on=None, suffixes=kwargs.get("suffixes", ("_x", "_y")), ) return left.__constructor__( left._modin_frame._apply_func_to_range_partitioning_broadcast( right._modin_frame, func=func, key=on, new_columns=new_columns, new_dtypes=new_dtypes, ) # pandas resets the index of the result unless we were merging on an index level, # the current implementation only supports merging on column names, so dropping # the index unconditionally ).reset_index(drop=True) @classmethod def row_axis_merge( cls, left: PandasQueryCompiler, right: PandasQueryCompiler, kwargs: dict ) -> PandasQueryCompiler: """ Execute merge using row-axis implementation. Parameters ---------- left : PandasQueryCompiler right : PandasQueryCompiler kwargs : dict Keyword arguments for ``pandas.merge()`` function. Returns ------- PandasQueryCompiler """ how = kwargs.get("how", "inner") on = kwargs.get("on", None) left_on = kwargs.get("left_on", None) right_on = kwargs.get("right_on", None) left_index = kwargs.get("left_index", False) right_index = kwargs.get("right_index", False) sort = kwargs.get("sort", False) if ( ( how in ["left", "inner"] or (how == "right" and right._modin_frame._partitions.size != 0) ) and left_index is False and right_index is False ): kwargs["sort"] = False reverted = False if how == "right": left, right = right, left reverted = True def should_keep_index( left: PandasQueryCompiler, right: PandasQueryCompiler, ) -> bool: keep_index = False if left_on is not None and right_on is not None: keep_index = any( o in left.index.names and o in right_on and o in right.index.names for o in left_on ) elif on is not None: keep_index = any( o in left.index.names and o in right.index.names for o in on ) return keep_index def map_func( left, right, kwargs=kwargs ) -> pandas.DataFrame: # pragma: no cover if reverted: df = pandas.merge(right, left, 
**kwargs) else: df = pandas.merge(left, right, **kwargs) return df # Want to ensure that these are python lists if left_on is not None and right_on is not None: left_on = list(left_on) if is_list_like(left_on) else [left_on] right_on = list(right_on) if is_list_like(right_on) else [right_on] elif on is not None: on = list(on) if is_list_like(on) else [on] right_to_broadcast = right._modin_frame.combine() new_columns, new_dtypes = cls._compute_result_metadata( *((left, right) if not reverted else (right, left)), on, left_on, right_on, kwargs.get("suffixes", ("_x", "_y")), ) # We rebalance when the ratio of the number of existing partitions to # the ideal number of partitions is smaller than this threshold. The # threshold is a heuristic that may need to be tuned for performance. if ( left._modin_frame._partitions.shape[0] < 0.3 * NPartitions.get() # to avoid empty partitions after repartition; can materialize index and len(left._modin_frame) > NPartitions.get() * MinRowPartitionSize.get() ): left = left.repartition(axis=0) new_left = left.__constructor__( left._modin_frame.broadcast_apply_full_axis( axis=1, func=map_func, other=right_to_broadcast, # We're going to explicitly change the shape across the 1-axis, # so we want for partitioning to adapt as well keep_partitioning=False, num_splits=merge_partitioning( left._modin_frame, right._modin_frame, axis=1 ), new_columns=new_columns, sync_labels=False, dtypes=new_dtypes, ) ) # Here we want to understand whether we're joining on a column or on an index level. # It's cool if indexes are already materialized so we can easily check that, if not # it's fine too, we can also decide that by columns, which tend to be already # materialized quite often compared to the indexes. keep_index = False if left.frame_has_materialized_index: keep_index = should_keep_index(left, right) else: # Have to trigger columns materialization. Hope they're already available at this point. 
if left_on is not None and right_on is not None: keep_index = any( o not in right.columns and o in left_on and o not in left.columns for o in right_on ) elif on is not None: keep_index = any( o not in right.columns and o not in left.columns for o in on ) if sort: if left_on is not None and right_on is not None: new_left = ( new_left.sort_index(axis=0, level=left_on + right_on) if keep_index else new_left.sort_rows_by_column_values(left_on + right_on) ) elif on is not None: new_left = ( new_left.sort_index(axis=0, level=on) if keep_index else new_left.sort_rows_by_column_values(on) ) return new_left if keep_index else new_left.reset_index(drop=True) else: return left.default_to_pandas(pandas.DataFrame.merge, right, **kwargs) @classmethod def _compute_result_metadata( cls, left: PandasQueryCompiler, right: PandasQueryCompiler, on, left_on, right_on, suffixes, ) -> tuple[Optional[pandas.Index], Optional[ModinDtypes]]: """ Compute columns and dtypes metadata for the result of merge if possible. Parameters ---------- left : PandasQueryCompiler right : PandasQueryCompiler on : label, list of labels or None `on` argument that was passed to ``pandas.merge()``. left_on : label, list of labels or None `left_on` argument that was passed to ``pandas.merge()``. right_on : label, list of labels or None `right_on` argument that was passed to ``pandas.merge()``. suffixes : list of strings `suffixes` argument that was passed to ``pandas.merge()``. Returns ------- new_columns : pandas.Index or None Columns for the result of merge. ``None`` if not enought metadata to compute. new_dtypes : ModinDtypes or None Dtypes for the result of merge. ``None`` if not enought metadata to compute. 
""" new_columns = None new_dtypes = None if not left.frame_has_materialized_columns: return new_columns, new_dtypes if left_on is None and right_on is None: if on is None: on = [c for c in left.columns if c in right.columns] _left_on, _right_on = on, on else: if left_on is None or right_on is None: raise MergeError( "Must either pass only 'on' or 'left_on' and 'right_on', not combination of them." ) _left_on, _right_on = left_on, right_on try: new_columns, left_renamer, right_renamer = join_columns( left.columns, right.columns, _left_on, _right_on, suffixes, ) except NotImplementedError: # This happens when one of the keys to join is an index level. Pandas behaviour # is really complicated in this case, so we're not computing resulted columns for now. pass else: # renamers may contain columns from 'index', so trying to merge index and column dtypes here right_index_dtypes = ( right.index.dtypes if isinstance(right.index, pandas.MultiIndex) else pandas.Series([right.index.dtype], index=[right.index.name]) ) right_dtypes = pandas.concat([right.dtypes, right_index_dtypes])[ right_renamer.keys() ].rename(right_renamer) left_index_dtypes = left._modin_frame._index_cache.maybe_get_dtypes() left_dtypes = ( ModinDtypes.concat([left._modin_frame._dtypes, left_index_dtypes]) .lazy_get(left_renamer.keys()) .set_index(list(left_renamer.values())) ) new_dtypes = ModinDtypes.concat([left_dtypes, right_dtypes]) return new_columns, new_dtypes ================================================ FILE: modin/core/storage_formats/pandas/native_query_compiler.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains ``NativeQueryCompiler`` class. ``NativeQueryCompiler`` is responsible for compiling efficient DataFrame algebra queries for small data and empty ``PandasDataFrame``. """ from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np import pandas from pandas.core.dtypes.common import is_scalar from modin.config.envvars import ( NativePandasDeepCopy, NativePandasMaxRows, NativePandasTransferThreshold, ) from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.utils import _inherit_docstrings, try_cast_to_pandas if TYPE_CHECKING: from modin.pandas import DataFrame, Series from modin.pandas.base import BasePandasDataset _NO_REPARTITION_ON_NATIVE_EXECUTION_EXCEPTION_MESSAGE = ( "Modin dataframes and series using native execution do not have partitions." ) def _get_axis(axis): """ Build index labels getter of the specified axis. Parameters ---------- axis : {0, 1} Axis to get labels from. 0 is for index and 1 is for column. Returns ------- callable(NativeQueryCompiler) -> pandas.Index """ if axis == 0: return lambda self: self._modin_frame.index else: return lambda self: self._modin_frame.columns def _set_axis(axis): """ Build index labels setter of the specified axis. Parameters ---------- axis : {0, 1} Axis to set labels on. 0 is for index and 1 is for column. 
@_inherit_docstrings(BaseQueryCompiler)
class NativeQueryCompiler(BaseQueryCompiler):
    """
    Query compiler for executing operations with native pandas.

    The compiler keeps a single ``pandas.DataFrame`` in ``_modin_frame`` and
    answers every query directly against it.

    Parameters
    ----------
    pandas_frame : pandas.DataFrame
        The pandas frame to query with the compiled queries. Objects exposing
        ``_to_pandas``, scalars and other ``pandas.DataFrame``-constructible
        inputs are converted to a ``pandas.DataFrame`` by the constructor.
    """

    # NOTE: methods below that have no docstring of their own are left that way
    # on purpose: the class is decorated with `_inherit_docstrings`, so they are
    # documented through `BaseQueryCompiler`. Explanations are therefore given
    # as `#` comments.

    _OPERATION_INITIALIZATION_OVERHEAD = 0
    _OPERATION_PER_ROW_OVERHEAD = 0

    # The wrapped native pandas frame; assigned in `__init__`.
    _modin_frame: pandas.DataFrame
    _should_warn_on_default_to_pandas: bool = False

    def __init__(self, pandas_frame):
        # Objects that know how to turn themselves into pandas are converted first.
        if hasattr(pandas_frame, "_to_pandas"):
            pandas_frame = pandas_frame._to_pandas()
        if is_scalar(pandas_frame):
            # A scalar becomes a frame holding that single value.
            pandas_frame = pandas.DataFrame([pandas_frame])
        elif isinstance(pandas_frame, pandas.DataFrame):
            # For performance purposes, we create "shallow" copies when NativePandasDeepCopy
            # is disabled (the default value). This may cause unexpected behavior if the
            # parent native frame is mutated, but creates a very significant performance
            # improvement on large data.
            pandas_frame = pandas_frame.copy(deep=NativePandasDeepCopy.get())
        else:
            # Anything else is handed to the `pandas.DataFrame` constructor as-is.
            pandas_frame = pandas.DataFrame(pandas_frame)

        self._modin_frame = pandas_frame

    # Both properties always report "Native"; their docstrings are copied
    # explicitly from the base class because `property(...)` is built by hand.
    storage_format = property(
        lambda self: "Native", doc=BaseQueryCompiler.storage_format.__doc__
    )
    engine = property(lambda self: "Native", doc=BaseQueryCompiler.engine.__doc__)

    def execute(self):
        # Intentionally a no-op for this compiler.
        pass

    @property
    def frame_has_materialized_dtypes(self) -> bool:
        """
        Check if the underlying dataframe has materialized dtypes.

        Returns
        -------
        bool
            Always ``True`` for this compiler.
        """
        return True

    def set_frame_dtypes_cache(self, dtypes):
        """
        Set dtypes cache for the underlying dataframe frame.

        Parameters
        ----------
        dtypes : pandas.Series, ModinDtypes, callable or None

        Notes
        -----
        This function is for consistency with other QCs,
        dtypes should be assigned directly on the frame.
        The passed value is ignored.
        """
        pass

    def set_frame_index_cache(self, index):
        """
        Set index cache for underlying dataframe.

        Parameters
        ----------
        index : sequence, callable or None

        Notes
        -----
        This function is for consistency with other QCs,
        index should be assigned directly on the frame.
        The passed value is ignored.
        """
        pass

    @property
    def frame_has_index_cache(self):
        """
        Check if the index cache exists for underlying dataframe.

        Returns
        -------
        bool
            Always ``True`` for this compiler.
        """
        return True

    @property
    def frame_has_dtypes_cache(self) -> bool:
        """
        Check if the dtypes cache exists for the underlying dataframe.

        Returns
        -------
        bool
            Always ``True`` for this compiler.
        """
        return True

    def copy(self):
        # If NativePandasDeepCopy is enabled, no need to perform an explicit copy here since the
        # constructor will perform one anyway.
        # If it is disabled, then we need to perform a deep copy.
        if NativePandasDeepCopy.get():
            return self.__constructor__(self._modin_frame)
        else:
            return self.__constructor__(self._modin_frame.copy(deep=True))

    def to_pandas(self):
        # For performance purposes, we create "shallow" copies when NativePandasDeepCopy
        # is disabled (the default value). This may cause unexpected behavior if the
        # parent native frame is mutated, but creates a very significant performance
        # improvement on large data.
        return self._modin_frame.copy(deep=NativePandasDeepCopy.get())

    @classmethod
    def from_pandas(cls, df, data_cls):
        # `data_cls` is accepted for signature compatibility and is not used here.
        return cls(df)

    @classmethod
    def from_arrow(cls, at, data_cls):
        # The arrow object is converted with its own `to_pandas()`;
        # `data_cls` is not used here.
        return cls(at.to_pandas())

    def free(self):
        # No-op: returns None without touching the frame.
        return

    def finalize(self):
        # No-op: returns None without touching the frame.
        return

    def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]:
        # NOTE(review): returning `NotImplemented` presumably tells the caller to
        # use its generic conversion path - confirm against BaseQueryCompiler.
        return NotImplemented

    @classmethod
    def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCompiler, Any]:
        # NOTE(review): same convention as `move_to` - confirm against BaseQueryCompiler.
        return NotImplemented

    @classmethod
    def _engine_max_size(cls):
        # do not return the custom configuration for sub-classes
        # (exact-class comparison: subclasses use their own class attribute)
        if cls == NativeQueryCompiler:
            return NativePandasMaxRows.get()
        return cls._MAX_SIZE_THIS_ENGINE_CAN_HANDLE

    @classmethod
    def _transfer_threshold(cls):
        # do not return the custom configuration for sub-classes
        # (exact-class comparison: subclasses use their own class attribute)
        if cls == NativeQueryCompiler:
            return NativePandasTransferThreshold.get()
        return cls._TRANSFER_THRESHOLD

    def do_array_ufunc_implementation(
        self,
        frame: "BasePandasDataset",
        ufunc: np.ufunc,
        method: str,
        *inputs: Any,
        **kwargs: Any
    ) -> Union["DataFrame", "Series", Any]:
        assert (
            self is frame._query_compiler
        ), "array ufunc called with mismatched query compiler and input frame"
        pandas_frame = self._modin_frame
        if not frame._is_dataframe:
            # For a non-dataframe input, operate on the first column of the stored frame.
            pandas_frame = pandas_frame.iloc[:, 0]
        # Delegate to pandas: `frame` itself is swapped for the native object,
        # every other input and the keyword arguments are cast to pandas.
        pandas_result = pandas_frame.__array_ufunc__(
            ufunc,
            method,
            *(
                pandas_frame if each_input is frame else try_cast_to_pandas(each_input)
                for each_input in inputs
            ),
            **try_cast_to_pandas(kwargs),
        )
        # Imports are local; NOTE(review): presumably to avoid a circular import
        # with `modin.pandas` (it is only imported under TYPE_CHECKING at module level).
        if isinstance(pandas_result, pandas.DataFrame):
            from modin.pandas import DataFrame

            return DataFrame(pandas_result)
        elif isinstance(pandas_result, pandas.Series):
            from modin.pandas import Series

            return Series(pandas_result)
        # ufuncs are required to be one-to-one mappings, so this branch should never be hit
        return pandas_result  # pragma: no cover

    # Dataframe interchange protocol

    def to_interchange_dataframe(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ):
        # Forwarded unchanged to the pandas implementation of `__dataframe__`.
        return self._modin_frame.__dataframe__(
            nan_as_null=nan_as_null, allow_copy=allow_copy
        )

    @classmethod
    def from_interchange_dataframe(cls, df: ProtocolDataframe, data_cls):
        # `data_cls` is not used here; conversion is done by pandas.
        return cls(pandas.api.interchange.from_dataframe(df))

    # END Dataframe interchange protocol

    def support_materialization_in_worker_process(self) -> bool:
        """
        Whether it's possible to call function `to_pandas` during the pickling process, at the moment of recreating the object.

        Returns
        -------
        bool
            Always ``False`` for this compiler.
        """
        return False

    def get_pandas_backend(self) -> Optional[str]:
        """
        Get backend stored in `_modin_frame`.

        Returns
        -------
        str | None
            Backend name. This implementation always returns ``None``.
        """
        return None

    # NOTE that because this query compiler provides the index of its underlying
    # pandas dataframe, updating the index affects this frame, and vice versa.
    # Consequently, native execution does not suffer from the issue
    # https://github.com/modin-project/modin/issues/1618
    index: pandas.Index = property(_get_axis(0), _set_axis(0))
    columns = property(_get_axis(1), _set_axis(1))

    @_inherit_docstrings(BaseQueryCompiler.repartition)
    def repartition(self, axis=None):
        # Always raises: `axis` is never inspected.
        raise Exception(_NO_REPARTITION_ON_NATIVE_EXECUTION_EXCEPTION_MESSAGE)
See the License for the specific language # governing permissions and limitations under the License. """ Module houses Modin parser classes, that are used for data parsing on the workers. Notes ----- Data parsing mechanism differs depending on the data format type: * text format type (CSV, EXCEL, FWF, JSON): File parsing begins from retrieving `start` and `end` parameters from `parse` kwargs - these parameters define start and end bytes of data file, that should be read in the concrete partition. Using this data and file handle got from `fname`, binary data is read by python `read` function. Then resulting data is passed into `pandas.read_*` function as `io.BytesIO` object to get corresponding `pandas.DataFrame` (we need to do this because Modin partitions internally stores data as `pandas.DataFrame`). * columnar store type (FEATHER, HDF, PARQUET): In this case data chunk to be read is defined by columns names passed as `columns` parameter as part of `parse` kwargs, so no additional action is needed and `fname` and `kwargs` are just passed into `pandas.read_*` function (in some corner cases `pyarrow.read_*` function can be used). * SQL type: Chunking is incorporated in the `sql` parameter as part of query, so `parse` parameters are passed into `pandas.read_sql` function without modification. 
""" import contextlib import json import os import warnings from io import BytesIO, IOBase, TextIOWrapper from typing import Any, NamedTuple import fsspec import numpy as np import pandas from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.concat import union_categoricals from pandas.io.common import infer_compression from pandas.util._decorators import doc from modin.config import MinColumnPartitionSize, MinRowPartitionSize from modin.core.io.file_dispatcher import OpenFile from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.db_conn import ModinDatabaseConnection from modin.error_message import ErrorMessage from modin.logging import ClassLogger from modin.logging.config import LogLevel from modin.utils import ModinAssumptionError _doc_pandas_parser_class = """ Class for handling {data_type} on the workers using pandas storage format. Inherits common functions from `PandasParser` class. """ _doc_parse_func = """ Parse data on the workers. Parameters ---------- {parameters} **kwargs : dict Keywords arguments to be used by `parse` function or passed into `read_*` function. Returns ------- list List with split parse results and it's metadata (index, dtypes, etc.). """ _doc_parse_parameters_common = """fname : str or path object Name of the file or path to read.""" _doc_common_read_kwargs = """common_read_kwargs : dict Common keyword parameters for read functions. """ _doc_parse_parameters_common2 = "\n".join( (_doc_parse_parameters_common, _doc_common_read_kwargs) ) def _split_result_for_readers(axis, num_splits, df): # pragma: no cover """ Split the read DataFrame into smaller DataFrames and handle all edge cases. Parameters ---------- axis : int The axis to split across (0 - index, 1 - columns). num_splits : int The number of splits to create. df : pandas.DataFrame `pandas.DataFrame` to split. Returns ------- list A list of pandas DataFrames. 
def find_common_type_cat(types):
    """
    Resolve a single dtype that can represent every dtype in `types`.

    Parameters
    ----------
    types : array-like
        Array of dtypes.

    Returns
    -------
    pandas.core.dtypes.dtypes.ExtensionDtype or np.dtype or None
        When every entry is a ``pandas.CategoricalDtype`` the result is a
        categorical dtype covering all categories (sorted and ordered if every
        input is ordered). Otherwise the result of pandas' ``find_common_type``.
    """
    # Mixed or non-categorical input is fully delegated to pandas.
    if not all(isinstance(dtype, pandas.CategoricalDtype) for dtype in types):
        return find_common_type(list(types))

    every_ordered = all(dtype.ordered for dtype in types)
    if every_ordered:
        # All inputs are ordered: build one ordered dtype over the sorted
        # union of their categories.
        merged = np.sort(
            np.unique([label for dtype in types for label in dtype.categories])
        )
        return pandas.CategoricalDtype(merged, ordered=True)

    # Otherwise let pandas union empty categoricals carrying each dtype.
    placeholders = [pandas.Categorical([], dtype=dtype) for dtype in types]
    return union_categoricals(placeholders, sort_categories=every_ordered).dtype
@classmethod
def get_dtypes(cls, dtypes_ids, columns):
    """
    Compute, for every column, the dtype shared by all partitions.

    Parameters
    ----------
    dtypes_ids : list
        Array with references to the partitions dtypes objects.
    columns : array-like or Index (1d)
        Labels assigned as the index of the resulting dtypes.

    Returns
    -------
    frame_dtypes : pandas.Series, dtype or None
        Series indexed by `columns` whose values are the dtypes of the full
        frame, or ``None`` when there are no partitions or every partition
        reports no dtypes.
    """
    if len(dtypes_ids) == 0:
        return None

    # Every materialized element is a Series: column names form the index and
    # the values are that partition's column dtypes.
    per_partition = cls.materialize(dtypes_ids)
    if all(len(part) == 0 for part in per_partition):
        return None

    # One column per partition, one row per frame column.
    dtype_table = pandas.concat(per_partition, axis=1)
    frame_dtypes = dtype_table.iloc[:, 0]
    frame_dtypes.name = None

    partitions_agree = dtype_table.eq(frame_dtypes, axis=0).all(axis=None)
    if not partitions_agree:
        ErrorMessage.mismatch_with_pandas(
            operation="read_*",
            message=(
                "Data types of partitions are different! "
                "Please refer to the troubleshooting section of the Modin documentation "
                "to fix this issue"
            ),
        )

        def common_dtype_of(row):
            # `row` holds the dtypes one column has across all partitions.
            return find_common_type_cat(row.values)

        per_column = dtype_table.apply(common_dtype_of, axis=1)
        frame_dtypes = per_column.squeeze(axis=0)

    # Label the dtypes with the column names; a squeezed scalar is wrapped
    # into a Series first.
    if not isinstance(frame_dtypes, pandas.Series):
        return pandas.Series(frame_dtypes, index=columns)
    frame_dtypes.index = columns
    return frame_dtypes
@staticmethod
def get_types_mapper(dtype_backend):
    """
    Build the ``to_pandas`` keyword arguments for read_parquet/read_feather.

    Parameters
    ----------
    dtype_backend : {"numpy_nullable", "pyarrow", lib.no_default}

    Returns
    -------
    dict
        ``{"types_mapper": ...}`` for the two recognized backends, an empty
        dict for anything else.
    """
    if dtype_backend == "numpy_nullable":
        # Imported lazily, exactly where it is needed.
        from pandas.io._util import _arrow_dtype_mapping

        return {"types_mapper": _arrow_dtype_mapping().get}
    if dtype_backend == "pyarrow":
        return {"types_mapper": pandas.ArrowDtype}
    return {}
""" return pandas.read_csv(*args, **kwargs) @doc(_doc_pandas_parser_class, data_type="tables with fixed-width formatted lines") class PandasFWFParser(PandasParser): @staticmethod @doc(_doc_parse_func, parameters=_doc_parse_parameters_common2) def parse(fname, common_read_kwargs, **kwargs): return PandasParser.generic_parse( fname, callback=PandasFWFParser.read_callback, **common_read_kwargs, **kwargs, ) @staticmethod def read_callback(*args, **kwargs): """ Parse data on each partition. Parameters ---------- *args : list Positional arguments to be passed to the callback function. **kwargs : dict Keyword arguments to be passed to the callback function. Returns ------- pandas.DataFrame or pandas.io.parsers.TextFileReader Function call result. """ return pandas.read_fwf(*args, **kwargs) @doc(_doc_pandas_parser_class, data_type="excel files") class PandasExcelParser(PandasParser): @classmethod def get_sheet_data(cls, sheet, convert_float): """ Get raw data from the excel sheet. Parameters ---------- sheet : openpyxl.worksheet.worksheet.Worksheet Sheet to get data from. convert_float : bool Whether to convert floats to ints or not. Returns ------- list List with sheet data. """ return [ [cls._convert_cell(cell, convert_float) for cell in row] for row in sheet.rows ] @classmethod def _convert_cell(cls, cell, convert_float): """ Convert excel cell to value. Parameters ---------- cell : openpyxl.cell.cell.Cell Excel cell to convert. convert_float : bool Whether to convert floats to ints or not. Returns ------- list Value that was converted from the excel cell. 
""" if cell.is_date: return cell.value elif cell.data_type == "e": return np.nan elif cell.data_type == "b": return bool(cell.value) elif cell.value is None: return "" elif cell.data_type == "n": if convert_float: val = int(cell.value) if val == cell.value: return val else: return float(cell.value) return cell.value @staticmethod def need_rich_text_param(): """ Determine whether a required `rich_text` parameter should be specified for the ``WorksheetReader`` constructor. Returns ------- bool """ import openpyxl from packaging import version return version.parse(openpyxl.__version__) >= version.parse("3.1.0") @staticmethod @doc(_doc_parse_func, parameters=_doc_parse_parameters_common) def parse(fname, **kwargs): num_splits = kwargs.pop("num_splits", None) start = kwargs.pop("start", None) end = kwargs.pop("end", None) excel_header = kwargs.get("_header") sheet_name = kwargs.get("sheet_name", 0) footer = b"" # Default to pandas case, where we are not splitting or partitioning if start is None or end is None: return pandas.read_excel(fname, **kwargs) _skiprows = kwargs.pop("skiprows") import re from zipfile import ZipFile import openpyxl from openpyxl.reader.excel import ExcelReader from openpyxl.worksheet._reader import WorksheetReader from openpyxl.worksheet.worksheet import Worksheet from pandas.core.dtypes.common import is_list_like from pandas.io.excel._util import fill_mi_header, maybe_convert_usecols from pandas.io.parsers import TextParser wb = openpyxl.load_workbook(filename=fname, read_only=True) # Get shared strings ex = ExcelReader(fname, read_only=True) ex.read_manifest() ex.read_strings() # Convert string name 0 to string if sheet_name == 0: sheet_name = wb.sheetnames[sheet_name] # get the worksheet to use with the worksheet reader ws = Worksheet(wb) # Read the raw data with ZipFile(fname) as z: with z.open("xl/worksheets/{}.xml".format(sheet_name)) as file: file.seek(start) bytes_data = file.read(end - start) def update_row_nums(match): """ Update the 
row numbers to start at 1. Parameters ---------- match : re.Match object The match from the origin `re.sub` looking for row number tags. Returns ------- str The updated string with new row numbers. Notes ----- This is needed because the parser we are using does not scale well if the row numbers remain because empty rows are inserted for all "missing" rows. """ b = match.group(0) return re.sub( rb"\d+", lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).encode( "utf-8" ), b, ) bytes_data = re.sub(rb'r="[A-Z]*\d+"', update_row_nums, bytes_data) bytesio = BytesIO(excel_header + bytes_data + footer) # Use openpyxl to read/parse sheet data common_args = (ws, bytesio, ex.shared_strings, False) if PandasExcelParser.need_rich_text_param(): reader = WorksheetReader(*common_args, rich_text=False) else: reader = WorksheetReader(*common_args) # Attach cells to worksheet object reader.bind_cells() data = PandasExcelParser.get_sheet_data(ws, kwargs.pop("convert_float", True)) usecols = maybe_convert_usecols(kwargs.pop("usecols", None)) header = kwargs.pop("header", 0) index_col = kwargs.pop("index_col", None) # skiprows is handled externally skiprows = None # Handle header and create MultiIndex for columns if necessary if is_list_like(header) and len(header) == 1: header = header[0] if header is not None and is_list_like(header): control_row = [True] * len(data[0]) for row in header: data[row], control_row = fill_mi_header(data[row], control_row) # Handle MultiIndex for row Index if necessary if is_list_like(index_col): # Forward fill values for MultiIndex index. 
if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # Check if dataset is empty if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] parser = TextParser( data, header=header, index_col=index_col, has_index_names=is_list_like(header) and len(header) > 1, skiprows=skiprows, usecols=usecols, skip_blank_lines=False, **kwargs, ) pandas_df = parser.read() if ( len(pandas_df) > 1 and len(pandas_df.columns) != 0 and pandas_df.isnull().all().all() ): # Drop NaN rows at the end of the DataFrame pandas_df = pandas.DataFrame(columns=pandas_df.columns) # Since we know the number of rows that occur before this partition, we can # correctly assign the index in cases of RangeIndex. If it is not a RangeIndex, # the index is already correct because it came from the data. if isinstance(pandas_df.index, pandas.RangeIndex): pandas_df.index = pandas.RangeIndex( start=_skiprows, stop=len(pandas_df.index) + _skiprows ) # We return the length if it is a RangeIndex (common case) to reduce # serialization cost. 
@doc(_doc_pandas_parser_class, data_type="JSON files")
class PandasJSONParser(PandasParser):
    # The class and `parse` docstrings are generated by the `@doc` decorators,
    # so explanations below are given as comments only.

    @staticmethod
    @doc(_doc_parse_func, parameters=_doc_parse_parameters_common)
    def parse(fname, **kwargs):
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)

        if start is None or end is None:
            # This only happens when we are reading with only one worker (Default)
            return pandas.read_json(fname, **kwargs)

        # "compression" is popped from kwargs because the bytes read from
        # `bio` are already uncompressed.
        compression = kwargs.pop("compression", "infer")
        storage_options = kwargs.pop("storage_options", None) or {}
        with OpenFile(fname, "rb", compression, **storage_options) as bio:
            bio.seek(start)
            to_read = b"" + bio.read(end - start)

        # Columns the caller expects this byte range to produce.
        expected_columns = kwargs.pop("columns")
        pandas_df = pandas.read_json(BytesIO(to_read), **kwargs)

        if not pandas_df.columns.equals(expected_columns):
            raise ModinAssumptionError("Columns must be the same across all rows.")

        metadata = [len(pandas_df), pandas_df.dtypes, pandas_df.columns]
        return _split_result_for_readers(1, num_splits, pandas_df) + metadata
@doc(_doc_pandas_parser_class, data_type="PARQUET data")
class PandasParquetParser(PandasParser):
    # The class and `parse` docstrings are generated by the `@doc` decorators.

    @staticmethod
    def _read_row_group_chunk(
        f, row_group_start, row_group_end, columns, filters, engine, to_pandas_kwargs
    ):  # noqa: GL08
        """
        Read a contiguous range of row groups from one parquet file.

        Parameters
        ----------
        f : file-like object
            Opened parquet file.
        row_group_start : int
            First row group to read.
        row_group_end : int
            Row group to stop at (exclusive).
        columns : list-like or None
            Columns to read.
        filters : list or None
            Row filters forwarded to the engine.
        engine : {"pyarrow", "fastparquet"}
            Parquet library to use.
        to_pandas_kwargs : dict
            Keyword arguments for pyarrow's ``to_pandas`` (pyarrow engine only).

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        ValueError
            If `engine` is neither "pyarrow" nor "fastparquet".
        """
        if engine == "pyarrow":
            if filters is not None:
                import pyarrow.dataset as ds
                from pyarrow.parquet import filters_to_expression

                parquet_format = ds.ParquetFileFormat()
                fragment = parquet_format.make_fragment(
                    f,
                    row_groups=range(
                        row_group_start,
                        row_group_end,
                    ),
                )
                dataset = ds.FileSystemDataset(
                    [fragment],
                    schema=fragment.physical_schema,
                    format=parquet_format,
                    filesystem=fragment.filesystem,
                )

                # This lower-level API doesn't have the ability to automatically handle pandas metadata
                # The following code is based on
                # https://github.com/apache/arrow/blob/f44e28fa03a64ae5b3d9352d21aee2cc84f9af6c/python/pyarrow/parquet/core.py#L2619-L2628

                # if use_pandas_metadata, we need to include index columns in the
                # column selection, to be able to restore those in the pandas DataFrame
                metadata = dataset.schema.metadata or {}

                if b"pandas" in metadata and columns is not None:
                    index_columns = json.loads(metadata[b"pandas"].decode("utf8"))[
                        "index_columns"
                    ]
                    # In the pandas metadata, the index columns can either be string column names,
                    # or a dictionary that describes a RangeIndex.
                    # Here, we are finding the real data columns that need to be read to become part
                    # of the pandas Index, so we can skip the RangeIndex.
                    # Not only can a RangeIndex be trivially reconstructed later, but we actually
                    # ignore partition-level range indices, because we want to have a single Modin
                    # RangeIndex that spans all partitions.
                    index_columns = [
                        col for col in index_columns if not isinstance(col, dict)
                    ]
                    columns = list(columns) + list(set(index_columns) - set(columns))

                return dataset.to_table(
                    columns=columns,
                    filter=filters_to_expression(filters),
                ).to_pandas(**to_pandas_kwargs)
            else:
                from pyarrow.parquet import ParquetFile

                return (
                    ParquetFile(f)
                    .read_row_groups(
                        range(
                            row_group_start,
                            row_group_end,
                        ),
                        columns=columns,
                        use_pandas_metadata=True,
                    )
                    .to_pandas(**to_pandas_kwargs)
                )
        elif engine == "fastparquet":
            from fastparquet import ParquetFile

            return ParquetFile(f)[row_group_start:row_group_end].to_pandas(
                columns=columns,
                filters=filters,
                # Setting row_filter=True would perform filtering at the row level, which is more correct
                # (in line with pyarrow)
                # However, it doesn't work: https://github.com/dask/fastparquet/issues/873
                # Also, this would create incompatibility with pandas
            )
        else:
            # We shouldn't ever come to this case, so something went wrong
            raise ValueError(
                f"engine must be one of 'pyarrow', 'fastparquet', got: {engine}"
            )

    @staticmethod
    @doc(
        _doc_parse_func,
        parameters="""files_for_parser : list
    List of files to be read.
engine : str
    Parquet library to use (either PyArrow or fastparquet).
""",
    )
    def parse(files_for_parser, engine, **kwargs):
        columns = kwargs.get("columns", None)
        filters = kwargs.get("filters", None)
        # `storage_options` may be present in kwargs with an explicit `None`;
        # normalize it to a dict so it can be unpacked into `fsspec.open`
        # (same convention as the other parsers in this module).
        storage_options = kwargs.get("storage_options") or {}
        chunks = []
        # `single_worker_read` just passes in a string path or path-like object
        if isinstance(files_for_parser, (str, os.PathLike)):
            return pandas.read_parquet(files_for_parser, engine=engine, **kwargs)

        to_pandas_kwargs = PandasParser.get_types_mapper(kwargs["dtype_backend"])

        for file_for_parser in files_for_parser:
            if isinstance(file_for_parser.path, IOBase):
                # Already-open file objects are used as-is and are not closed here.
                context = contextlib.nullcontext(file_for_parser.path)
            else:
                context = fsspec.open(file_for_parser.path, **storage_options)
            with context as f:
                chunk = PandasParquetParser._read_row_group_chunk(
                    f,
                    file_for_parser.row_group_start,
                    file_for_parser.row_group_end,
                    columns,
                    filters,
                    engine,
                    to_pandas_kwargs,
                )
            chunks.append(chunk)
        df = pandas.concat(chunks)
        # The frame is returned whole together with its index and row count.
        return df, df.index, len(df)
@doc(_doc_pandas_parser_class, data_type="SQL queries or tables")
class PandasSQLParser(PandasParser):
    # The class and `parse` docstrings are generated by the `@doc` decorators.

    @staticmethod
    @doc(
        _doc_parse_func,
        parameters="""sql : str or SQLAlchemy Selectable (select or text object)
    SQL query to be executed or a table name.
con : SQLAlchemy connectable, str, or sqlite3 connection
    Connection object to database.
index_col : str or list of str
    Column(s) to set as index(MultiIndex).
read_sql_engine : str
    Underlying engine ('pandas' or 'connectorx') used for fetching query result.""",
    )
    def parse(sql, con, index_col, read_sql_engine, **kwargs):
        # connectorx is optional: fall back to pandas (with a warning) when it
        # was requested but cannot be imported.
        enable_cx = False
        if read_sql_engine == "Connectorx":
            try:
                import connectorx as cx
            except ImportError:
                warnings.warn(
                    "Switch to 'pandas.read_sql' since 'connectorx' is not installed, please run 'pip install connectorx'."
                )
            else:
                enable_cx = True

        num_splits = kwargs.pop("num_splits", None)

        # connectorx takes a connection string, pandas a live connection.
        if isinstance(con, ModinDatabaseConnection):
            if enable_cx:
                con = con.get_string()
            else:
                con = con.get_connection()

        if enable_cx:
            df = cx.read_sql(con, sql, index_col=index_col)
        else:
            df = pandas.read_sql(sql, con, index_col=index_col, **kwargs)

        if num_splits is None:
            # No partitioning requested: hand back the raw read result.
            return df

        # Without an explicit index column only the row count is returned.
        index = len(df) if index_col is None else df.index
        return _split_result_for_readers(1, num_splits, df) + [index, df.dtypes]
""" from __future__ import annotations import ast import hashlib import re import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Hashable, List, Literal, Optional, Union import numpy as np import pandas from pandas._libs import lib from pandas.api.types import is_scalar from pandas.core.apply import reconstruct_func from pandas.core.common import is_bool_indexer from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_any_dtype, is_list_like, is_numeric_dtype, ) from pandas.core.groupby.base import transformation_kernels from pandas.core.indexes.api import ensure_index_from_sequences from pandas.core.indexing import check_bool_indexer from pandas.errors import DataError from modin.config import CpuCount, RangePartitioning from modin.core.dataframe.algebra import ( Binary, Fold, GroupByReduce, Map, Reduce, TreeReduce, ) from modin.core.dataframe.algebra.default2pandas.groupby import ( GroupBy, GroupByDefault, SeriesGroupByDefault, ) from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.core.dataframe.pandas.metadata import ( DtypesDescriptor, ModinDtypes, ModinIndex, extract_dtype, ) from modin.core.storage_formats import BaseQueryCompiler from modin.error_message import ErrorMessage from modin.logging import get_logger from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, hashable, try_cast_to_pandas, wrap_udf_function, ) from .aggregations import CorrCovBuilder from .groupby import GroupbyReduceImpl, PivotTableImpl from .merge import MergeImpl from .utils import get_group_names, merge_partitioning if TYPE_CHECKING: from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe def _get_axis(axis): """ Build index labels getter of the specified axis. Parameters ---------- axis : {0, 1} Axis to get labels from. 0 is for index and 1 is for column. 
Returns ------- callable(PandasQueryCompiler) -> pandas.Index """ if axis == 0: return lambda self: self._modin_frame.index else: return lambda self: self._modin_frame.columns def _set_axis(axis): """ Build index labels setter of the specified axis. Parameters ---------- axis : {0, 1} Axis to set labels on. 0 is for index and 1 is for column. Returns ------- callable(PandasQueryCompiler) """ if axis == 0: def set_axis(self, idx): self._modin_frame.index = idx else: def set_axis(self, cols): self._modin_frame.columns = cols return set_axis def _str_map(func_name): """ Build function that calls specified string function on frames ``str`` accessor. Parameters ---------- func_name : str String function name to execute on ``str`` accessor. Returns ------- callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame """ def str_op_builder(df, *args, **kwargs): """Apply specified function against `str` accessor of the passed frame.""" str_s = df.squeeze(axis=1).str res = getattr(pandas.Series.str, func_name)(str_s, *args, **kwargs) if hasattr(res, "to_frame"): res = res.to_frame() return res return str_op_builder def _dt_prop_map(property_name): """ Build function that access specified property of the ``dt`` property of the passed frame. Parameters ---------- property_name : str Date-time property name to access. Returns ------- callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame Function to be applied in the partitions. Notes ----- This applies non-callable properties of ``Series.dt``. 
""" def dt_op_builder(df, *args, **kwargs): """Access specified date-time property of the passed frame.""" squeezed_df = df.squeeze(axis=1) if isinstance(squeezed_df, pandas.DataFrame) and len(squeezed_df.columns) == 0: return squeezed_df assert isinstance(squeezed_df, pandas.Series) prop_val = getattr(squeezed_df.dt, property_name) if isinstance(prop_val, pandas.Series): return prop_val.to_frame() elif isinstance(prop_val, pandas.DataFrame): return prop_val else: return pandas.DataFrame([prop_val]) return dt_op_builder def _dt_func_map(func_name): """ Build function that apply specified method against ``dt`` property of the passed frame. Parameters ---------- func_name : str Date-time function name to apply. Returns ------- callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame Function to be applied in the partitions. Notes ----- This applies callable methods of ``Series.dt``. """ def dt_op_builder(df, *args, **kwargs): """Apply specified function against ``dt`` accessor of the passed frame.""" dt_s = df.squeeze(axis=1).dt dt_func_result = getattr(pandas.Series.dt, func_name)(dt_s, *args, **kwargs) # If we don't specify the dtype for the frame, the frame might get the # wrong dtype, e.g. for to_pydatetime in https://github.com/modin-project/modin/issues/4436 return pandas.DataFrame(dt_func_result, dtype=dt_func_result.dtype) return dt_op_builder def copy_df_for_func(func, display_name: str = None): """ Build function that execute specified `func` against passed frame inplace. Built function copies passed frame, applies `func` to the copy and returns the modified frame. Parameters ---------- func : callable(pandas.DataFrame) The function, usually updates a dataframe inplace. display_name : str, optional The function's name, which is displayed by progress bar. Returns ------- callable(pandas.DataFrame) A callable function to be applied in the partitions. 
""" def caller(df, *args, **kwargs): """Apply specified function the passed frame inplace.""" df = df.copy() func(df, *args, **kwargs) return df if display_name is not None: caller.__name__ = display_name return caller def _series_logical_binop(func): """ Build a callable function to pass to Binary.register for Series logical operators. Parameters ---------- func : callable Binary operator method of pandas.Series to be applied. Returns ------- callable """ return lambda x, y, **kwargs: func( x.squeeze(axis=1), y.squeeze(axis=1) if kwargs.pop("squeeze_other", False) else y, **kwargs, ).to_frame() @_inherit_docstrings(BaseQueryCompiler) class PandasQueryCompiler(BaseQueryCompiler): """ Query compiler for the pandas storage format. This class translates common query compiler API into the DataFrame Algebra queries, that is supposed to be executed by :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe`. Parameters ---------- modin_frame : PandasDataframe Modin Frame to query with the compiled queries. shape_hint : {"row", "column", None}, default: None Shape hint for frames known to be a column or a row, otherwise None. """ _modin_frame: PandasDataframe _shape_hint: Optional[str] def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = None): self._modin_frame = modin_frame self._shape_hint = shape_hint storage_format = property(lambda self: self._modin_frame.storage_format) engine = property(lambda self: self._modin_frame.engine) @property def lazy_row_labels(self): """ Whether the row labels are computed lazily. Equivalent to `not self.frame_has_materialized_index`. Returns ------- bool """ return not self.frame_has_materialized_index @property def lazy_row_count(self): """ Whether the row count is computed lazily. Equivalent to `not self.frame_has_materialized_index`. Returns ------- bool """ return not self.frame_has_materialized_index @property def lazy_column_types(self): """ Whether the dtypes are computed lazily. 
Equivalent to `not self.frame_has_materialized_dtypes`. Returns ------- bool """ return not self.frame_has_materialized_dtypes @property def lazy_column_labels(self): """ Whether the column labels are computed lazily. Equivalent to `not self.frame_has_materialized_columns`. Returns ------- bool """ return not self.frame_has_materialized_columns @property def lazy_column_count(self): """ Whether the column count is are computed lazily. Equivalent to `not self.frame_has_materialized_columns`. Returns ------- bool """ return not self.frame_has_materialized_columns # The default implementation of stay_cost will cache some information # which will violate some assumptions in test_internals. Since this class # is only used for non-hybrid operations we simply return 0 here for now. def stay_cost(self, api_cls_name, operation, arguments): return 0 def finalize(self): self._modin_frame.finalize() def execute(self): self.finalize() self._modin_frame.wait_computations() def to_pandas(self): return self._modin_frame.to_pandas() @classmethod def from_pandas(cls, df, data_cls): return cls(data_cls.from_pandas(df)) @classmethod def from_arrow(cls, at, data_cls): return cls(data_cls.from_arrow(at)) # Dataframe exchange protocol def to_interchange_dataframe( self, nan_as_null: bool = False, allow_copy: bool = True ): return self._modin_frame.__dataframe__( nan_as_null=nan_as_null, allow_copy=allow_copy ) @classmethod def from_interchange_dataframe(cls, df: ProtocolDataframe, data_cls): return cls(data_cls.from_interchange_dataframe(df)) # END Dataframe exchange protocol index: pandas.Index = property(_get_axis(0), _set_axis(0)) columns: pandas.Index = property(_get_axis(1), _set_axis(1)) def get_axis_len(self, axis: Literal[0, 1]) -> int: """ Return the length of the specified axis. Parameters ---------- axis : {0, 1} Axis to return labels on. 
Returns ------- int """ if axis == 0: return len(self._modin_frame) else: return sum(self._modin_frame.column_widths) @property def dtypes(self) -> pandas.Series: return self._modin_frame.dtypes def get_dtypes_set(self): return self._modin_frame.get_dtypes_set() # END Index, columns, and dtypes objects # Metadata modification methods def add_prefix(self, prefix, axis=1): if axis == 1: return self.__constructor__( self._modin_frame.rename(new_col_labels=lambda x: f"{prefix}{x}") ) else: return self.__constructor__( self._modin_frame.rename(new_row_labels=lambda x: f"{prefix}{x}") ) def add_suffix(self, suffix, axis=1): if axis == 1: return self.__constructor__( self._modin_frame.rename(new_col_labels=lambda x: f"{x}{suffix}") ) else: return self.__constructor__( self._modin_frame.rename(new_row_labels=lambda x: f"{x}{suffix}") ) # END Metadata modification methods # Copy # For copy, we don't want a situation where we modify the metadata of the # copies if we end up modifying something here. We copy all of the metadata # to prevent that. def copy(self): return self.__constructor__(self._modin_frame.copy(), self._shape_hint) # END Copy # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote # compute. These operations should only ever be manipulations of the # metadata of the resulting object. It should just be a simple matter of # appending the other object's blocks and adding np.nan columns for the new # columns, if needed. If new columns are added, some compute may be # required, though it can be delayed. # # Currently this computation is not delayed, and it may make a copy of the # DataFrame in memory. This can be problematic and should be fixed in the # future. TODO (devin-petersohn): Delay reindexing def concat(self, axis, other, **kwargs): if not isinstance(other, list): other = [other] assert all( isinstance(o, type(self)) for o in other ), "Different Manager objects are being used. 
This is not allowed" sort = kwargs.get("sort", None) if sort is None: sort = False join = kwargs.get("join", "outer") ignore_index = kwargs.get("ignore_index", False) other_modin_frame = [o._modin_frame for o in other] new_modin_frame = self._modin_frame.concat(axis, other_modin_frame, join, sort) result = self.__constructor__(new_modin_frame) if ignore_index: if axis == 0: return result.reset_index(drop=True) else: result.columns = pandas.RangeIndex(len(result.columns)) return result return result # END Append/Concat/Join # Data Management Methods def free(self): # TODO create a way to clean up this object. return # END Data Management Methods # Data Movement Methods def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]: return NotImplemented @classmethod def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCompiler, Any]: return NotImplemented # END Data Movement Methods # To NumPy def to_numpy(self, **kwargs): return self._modin_frame.to_numpy(**kwargs) # END To NumPy # Binary operations (e.g. add, sub) # These operations require two DataFrames and will change the shape of the # data if the index objects don't match. An outer join + op is performed, # such that columns/rows that don't have an index on the other DataFrame # result in NaN values. 
add = Binary.register(pandas.DataFrame.add, infer_dtypes="try_sample") # 'combine' and 'combine_first' are working with UDFs, so it's better not so sample them combine = Binary.register(pandas.DataFrame.combine, infer_dtypes="common_cast") combine_first = Binary.register( pandas.DataFrame.combine_first, infer_dtypes="common_cast" ) eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") equals = Binary.register( lambda df, other: pandas.DataFrame([[df.equals(other)]]), join_type=None, labels="drop", infer_dtypes="bool", ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="try_sample") ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") le = Binary.register(pandas.DataFrame.le, infer_dtypes="bool") lt = Binary.register(pandas.DataFrame.lt, infer_dtypes="bool") mod = Binary.register(pandas.DataFrame.mod, infer_dtypes="try_sample") mul = Binary.register(pandas.DataFrame.mul, infer_dtypes="try_sample") rmul = Binary.register(pandas.DataFrame.rmul, infer_dtypes="try_sample") ne = Binary.register(pandas.DataFrame.ne, infer_dtypes="bool") pow = Binary.register(pandas.DataFrame.pow, infer_dtypes="try_sample") radd = Binary.register(pandas.DataFrame.radd, infer_dtypes="try_sample") rfloordiv = Binary.register(pandas.DataFrame.rfloordiv, infer_dtypes="try_sample") rmod = Binary.register(pandas.DataFrame.rmod, infer_dtypes="try_sample") rpow = Binary.register(pandas.DataFrame.rpow, infer_dtypes="try_sample") rsub = Binary.register(pandas.DataFrame.rsub, infer_dtypes="try_sample") rtruediv = Binary.register(pandas.DataFrame.rtruediv, infer_dtypes="try_sample") sub = Binary.register(pandas.DataFrame.sub, infer_dtypes="try_sample") truediv = Binary.register(pandas.DataFrame.truediv, infer_dtypes="try_sample") __and__ = Binary.register(pandas.DataFrame.__and__, infer_dtypes="bool") __or__ = Binary.register(pandas.DataFrame.__or__, infer_dtypes="bool") __rand__ = 
Binary.register(pandas.DataFrame.__rand__, infer_dtypes="bool") __ror__ = Binary.register(pandas.DataFrame.__ror__, infer_dtypes="bool") __rxor__ = Binary.register(pandas.DataFrame.__rxor__, infer_dtypes="bool") __xor__ = Binary.register(pandas.DataFrame.__xor__, infer_dtypes="bool") df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", sort=False, ) series_update = Binary.register( copy_df_for_func( lambda x, y: pandas.Series.update(x.squeeze(axis=1), y.squeeze(axis=1)), display_name="update", ), join_type="left", sort=False, ) # Series logical operators take an additional fill_value flag that dataframe does not series_eq = Binary.register( _series_logical_binop(pandas.Series.eq), infer_dtypes="bool" ) series_ge = Binary.register( _series_logical_binop(pandas.Series.ge), infer_dtypes="bool" ) series_gt = Binary.register( _series_logical_binop(pandas.Series.gt), infer_dtypes="bool" ) series_le = Binary.register( _series_logical_binop(pandas.Series.le), infer_dtypes="bool" ) series_lt = Binary.register( _series_logical_binop(pandas.Series.lt), infer_dtypes="bool" ) series_ne = Binary.register( _series_logical_binop(pandas.Series.ne), infer_dtypes="bool" ) # Needed for numpy API _logical_and = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_and(df, other, *args, **kwargs) ), infer_dtypes="bool", ) _logical_or = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_or(df, other, *args, **kwargs) ), infer_dtypes="bool", ) _logical_xor = Binary.register( lambda df, other, *args, **kwargs: pandas.DataFrame( np.logical_xor(df, other, *args, **kwargs) ), infer_dtypes="bool", ) def where(self, cond, other, **kwargs): assert isinstance( cond, type(self) ), "Must have the same QueryCompiler subclass to perform this operation" # it's doesn't work if `other` is Series._query_compiler because # `n_ary_op` performs columns copartition both for `cond` and `other`. 
if isinstance(other, type(self)) and other._shape_hint is not None: other = other.to_pandas() if isinstance(other, type(self)): # Make sure to set join_type=None so the `where` result always has # the same row and column labels as `self`. new_modin_frame = self._modin_frame.n_ary_op( lambda df, cond, other: df.where(cond, other, **kwargs), [ cond._modin_frame, other._modin_frame, ], join_type=None, ) # This will be a Series of scalars to be applied based on the condition # dataframe. else: def where_builder_series(df, cond): return df.where(cond, other, **kwargs) new_modin_frame = self._modin_frame.n_ary_op( where_builder_series, [cond._modin_frame], join_type="left" ) return self.__constructor__(new_modin_frame) def merge(self, right, **kwargs): if RangePartitioning.get(): try: return MergeImpl.range_partitioning_merge(self, right, kwargs) except NotImplementedError as e: message = ( f"Can't use range-partitioning merge implementation because of: {e}" + "\nFalling back to a row-axis implementation." 
) get_logger().info(message) return MergeImpl.row_axis_merge(self, right, kwargs) def join(self, right: PandasQueryCompiler, **kwargs) -> PandasQueryCompiler: on = kwargs.get("on", None) how = kwargs.get("how", "left") sort = kwargs.get("sort", False) left = self if how in ["left", "inner"] or ( how == "right" and right._modin_frame._partitions.size != 0 ): reverted = False if how == "right": left, right = right, left reverted = True def map_func( left, right, kwargs=kwargs ) -> pandas.DataFrame: # pragma: no cover if reverted: df = pandas.DataFrame.join(right, left, **kwargs) else: df = pandas.DataFrame.join(left, right, **kwargs) return df right_to_broadcast = right._modin_frame.combine() left = left.__constructor__( left._modin_frame.broadcast_apply_full_axis( axis=1, func=map_func, # We're going to explicitly change the shape across the 1-axis, # so we want for partitioning to adapt as well keep_partitioning=False, num_splits=merge_partitioning( left._modin_frame, right._modin_frame, axis=1 ), other=right_to_broadcast, ) ) return left.sort_rows_by_column_values(on) if sort else left else: return left.default_to_pandas(pandas.DataFrame.join, right, **kwargs) # END Inter-Data operations # Reindex/reset_index (may shuffle data) def reindex(self, axis, labels, **kwargs): new_index, indexer = (self.index, None) if axis else self.index.reindex(labels) new_columns, _ = self.columns.reindex(labels) if axis else (self.columns, None) new_dtypes = None if self.frame_has_materialized_dtypes and kwargs.get("method", None) is None: # For columns, defining types is easier because we don't have to calculate the common # type, since the entire column is filled. A simple `reindex` covers our needs. # For rows, we can avoid calculating common types if we know that no new strings of # arbitrary type have been added (this information is in `indexer`). 
dtype = pandas.Index([kwargs.get("fill_value", np.nan)]).dtype if axis == 0: new_dtypes = self.dtypes.copy() # "-1" means that the required labels are missing in the dataframe and the # corresponding rows will be filled with "fill_value" that may change the column type. if indexer is not None and -1 in indexer: for col, col_dtype in new_dtypes.items(): new_dtypes[col] = find_common_type((col_dtype, dtype)) else: new_dtypes = self.dtypes.reindex(labels, fill_value=dtype) new_modin_frame = self._modin_frame.apply_full_axis( axis, lambda df: df.reindex(labels=labels, axis=axis, **kwargs), new_index=new_index, new_columns=new_columns, dtypes=new_dtypes, ) return self.__constructor__(new_modin_frame) def reset_index(self, **kwargs) -> PandasQueryCompiler: if self.lazy_row_labels: def _reset(df, *axis_lengths, partition_idx): # pragma: no cover df = df.reset_index(**kwargs) if isinstance(df.index, pandas.RangeIndex): # If the resulting index is a pure RangeIndex that means that # `.reset_index` actually dropped all of the levels of the # original index and so we have to recompute it manually for each partition start = sum(axis_lengths[:partition_idx]) stop = sum(axis_lengths[: partition_idx + 1]) df.index = pandas.RangeIndex(start, stop) return df new_columns = None if kwargs["drop"]: dtypes = self._modin_frame.copy_dtypes_cache() if self.frame_has_columns_cache: new_columns = self._modin_frame.copy_columns_cache( copy_lengths=True ) else: # concat index dtypes with column dtypes index_dtypes = self._modin_frame._index_cache.maybe_get_dtypes() try: dtypes = ModinDtypes.concat( [ index_dtypes, self._modin_frame._dtypes, ] ) except NotImplementedError: # may raise on duplicated names in materialized 'self.dtypes' dtypes = None if ( # can precompute new columns if we know columns and index names self.frame_has_materialized_columns and index_dtypes is not None ): empty_index = ( pandas.Index([0], name=index_dtypes.index[0]) if len(index_dtypes) == 1 else 
pandas.MultiIndex.from_arrays( [[i] for i in range(len(index_dtypes))], names=index_dtypes.index, ) ) new_columns = ( pandas.DataFrame(columns=self.columns, index=empty_index) .reset_index(**kwargs) .columns ) return self.__constructor__( self._modin_frame.apply_full_axis( axis=1, func=_reset, enumerate_partitions=True, new_columns=new_columns, dtypes=dtypes, sync_labels=False, pass_axis_lengths_to_partitions=True, ) ) allow_duplicates = kwargs.pop("allow_duplicates", lib.no_default) names = kwargs.pop("names", None) if allow_duplicates not in (lib.no_default, False) or names is not None: return self.default_to_pandas( pandas.DataFrame.reset_index, allow_duplicates=allow_duplicates, names=names, **kwargs, ) drop = kwargs.get("drop", False) level = kwargs.get("level", None) new_index = None if level is not None: if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] uniq_sorted_level = sorted(set(level)) if len(uniq_sorted_level) < self.index.nlevels: # We handle this by separately computing the index. We could just # put the labels into the data and pull them back out, but that is # expensive. new_index = ( self.index.droplevel(uniq_sorted_level) if len(level) < self.index.nlevels else pandas.RangeIndex(len(self.index)) ) elif not drop: uniq_sorted_level = list(range(self.index.nlevels)) if not drop: if len(uniq_sorted_level) < self.index.nlevels: # These are the index levels that will remain after the reset_index keep_levels = [ i for i in range(self.index.nlevels) if i not in uniq_sorted_level ] new_copy = self.copy() # Change the index to have only the levels that will be inserted # into the data. We will replace the old levels later. 
new_copy.index = self.index.droplevel(keep_levels) new_copy.index.names = [ ( "level_{}".format(level_value) if new_copy.index.names[level_index] is None else new_copy.index.names[level_index] ) for level_index, level_value in enumerate(uniq_sorted_level) ] new_modin_frame = new_copy._modin_frame.from_labels() # Replace the levels that will remain as a part of the index. new_modin_frame.index = new_index else: new_modin_frame = self._modin_frame.from_labels() if isinstance(new_modin_frame.columns, pandas.MultiIndex): # Fix col_level and col_fill in generated column names because from_labels works with assumption # that col_level and col_fill are not specified but it expands tuples in level names. col_level = kwargs.get("col_level", 0) col_fill = kwargs.get("col_fill", "") if col_level != 0 or col_fill != "": # Modify generated column names if col_level and col_fil have values different from default. levels_names_list = [ f"level_{level_index}" if level_name is None else level_name for level_index, level_name in enumerate(self.index.names) ] if col_fill is None: # Initialize col_fill if it is None. # This is some weird undocumented Pandas behavior to take first # element of the last column name. 
last_col_name = levels_names_list[uniq_sorted_level[-1]] last_col_name = ( list(last_col_name) if isinstance(last_col_name, tuple) else [last_col_name] ) if len(last_col_name) not in (1, self.columns.nlevels): raise ValueError( "col_fill=None is incompatible " + f"with incomplete column name {last_col_name}" ) col_fill = last_col_name[0] columns_list = new_modin_frame.columns.tolist() for level_index, level_value in enumerate(uniq_sorted_level): level_name = levels_names_list[level_value] # Expand tuples into separate items and fill the rest with col_fill top_level = [col_fill] * col_level middle_level = ( list(level_name) if isinstance(level_name, tuple) else [level_name] ) bottom_level = [col_fill] * ( self.columns.nlevels - (col_level + len(middle_level)) ) item = tuple(top_level + middle_level + bottom_level) if len(item) > self.columns.nlevels: raise ValueError( "Item must have length equal to number of levels." ) columns_list[level_index] = item new_modin_frame.columns = pandas.MultiIndex.from_tuples( columns_list, names=self.columns.names ) new_self = self.__constructor__(new_modin_frame) else: new_self = self.copy() new_self.index = ( # Cheaper to compute row lengths than index pandas.RangeIndex(sum(new_self._modin_frame.row_lengths)) if new_index is None else new_index ) return new_self def set_index_from_columns( self, keys: List[Hashable], drop: bool = True, append: bool = False ): new_modin_frame = self._modin_frame.to_labels(keys) if append: arrays = [] # Appending keeps the original order of the index levels, then appends the # new index objects. names = list(self.index.names) if isinstance(self.index, pandas.MultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: arrays.append(self.index) # Add the names in the correct order. 
names.extend(new_modin_frame.index.names) if isinstance(new_modin_frame.index, pandas.MultiIndex): for i in range(new_modin_frame.index.nlevels): arrays.append(new_modin_frame.index._get_level_values(i)) else: arrays.append(new_modin_frame.index) new_modin_frame.index = ensure_index_from_sequences(arrays, names) if not drop: # The algebraic operator for this operation always drops the column, but we # can copy the data in this object and just use the index from the result of # the query compiler call. result = self._modin_frame.copy() result.index = new_modin_frame.index else: result = new_modin_frame return self.__constructor__(result) # END Reindex/reset_index # Transpose # For transpose, we aren't going to immediately copy everything. Since the # actual transpose operation is very fast, we will just do it before any # operation that gets called on the transposed data. See _prepare_method # for how the transpose is applied. # # Our invariants assume that the blocks are transposed, but not the # data inside. Sometimes we have to reverse this transposition of blocks # for simplicity of implementation. def transpose(self, *args, **kwargs) -> PandasQueryCompiler: # Switch the index and columns and transpose the data within the blocks. return self.__constructor__(self._modin_frame.transpose()) def is_series_like(self): return len(self.columns) == 1 or len(self.index) == 1 # END Transpose # TreeReduce operations count = TreeReduce.register(pandas.DataFrame.count, pandas.DataFrame.sum) def _dtypes_sum(dtypes: pandas.Series, *func_args, **func_kwargs): # noqa: GL08 # The common type evaluation for `TreeReduce` operator may differ depending # on the pandas function, so it's better to pass a evaluation function that # should be defined for each Modin's function. 
return find_common_type(dtypes.tolist()) sum = TreeReduce.register(pandas.DataFrame.sum, compute_dtypes=_dtypes_sum) prod = TreeReduce.register(pandas.DataFrame.prod) any = TreeReduce.register(pandas.DataFrame.any, pandas.DataFrame.any) all = TreeReduce.register(pandas.DataFrame.all, pandas.DataFrame.all) # memory_usage adds an extra column for index usage, but we don't want to distribute # the index memory usage calculation. _memory_usage_without_index = TreeReduce.register( pandas.DataFrame.memory_usage, lambda x, *args, **kwargs: pandas.DataFrame.sum(x), axis=0, ) def memory_usage(self, **kwargs): index = kwargs.get("index", True) deep = kwargs.get("deep", False) usage_without_index = self._memory_usage_without_index(index=False, deep=deep) return ( self.from_pandas( pandas.DataFrame( [self.index.memory_usage()], columns=["Index"], index=[MODIN_UNNAMED_SERIES_LABEL], ), data_cls=type(self._modin_frame), ).concat(axis=1, other=[usage_without_index]) if index else usage_without_index ) def max(self, axis, **kwargs): def map_func(df, **kwargs): return pandas.DataFrame.max(df, **kwargs) def reduce_func(df, **kwargs): if kwargs.get("numeric_only", False): kwargs = kwargs.copy() kwargs["numeric_only"] = False return pandas.DataFrame.max(df, **kwargs) return TreeReduce.register(map_func, reduce_func)(self, axis=axis, **kwargs) def min(self, axis, **kwargs): def map_func(df, **kwargs): return pandas.DataFrame.min(df, **kwargs) def reduce_func(df, **kwargs): if kwargs.get("numeric_only", False): kwargs = kwargs.copy() kwargs["numeric_only"] = False return pandas.DataFrame.min(df, **kwargs) return TreeReduce.register(map_func, reduce_func)(self, axis=axis, **kwargs) def mean(self, axis, **kwargs): if kwargs.get("level") is not None or axis is None: return self.default_to_pandas(pandas.DataFrame.mean, axis=axis, **kwargs) skipna = kwargs.get("skipna", True) # TODO-FIX: this function may work incorrectly with user-defined "numeric" values. 
# Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get incorrect # divisor inside the reduce function. def map_fn(df, numeric_only=False, **kwargs): """ Perform Map phase of the `mean`. Compute sum and number of elements in a given partition. """ result = pandas.DataFrame( { "sum": df.sum(axis=axis, skipna=skipna, numeric_only=numeric_only), "count": df.count(axis=axis, numeric_only=numeric_only), } ) return result if axis else result.T def reduce_fn(df, **kwargs): """ Perform Reduce phase of the `mean`. Compute sum for all the the partitions and divide it to the total number of elements. """ sum_cols = df["sum"] if axis else df.loc["sum"] count_cols = df["count"] if axis else df.loc["count"] if not isinstance(sum_cols, pandas.Series): # If we got `NaN` as the result of the sum in any axis partition, # then we must consider the whole sum as `NaN`, so setting `skipna=False` sum_cols = sum_cols.sum(axis=axis, skipna=False) count_cols = count_cols.sum(axis=axis, skipna=False) return sum_cols / count_cols def compute_dtypes_fn(dtypes, axis, **kwargs): """ Compute the resulting Series dtype. When computing along rows and there are numeric and boolean columns Pandas returns `object`. In all other cases - `float64`. 
""" if ( axis == 1 and any(is_bool_dtype(t) for t in dtypes) and any(is_numeric_dtype(t) for t in dtypes) ): return "object" return "float64" return TreeReduce.register( map_fn, reduce_fn, compute_dtypes=compute_dtypes_fn, )(self, axis=axis, **kwargs) # END TreeReduce operations # Reduce operations idxmax = Reduce.register(pandas.DataFrame.idxmax) idxmin = Reduce.register(pandas.DataFrame.idxmin) def median(self, axis, **kwargs): if axis is None: return self.default_to_pandas(pandas.DataFrame.median, axis=axis, **kwargs) return Reduce.register(pandas.DataFrame.median)(self, axis=axis, **kwargs) def nunique(self, axis=0, dropna=True): if not RangePartitioning.get(): return Reduce.register(pandas.DataFrame.nunique)( self, axis=axis, dropna=dropna ) unsupported_message = "" if axis != 0: unsupported_message += ( "Range-partitioning 'nunique()' is only supported for 'axis=0'.\n" ) if len(self.columns) > 1: unsupported_message += "Range-partitioning 'nunique()' is only supported for a signle-column dataframe.\n" if len(unsupported_message) > 0: message = ( f"Can't use range-partitioning implementation for 'nunique' because:\n{unsupported_message}" + "Falling back to a full-axis reduce implementation." 
def _resample_func(
    self,
    resample_kwargs,
    func_name,
    new_columns=None,
    df_op=None,
    allow_range_impl=True,
    *args,
    **kwargs,
):
    """
    Resample underlying time-series data and apply aggregation on it.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters in the format of ``modin.pandas.DataFrame.resample`` signature.
    func_name : str
        Aggregation function name to apply on resampler object.
    new_columns : list of labels, optional
        Actual column labels of the resulted frame, supposed to be a hint for the
        Modin frame. If not specified will be computed automatically.
    df_op : callable(pandas.DataFrame) -> [pandas.DataFrame, pandas.Series], optional
        Preprocessor function to apply to the passed frame before resampling.
    allow_range_impl : bool, default: True
        Whether to use range-partitioning if ``RangePartitioning.get() is True``.
    *args : args
        Arguments to pass to the aggregation function.
    **kwargs : kwargs
        Arguments to pass to the aggregation function.

    Returns
    -------
    PandasQueryCompiler
        New QueryCompiler containing the result of resample aggregation.

    Notes
    -----
    ``*args`` is declared after three defaulted parameters, so extra positional
    values supplied by a caller bind to ``new_columns``, ``df_op`` and
    ``allow_range_impl`` first; only the remainder reaches ``*args``.
    """
    from modin.core.dataframe.pandas.dataframe.utils import ShuffleResample

    def map_func(df, resample_kwargs=resample_kwargs):  # pragma: no cover
        """Resample time-series data of the passed frame and apply aggregation function on it."""
        # Nothing to aggregate in an empty frame: only move the "on" column into
        # the index (when one was requested) and hand the frame back.
        if len(df) == 0:
            if resample_kwargs["on"] is not None:
                df = df.set_index(resample_kwargs["on"])
            return df
        # NOTE(review): "bin_bounds" is presumably attached to ``df.attrs`` by the
        # range-partitioning shuffle (``ShuffleResample``) -- confirm.
        if "bin_bounds" in df.attrs:
            timestamps = df.attrs["bin_bounds"]
            if isinstance(df.index, pandas.MultiIndex):
                # Keep only the index level being resampled; ``level`` may be given
                # either as a position (int) or as a level name.
                level_to_keep = resample_kwargs["level"]
                if isinstance(level_to_keep, int):
                    to_drop = [
                        lvl
                        for lvl in range(df.index.nlevels)
                        if lvl != level_to_keep
                    ]
                else:
                    to_drop = [
                        lvl for lvl in df.index.names if lvl != level_to_keep
                    ]
                df.index = df.index.droplevel(to_drop)
                # The index is single-level now, so "level" must be reset; work on
                # a copy so the dict captured from the outer scope is not modified.
                resample_kwargs = resample_kwargs.copy()
                resample_kwargs["level"] = None
            # Append all-NaN rows indexed by the bin-bound timestamps.
            filler = pandas.DataFrame(
                np.nan, index=pandas.Index(timestamps), columns=df.columns
            )
            df = pandas.concat([df, filler], copy=False)
        if df_op is not None:
            df = df_op(df)
        resampled_val = df.resample(**resample_kwargs)
        # Look the operation up on the Resampler class: a callable is invoked as a
        # method, anything else is read from the resampler as a plain attribute.
        op = getattr(pandas.core.resample.Resampler, func_name)
        if callable(op):
            try:
                # Fast path first: a ValueError can be raised here for read-only
                # Arrow buffers. Copying is only done on failure so that the
                # common case does not pay for a copy.
                val = op(resampled_val, *args, **kwargs)
            except ValueError:
                resampled_val = df.copy().resample(**resample_kwargs)
                val = op(resampled_val, *args, **kwargs)
        else:
            val = getattr(resampled_val, func_name)

        # Always return a frame so that the result can be stored as a partition.
        if isinstance(val, pandas.Series):
            return val.to_frame()
        else:
            return val

    # Keys for range-partitioning: either an index level (when resampling on the
    # index) or the "on" column.
    if resample_kwargs["on"] is None:
        level = [
            0 if resample_kwargs["level"] is None else resample_kwargs["level"]
        ]
        key_columns = []
    else:
        level = None
        key_columns = [resample_kwargs["on"]]

    # Full-axis implementation is used when range-partitioning is disallowed by
    # the caller, the resample is not row-wise, or the feature is switched off.
    if (
        not allow_range_impl
        or resample_kwargs["axis"] not in (0, "index")
        or not RangePartitioning.get()
    ):
        new_modin_frame = self._modin_frame.apply_full_axis(
            axis=0, func=map_func, new_columns=new_columns
        )
    else:
        new_modin_frame = self._modin_frame._apply_func_to_range_partitioning(
            key_columns=key_columns,
            level=level,
            func=map_func,
            shuffle_func_cls=ShuffleResample,
            resample_kwargs=resample_kwargs,
        )
    return self.__constructor__(new_modin_frame)
def resample_interpolate(
    self,
    resample_kwargs,
    method,
    axis,
    limit,
    inplace,
    limit_direction,
    limit_area,
    downcast,
    **kwargs,
):
    """
    Resample the data and fill the produced gaps with ``Resampler.interpolate``.

    Parameters
    ----------
    resample_kwargs : dict
        Resample parameters, passed through to ``_resample_func``.
    method : str
        Interpolation technique to use.
    axis : int or str
        Axis to interpolate along.
    limit : int or None
        Maximum number of consecutive NaNs to fill.
    inplace : bool
        Forwarded as-is to the pandas implementation.
    limit_direction : str
        Direction in which consecutive NaNs are filled.
    limit_area : str or None
        Restriction on which NaNs may be filled.
    downcast : str or None
        Forwarded as-is to the pandas implementation.
    **kwargs : dict
        Extra keyword arguments for the interpolating function.

    Returns
    -------
    PandasQueryCompiler
        Result of ``_resample_func`` for the "interpolate" operation.
    """
    return self._resample_func(
        resample_kwargs,
        "interpolate",
        # `method` used to be dropped here, which silently forced the pandas
        # default no matter what the caller asked for.
        method=method,
        axis=axis,
        limit=limit,
        inplace=inplace,
        limit_direction=limit_direction,
        limit_area=limit_area,
        downcast=downcast,
        # Range-partitioning is explicitly disabled for interpolation.
        allow_range_impl=False,
        **kwargs,
    )
def expanding_aggregate(self, axis, expanding_args, func, *args, **kwargs):
    """
    Apply an aggregation over an expanding window using a full-axis kernel.

    Parameters
    ----------
    axis : int
        Axis handed to ``apply_full_axis``.
    expanding_args : list
        Positional arguments for ``pandas.DataFrame.expanding``.
    func : callable, str, list or dict
        Aggregation passed to ``Expanding.aggregate`` as ``func``.
    *args : args
        Extra positional arguments for ``Expanding.aggregate``.
    **kwargs : kwargs
        Extra keyword arguments for ``Expanding.aggregate``.

    Returns
    -------
    PandasQueryCompiler
        New query compiler built from the aggregated frame.
    """

    def aggregate_partition(df):
        # Wrap into a DataFrame so a Series result is still stored as a frame.
        expanding_window = df.expanding(*expanding_args)
        aggregated = expanding_window.aggregate(*args, func=func, **kwargs)
        return pandas.DataFrame(aggregated)

    aggregated_frame = self._modin_frame.apply_full_axis(
        axis, aggregate_partition, new_index=self.index
    )
    return self.__constructor__(aggregated_frame)
def expanding_cov(
    self,
    fold_axis,
    expanding_args,
    squeeze_self,
    squeeze_other,
    other=None,
    pairwise=None,
    ddof=1,
    numeric_only=False,
    **kwargs,
):
    """
    Compute expanding-window covariance.

    Parameters
    ----------
    fold_axis : int
        Axis passed to the Fold operator in the single-column case.
    expanding_args : list
        Positional arguments for ``expanding``.
    squeeze_self : bool
        Whether to squeeze the partition to a Series before the computation.
    squeeze_other : bool
        Whether to squeeze `other` to a Series after materializing it.
    other : query compiler, optional
        Second operand; materialized with ``to_pandas()`` when given.
    pairwise : bool, optional
    ddof : int, default: 1
    numeric_only : bool, default: False
    **kwargs : dict
        Extra keyword arguments for ``Expanding.cov``.

    Returns
    -------
    object
        Result of ``default_to_pandas`` (multi-column) or of the Fold operator.
    """
    # Materialize the second operand once, up front.
    if other is None:
        other_for_pandas = other
    else:
        other_for_pandas = other.to_pandas()
        if squeeze_other:
            other_for_pandas = other_for_pandas.squeeze(axis=1)

    if len(self.columns) > 1:
        # computing covariance for each column requires having the other columns,
        # so we can't parallelize this as a full-column operation
        return self.default_to_pandas(
            lambda df: pandas.DataFrame.expanding(df, *expanding_args).cov(
                other=other_for_pandas,
                pairwise=pairwise,
                ddof=ddof,
                numeric_only=numeric_only,
                **kwargs,
            )
        )

    fold_cov = Fold.register(
        lambda df, expanding_args, *args, **kwargs: pandas.DataFrame(
            (df.squeeze(axis=1) if squeeze_self else df)
            .expanding(*expanding_args)
            .cov(*args, **kwargs)
        ),
        shape_preserved=True,
    )
    return fold_cov(
        self,
        fold_axis,
        expanding_args,
        other=other_for_pandas,
        pairwise=pairwise,
        ddof=ddof,
        numeric_only=numeric_only,
        **kwargs,
    )
# ``window_*`` kernels: each one receives a pandas frame together with the
# ``rolling_kwargs`` dict, builds ``df.rolling(**rolling_kwargs)``, runs a single
# aggregation on it and wraps the outcome in a ``pandas.DataFrame``.
# All of them are registered with ``shape_preserved=True``.
window_mean = Fold.register(
    lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
        df.rolling(**rolling_kwargs).mean(*args, **kwargs)
    ),
    shape_preserved=True,
)
window_sum = Fold.register(
    lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
        df.rolling(**rolling_kwargs).sum(*args, **kwargs)
    ),
    shape_preserved=True,
)
# ``var``/``std`` take ``ddof`` as a dedicated parameter and pass it by keyword.
window_var = Fold.register(
    lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
        df.rolling(**rolling_kwargs).var(ddof=ddof, *args, **kwargs)
    ),
    shape_preserved=True,
)
window_std = Fold.register(
    lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
        df.rolling(**rolling_kwargs).std(ddof=ddof, *args, **kwargs)
    ),
    shape_preserved=True,
)
def rolling_corr(self, axis, rolling_kwargs, other, pairwise, *args, **kwargs):
    """
    Compute rolling-window correlation.

    Parameters
    ----------
    axis : int
        Axis passed to the Fold operator in the single-column case.
    rolling_kwargs : dict
        Keyword arguments for ``rolling``.
    other : object
        Second operand, forwarded to ``Rolling.corr`` as ``other``.
    pairwise : bool or None
        Forwarded to ``Rolling.corr`` as ``pairwise``.
    *args : args
        Extra positional arguments for ``Rolling.corr``.
    **kwargs : kwargs
        Extra keyword arguments for ``Rolling.corr``.

    Returns
    -------
    object
        Result of the Fold operator (at most one column) or of
        ``default_to_pandas`` (several columns).
    """
    has_single_column = not len(self.columns) > 1
    if has_single_column:
        fold_corr = Fold.register(
            lambda df: pandas.DataFrame(
                df.rolling(**rolling_kwargs).corr(
                    *args, other=other, pairwise=pairwise, **kwargs
                )
            ),
            shape_preserved=True,
        )
        return fold_corr(self, axis)

    # With several columns the whole frame is processed by pandas at once.
    return self.default_to_pandas(
        lambda df: pandas.DataFrame.rolling(df, **rolling_kwargs).corr(
            *args, other=other, pairwise=pairwise, **kwargs
        )
    )
def unstack(self, level, fill_value):
    """
    Unstack index level(s) by running ``pandas.DataFrame.unstack`` along a full axis.

    Parameters
    ----------
    level : int, str or list-like
        Index level(s) to unstack, forwarded to ``pandas.DataFrame.unstack``.
    fill_value : object
        Forwarded to ``pandas.DataFrame.unstack``.

    Returns
    -------
    PandasQueryCompiler
        Unstacked frame; its index is recomputed when the result collapses to a
        single column.
    """
    # When the index is flat, or every level of a MultiIndex is unstacked, the
    # kernel is applied along axis 1, the result is declared as one column
    # labelled ``MODIN_UNNAMED_SERIES_LABEL`` and its index is rebuilt afterwards.
    if not isinstance(self.index, pandas.MultiIndex) or (
        isinstance(self.index, pandas.MultiIndex)
        and is_list_like(level)
        and len(level) == self.index.nlevels
    ):
        axis = 1
        new_columns = [MODIN_UNNAMED_SERIES_LABEL]
        need_reindex = True
    else:
        axis = 0
        new_columns = None
        need_reindex = False

    def map_func(df):  # pragma: no cover
        # ``DataFrame`` wrapper keeps the result a frame even if pandas returns a Series.
        return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value))

    def is_tree_like_or_1d(calc_index, valid_index):
        """
        Check whether specified index is a single dimensional or built in a tree manner.

        Parameters
        ----------
        calc_index : pandas.Index
            Frame index to check.
        valid_index : pandas.Index
            Frame index on the opposite from `calc_index` axis.

        Returns
        -------
        bool
            True if `calc_index` is not MultiIndex or MultiIndex and built in a tree manner.
            False otherwise.
        """
        if not isinstance(calc_index, pandas.MultiIndex):
            return True
        # Number of labels a full cartesian product of the levels would have.
        actual_len = 1
        for lvl in calc_index.levels:
            actual_len *= len(lvl)
        return len(self.index) * len(self.columns) == actual_len * len(valid_index)

    is_tree_like_or_1d_index = is_tree_like_or_1d(self.index, self.columns)
    is_tree_like_or_1d_cols = is_tree_like_or_1d(self.columns, self.index)

    is_all_multi_list = False
    if (
        isinstance(self.index, pandas.MultiIndex)
        and isinstance(self.columns, pandas.MultiIndex)
        and is_list_like(level)
        and len(level) == self.index.nlevels
        and is_tree_like_or_1d_index
        and is_tree_like_or_1d_cols
    ):
        # Both axes are MultiIndex and all index levels are unstacked: the real
        # column labels are set aside and replaced by positions on a copy; they
        # are used again below when the final index is computed.
        is_all_multi_list = True
        real_cols_bkp = self.columns
        obj = self.copy()
        obj.columns = np.arange(len(obj.columns))
    else:
        obj = self

    new_modin_frame = obj._modin_frame.apply_full_axis(
        axis, map_func, new_columns=new_columns
    )
    result = self.__constructor__(new_modin_frame)

    def compute_index(index, columns, consider_index=True, consider_columns=True):
        """
        Compute new index for the unstacked frame.

        Parameters
        ----------
        index : pandas.Index
            Index of the original frame.
        columns : pandas.Index
            Columns of the original frame.
        consider_index : bool, default: True
            Whether original index contains duplicated values.
            If True all duplicates will be dropped.
        consider_columns : bool, default: True
            Whether original columns contains duplicated values.
            If True all duplicates will be dropped.

        Returns
        -------
        pandas.Index
            New index to use in the unstacked frame.
        """

        def get_unique_level_values(index):
            return [
                index.get_level_values(lvl).unique()
                for lvl in np.arange(index.nlevels)
            ]

        # `index` may already be a list of per-level values (see the
        # `is_all_multi_list` branch); otherwise it is wrapped into a list.
        new_index = (
            get_unique_level_values(index)
            if consider_index
            else index if isinstance(index, list) else [index]
        )

        new_columns = (
            get_unique_level_values(columns) if consider_columns else [columns]
        )
        # Column levels come first, index levels follow.
        return pandas.MultiIndex.from_product([*new_columns, *new_index])

    if is_all_multi_list and is_tree_like_or_1d_index and is_tree_like_or_1d_cols:
        result = result.sort_index()
        index_level_values = [lvl for lvl in obj.index.levels]

        result.index = compute_index(
            index_level_values, real_cols_bkp, consider_index=False
        )
        return result

    if need_reindex:
        if is_tree_like_or_1d_index and is_tree_like_or_1d_cols:
            # Only one of the two axes is de-duplicated: the index if it is a
            # MultiIndex, otherwise the columns if they are a MultiIndex.
            is_recompute_index = isinstance(self.index, pandas.MultiIndex)
            is_recompute_columns = not is_recompute_index and isinstance(
                self.columns, pandas.MultiIndex
            )
            new_index = compute_index(
                self.index, self.columns, is_recompute_index, is_recompute_columns
            )
        elif is_tree_like_or_1d_index != is_tree_like_or_1d_cols:
            if isinstance(self.columns, pandas.MultiIndex) or not isinstance(
                self.index, pandas.MultiIndex
            ):
                return result
            else:
                # Build the (column label, *index labels) index explicitly:
                # index tuples are tiled once per column and every column label
                # is repeated once per row.
                index = (
                    self.index.sortlevel()[0]
                    if is_tree_like_or_1d_index
                    and not is_tree_like_or_1d_cols
                    and isinstance(self.index, pandas.MultiIndex)
                    else self.index
                )
                index = pandas.MultiIndex.from_tuples(
                    list(index) * len(self.columns)
                )
                columns = self.columns.repeat(len(self.index))
                index_levels = [
                    index.get_level_values(i) for i in range(index.nlevels)
                ]
                new_index = pandas.MultiIndex.from_arrays(
                    [columns] + index_levels,
                    names=self.columns.names + self.index.names,
                )
        else:
            return result
        result = result.reindex(0, new_index)
    return result
def isin(self, values, ignore_indices=False):
    """
    Build a boolean mask telling which elements are contained in `values`.

    Parameters
    ----------
    values : list-like, dict, pandas object or query compiler
        Values to test membership against. A query compiler of the same type
        as `self` is converted to pandas first.
    ignore_indices : bool, default: False
        Only used when `values` is a query compiler: squeeze it to a 1D object
        after materializing it.

    Returns
    -------
    PandasQueryCompiler
        Boolean mask produced by the Map operator.
    """
    hint = self._shape_hint
    if isinstance(values, type(self)):
        # HACK: if we don't cast to pandas, then the execution engine will try to
        # propagate the distributed Series to workers and most likely would have
        # some performance problems.
        # TODO: A better way of doing so could be passing this `values` as a query compiler
        # and broadcast accordingly.
        materialized = values.to_pandas()
        # Pandas logic is that it ignores indexing if 'values' is a 1D object
        values = materialized.squeeze(axis=1) if ignore_indices else materialized

    def isin_func(df, values):
        target = df.squeeze(axis=1) if hint == "column" else df
        mask = target.isin(values)
        if not isinstance(mask, pandas.Series):
            return mask
        # A Series result is turned back into a frame, keeping its name if it has one.
        label = mask.name
        if label is None:
            label = MODIN_UNNAMED_SERIES_LABEL
        return mask.to_frame(label)

    mapper = Map.register(isin_func, shape_hint=hint, dtypes=np.bool_)
    return mapper(self, values)
# Element-wise kernels backing the NumPy API. Each lambda applies one NumPy
# function to a pandas frame and wraps the output in a ``pandas.DataFrame``;
# the predicate kernels additionally declare a boolean result dtype.
# TODO: better way to distinguish methods for NumPy API?
_isfinite = Map.register(
    lambda df, *args, **kwargs: pandas.DataFrame(np.isfinite(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_isinf = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.isinf(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_isnat = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.isnat(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_isneginf = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.isneginf(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_isposinf = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.isposinf(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_iscomplex = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.iscomplex(df, *args, **kwargs)),
    dtypes=np.bool_,
)
_isreal = Map.register(  # Needed for numpy API
    lambda df, *args, **kwargs: pandas.DataFrame(np.isreal(df, *args, **kwargs)),
    dtypes=np.bool_,
)
# ``np.logical_not`` is registered directly, without a DataFrame wrapper.
_logical_not = Map.register(np.logical_not, dtypes=np.bool_)  # Needed for numpy API
# The numeric kernels below do not declare a result dtype.
_tanh = Map.register(
    lambda df, *args, **kwargs: pandas.DataFrame(np.tanh(df, *args, **kwargs))
)  # Needed for numpy API
_sqrt = Map.register(
    lambda df, *args, **kwargs: pandas.DataFrame(np.sqrt(df, *args, **kwargs))
)  # Needed for numpy API
_exp = Map.register(
    lambda df, *args, **kwargs: pandas.DataFrame(np.exp(df, *args, **kwargs))
)  # Needed for numpy API
def str_partition(self, sep=" ", expand=True):
    """
    Split each string at the first occurrence of `sep`.

    Parameters
    ----------
    sep : str, default: " "
        Separator to split on.
    expand : bool, default: True
        When truthy the call is delegated to the parent implementation;
        otherwise the registered ``_str_partition`` Map kernel is used.

    Returns
    -------
    object
        Result of whichever implementation handled the call.
    """
    if not expand:
        return self._str_partition(sep=sep, expand=False)
    # For `expand`, need an operator that can create more columns than before
    return super().str_partition(sep=sep, expand=expand)
def unique(self, keep="first", ignore_index=True, subset=None):
    """
    Drop duplicated rows, or return the unique values of a single column.

    Parameters
    ----------
    keep : {"first", "last", False}, default: "first"
        Which duplicates to keep, as in ``pandas.DataFrame.drop_duplicates``.
    ignore_index : bool, default: True
        As in ``pandas.DataFrame.drop_duplicates``.
    subset : list of labels, optional
        Columns to consider when identifying duplicates.

    Returns
    -------
    PandasQueryCompiler
        New query compiler holding the de-duplicated data.
    """
    # kernels with 'pandas.Series.unique()' work faster
    series_kernel_ok = (
        subset is None and ignore_index and len(self.columns) == 1 and keep is not False
    )

    if not series_kernel_ok and not RangePartitioning.get():
        return super().unique(keep=keep, ignore_index=ignore_index, subset=subset)

    if not RangePartitioning.get():
        # return self.to_pandas().squeeze(axis=1).unique() works faster
        # but returns pandas type instead of query compiler
        # TODO: https://github.com/modin-project/modin/issues/7182
        deduplicated = self._modin_frame.apply_full_axis(
            0,
            lambda x: x.squeeze(axis=1).unique(),
            new_columns=self.columns,
        )
        return self.__constructor__(deduplicated, shape_hint=self._shape_hint)

    # Range-partitioning path: pick the per-partition kernel, then shuffle by key.
    if series_kernel_ok:

        def partition_kernel(df):
            return pandas.DataFrame(
                df.squeeze(axis=1).unique(), columns=["__reduced__"]
            )

    else:

        def partition_kernel(df):
            return df.drop_duplicates(
                keep=keep, ignore_index=ignore_index, subset=subset
            )

    keys = subset if subset is not None else self.columns.tolist()
    deduplicated = self._modin_frame._apply_func_to_range_partitioning(
        key_columns=keys,
        func=partition_kernel,
        preserve_columns=True,
    )
    return self.__constructor__(deduplicated, shape_hint=self._shape_hint)
dt_is_year_end = Map.register(_dt_prop_map("is_year_end"), dtypes=np.bool_)
dt_is_leap_year = Map.register(_dt_prop_map("is_leap_year"), dtypes=np.bool_)
dt_daysinmonth = Map.register(_dt_prop_map("daysinmonth"), dtypes=np.int64)
dt_days_in_month = Map.register(_dt_prop_map("days_in_month"), dtypes=np.int64)
dt_asfreq = Map.register(_dt_func_map("asfreq"))
dt_to_period = Map.register(_dt_func_map("to_period"))
dt_to_pydatetime = Map.register(_dt_func_map("to_pydatetime"), dtypes=np.object_)
dt_tz_localize = Map.register(_dt_func_map("tz_localize"))
dt_tz_convert = Map.register(_dt_func_map("tz_convert"))
dt_normalize = Map.register(_dt_func_map("normalize"))
dt_strftime = Map.register(_dt_func_map("strftime"), dtypes=np.object_)
dt_round = Map.register(_dt_func_map("round"))
dt_floor = Map.register(_dt_func_map("floor"))
dt_ceil = Map.register(_dt_func_map("ceil"))
dt_month_name = Map.register(_dt_func_map("month_name"), dtypes=np.object_)
dt_day_name = Map.register(_dt_func_map("day_name"), dtypes=np.object_)
dt_to_pytimedelta = Map.register(_dt_func_map("to_pytimedelta"), dtypes=np.object_)
dt_total_seconds = Map.register(_dt_func_map("total_seconds"), dtypes=np.float64)
dt_seconds = Map.register(_dt_prop_map("seconds"), dtypes=np.int64)
dt_days = Map.register(_dt_prop_map("days"), dtypes=np.int64)
dt_microseconds = Map.register(_dt_prop_map("microseconds"), dtypes=np.int64)
dt_nanoseconds = Map.register(_dt_prop_map("nanoseconds"), dtypes=np.int64)
dt_qyear = Map.register(_dt_prop_map("qyear"), dtypes=np.int64)
dt_start_time = Map.register(_dt_prop_map("start_time"))
dt_end_time = Map.register(_dt_prop_map("end_time"))
dt_to_timestamp = Map.register(_dt_func_map("to_timestamp"))

# END Dt map partitions operations

def astype(self, col_dtypes, errors: str = "raise"):
    # `errors` parameter needs to be part of the function signature because
    # other query compilers may not take care of error handling at the API
    # layer. This query compiler assumes there won't be any errors due to
    # invalid type keys.
    return self.__constructor__(
        self._modin_frame.astype(col_dtypes, errors=errors),
        shape_hint=self._shape_hint,
    )

def infer_objects(self):
    # Delegates the dtype inference to the underlying Modin frame.
    return self.__constructor__(self._modin_frame.infer_objects())

# Column/Row partitions reduce operations

def first_valid_index(self):
    # Returns the label of the first row holding a non-NA value, or `None`
    # when the reduction finds no valid position at all.
    def first_valid_index_builder(df):
        """Get the position of the first valid index in a single partition."""
        return df.set_axis(pandas.RangeIndex(len(df.index)), axis="index").apply(
            lambda df: df.first_valid_index()
        )

    # We get the minimum from each column, then take the min of that to get
    # first_valid_index. The `to_pandas()` here is just for a single value and
    # `squeeze` will convert it to a scalar.
    first_result = (
        self.__constructor__(self._modin_frame.reduce(0, first_valid_index_builder))
        .min(axis=1)
        .to_pandas()
        .squeeze()
    )
    # When every value is NA the reduced position is NA as well; indexing
    # `self.index` with it would raise, whereas pandas documents `None` here.
    if is_scalar(first_result) and pandas.isna(first_result):
        return None
    return self.index[first_result]

def last_valid_index(self):
    # Returns the label of the last row holding a non-NA value, or `None`
    # when the reduction finds no valid position at all.
    def last_valid_index_builder(df):
        """Get the position of the last valid index in a single partition."""
        return df.set_axis(pandas.RangeIndex(len(df.index)), axis="index").apply(
            lambda df: df.last_valid_index()
        )

    # We get the maximum from each column, then take the max of that to get
    # last_valid_index. The `to_pandas()` here is just for a single value and
    # `squeeze` will convert it to a scalar.
    last_result = (
        self.__constructor__(self._modin_frame.reduce(0, last_valid_index_builder))
        .max(axis=1)
        .to_pandas()
        .squeeze()
    )
    # Same guard as in `first_valid_index`: an NA position means there is no
    # valid row, so report `None` instead of failing on the index lookup.
    if is_scalar(last_result) and pandas.isna(last_result):
        return None
    return self.index[last_result]

# END Column/Row partitions reduce operations

def describe(self, percentiles: np.ndarray):
    # Use pandas to calculate the correct columns
    empty_df = (
        pandas.DataFrame(columns=self.columns)
        .astype(self.dtypes)
        .describe(percentiles, include="all")
    )
    new_index = empty_df.index

    def describe_builder(df, internal_indices=[]):  # pragma: no cover
        """Apply `describe` function to the subset of columns in a single partition."""
        # The index of the resulting dataframe is the same amongst all partitions
        # when dealing with the same data type. However, if we work with columns
        # that contain strings, we can get extra values in our result index such as
        # 'unique', 'top', and 'freq'. Since we call describe() on each partition,
        # we can have cases where certain partitions do not contain any of the
        # object string data leading to an index mismatch between partitions.
        # Thus, we must reindex each partition with the global new_index.
        return (
            df.iloc[:, internal_indices]
            .describe(percentiles=percentiles, include="all")
            .reindex(new_index)
        )

    return self.__constructor__(
        self._modin_frame.apply_full_axis_select_indices(
            0,
            describe_builder,
            empty_df.columns,
            new_index=new_index,
            new_columns=empty_df.columns,
        )
    )

# END Column/Row partitions reduce operations over select indices

# Map across rows/columns
# These operations require some global knowledge of the full column/row
# that is being operated on. This means that we have to put all of that
# data in the same place.
cummax = Fold.register(pandas.DataFrame.cummax, shape_preserved=True)
cummin = Fold.register(pandas.DataFrame.cummin, shape_preserved=True)
cumsum = Fold.register(pandas.DataFrame.cumsum, shape_preserved=True)
cumprod = Fold.register(pandas.DataFrame.cumprod, shape_preserved=True)
_diff = Fold.register(pandas.DataFrame.diff, shape_preserved=True)

def diff(self, axis, periods):
    # The fold has to run along the same axis the difference is taken over.
    return self._diff(fold_axis=axis, axis=axis, periods=periods)

def clip(self, lower, upper, **kwargs):
    # Query-compiler bounds are materialized into pandas objects first.
    if isinstance(lower, BaseQueryCompiler):
        lower = lower.to_pandas().squeeze(1)
    if isinstance(upper, BaseQueryCompiler):
        upper = upper.to_pandas().squeeze(1)
    kwargs["upper"] = upper
    kwargs["lower"] = lower
    axis = kwargs.get("axis", 0)
    # List-like bounds go through `fold` along `axis`; scalar bounds can be
    # applied to every partition independently with `map`.
    if is_list_like(lower) or is_list_like(upper):
        new_modin_frame = self._modin_frame.fold(
            axis, lambda df: df.clip(**kwargs), shape_preserved=True
        )
    else:
        new_modin_frame = self._modin_frame.map(lambda df: df.clip(**kwargs))
    return self.__constructor__(new_modin_frame)

corr = CorrCovBuilder.build_corr_method()

def cov(self, min_periods=None, ddof=1):
    if self.get_pandas_backend() == "pyarrow":
        return super().cov(min_periods=min_periods, ddof=ddof)
    # _nancorr use numpy which incompatible with pandas dataframes on pyarrow
    return self._nancorr(min_periods=min_periods, cov=True, ddof=ddof)

def _nancorr(self, min_periods=1, cov=False, ddof=1):
    """
    Compute either pairwise covariance or pairwise correlation of columns.

    This function considers NA/null values the same like pandas does.

    Parameters
    ----------
    min_periods : int, default: 1
        Minimum number of observations required per pair of columns
        to have a valid result.
    cov : boolean, default: False
        Either covariance or correlation should be computed.
    ddof : int, default: 1
        Means Delta Degrees of Freedom. The divisor used in calculations.

    Returns
    -------
    PandasQueryCompiler
        The covariance or correlation matrix.

    Notes
    -----
    This method is only used to compute covariance at the moment.
    """
    other = self.to_numpy()
    try:
        other_mask = self._isfinite().to_numpy()
    except TypeError as err:
        # Pandas raises ValueError on unsupported types, so casting
        # the exception to a proper type
        raise ValueError("Unsupported types with 'numeric_only=False'") from err
    n_cols = other.shape[1]

    if min_periods is None:
        min_periods = 1

    def map_func(df):  # pragma: no cover
        """Compute covariance or correlation matrix for the passed frame."""
        df = df.to_numpy()
        n_rows = df.shape[0]
        df_mask = np.isfinite(df)

        result = np.empty((n_rows, n_cols), dtype=np.float64)

        for i in range(n_rows):
            df_ith_row = df[i]
            df_ith_mask = df_mask[i]

            for j in range(n_cols):
                other_jth_col = other[:, j]

                # only observations finite on both sides take part
                valid = df_ith_mask & other_mask[:, j]

                vx = df_ith_row[valid]
                vy = other_jth_col[valid]

                nobs = len(vx)

                if nobs < min_periods:
                    result[i, j] = np.nan
                else:
                    vx = vx - vx.mean()
                    vy = vy - vy.mean()
                    sumxy = (vx * vy).sum()
                    sumxx = (vx * vx).sum()
                    sumyy = (vy * vy).sum()

                    denom = (nobs - ddof) if cov else np.sqrt(sumxx * sumyy)
                    if denom != 0:
                        result[i, j] = sumxy / denom
                    else:
                        result[i, j] = np.nan

        return pandas.DataFrame(result)

    columns = self.columns
    index = columns.copy()
    transponed_self = self.transpose()
    new_modin_frame = transponed_self._modin_frame.apply_full_axis(
        1, map_func, new_index=index, new_columns=columns
    )
    return transponed_self.__constructor__(new_modin_frame)

def dot(self, other, squeeze_self=None, squeeze_other=None):
    if isinstance(other, PandasQueryCompiler):
        other = (
            other.to_pandas().squeeze(axis=1)
            if squeeze_other
            else other.to_pandas()
        )

    num_cols = other.shape[1] if len(other.shape) > 1 else 1
    if len(self.columns) == 1:
        new_index = (
            [MODIN_UNNAMED_SERIES_LABEL]
            if (len(self.index) == 1 or squeeze_self) and num_cols == 1
            else None
        )
        new_columns = (
            [MODIN_UNNAMED_SERIES_LABEL] if squeeze_self and num_cols == 1 else None
        )
        axis = 0
    else:
        new_index = self.index
        new_columns = [MODIN_UNNAMED_SERIES_LABEL] if num_cols == 1 else None
        axis = 1

    # If either new index or new columns are supposed to be a single-dimensional,
    # then we use a special labeling for them. Besides setting the new labels as
    # a metadata to the resulted frame, we also want to set them inside the kernel,
    # so actual partitions would be labeled accordingly (there's a 'sync_label'
    # parameter that can do the same, but doing it manually is faster)
    align_index = isinstance(new_index, list) and new_index == [
        MODIN_UNNAMED_SERIES_LABEL
    ]
    align_columns = new_columns == [MODIN_UNNAMED_SERIES_LABEL]

    def map_func(df, other=other, squeeze_self=squeeze_self):  # pragma: no cover
        """Compute matrix multiplication of the passed frames."""
        result = df.squeeze(axis=1).dot(other) if squeeze_self else df.dot(other)

        if is_list_like(result):
            res = pandas.DataFrame(result)
        else:
            res = pandas.DataFrame([result])

        # manual aligning with external index to avoid `sync_labels` overhead
        if align_columns:
            res.columns = [MODIN_UNNAMED_SERIES_LABEL]
        if align_index:
            res.index = [MODIN_UNNAMED_SERIES_LABEL]
        return res

    new_modin_frame = self._modin_frame.apply_full_axis(
        axis,
        map_func,
        new_index=new_index,
        new_columns=new_columns,
        sync_labels=False,
    )
    return self.__constructor__(new_modin_frame)

def _nsort(self, n, columns=None, keep="first", sort_type="nsmallest"):
    """
    Return first N rows of the data sorted in the specified order.

    Parameters
    ----------
    n : int
        Number of rows to return.
    columns : list of labels, optional
        Column labels to sort data by.
    keep : {"first", "last", "all"}, default: "first"
        How to pick first rows in case of duplicated values:
        - "first": prioritize first occurrence.
        - "last": prioritize last occurrence.
        - "all": do not drop any duplicates, even if it means selecting more than `n` rows.
    sort_type : {"nsmallest", "nlargest"}, default: "nsmallest"
        Name of the pandas method to apply: "nsmallest" selects the rows with
        the smallest values (ascending order), "nlargest" selects the rows
        with the largest values (descending order).

    Returns
    -------
    PandasQueryCompiler
        New QueryCompiler containing the first N rows of the data
        sorted in the given order.
    """

    def map_func(df, n=n, keep=keep, columns=columns):  # pragma: no cover
        """Return first `N` rows of the sorted data for a single partition."""
        if columns is None:
            return pandas.DataFrame(
                getattr(pandas.Series, sort_type)(
                    df.squeeze(axis=1), n=n, keep=keep
                )
            )
        return getattr(pandas.DataFrame, sort_type)(
            df, n=n, columns=columns, keep=keep
        )

    if columns is None:
        new_columns = [MODIN_UNNAMED_SERIES_LABEL]
    else:
        new_columns = self.columns

    new_modin_frame = self._modin_frame.apply_full_axis(
        axis=0, func=map_func, new_columns=new_columns
    )
    return self.__constructor__(new_modin_frame)

def nsmallest(self, *args, **kwargs):
    return self._nsort(sort_type="nsmallest", *args, **kwargs)

def nlargest(self, *args, **kwargs):
    return self._nsort(sort_type="nlargest", *args, **kwargs)

def eval(self, expr, **kwargs):
    # Make a copy of columns and eval on the copy to determine if result type is
    # series or not
    empty_eval = (
        pandas.DataFrame(columns=self.columns)
        .astype(self.dtypes)
        .eval(expr, inplace=False, **kwargs)
    )
    if isinstance(empty_eval, pandas.Series):
        new_columns = (
            [empty_eval.name]
            if empty_eval.name is not None
            else [MODIN_UNNAMED_SERIES_LABEL]
        )
    else:
        new_columns = empty_eval.columns
    new_modin_frame = self._modin_frame.apply_full_axis(
        1,
        lambda df: pandas.DataFrame(df.eval(expr, inplace=False, **kwargs)),
        new_index=self.index,
        new_columns=new_columns,
    )
    return self.__constructor__(new_modin_frame)

def mode(self, **kwargs):
    axis = kwargs.get("axis", 0)

    def mode_builder(df):  # pragma: no cover
        """Compute modes for a single partition."""
        result = pandas.DataFrame(df.mode(**kwargs))
        # We return a dataframe with the same shape as the input to ensure
        # that all the partitions will be the same shape
        if axis == 0 and len(df) != len(result):
            # Pad rows
            result = result.reindex(index=pandas.RangeIndex(len(df.index)))
        elif axis == 1 and len(df.columns) != len(result.columns):
            # Pad columns
            result = result.reindex(columns=pandas.RangeIndex(len(df.columns)))
        return pandas.DataFrame(result)

    if axis == 0:
        new_index = pandas.RangeIndex(len(self.index))
        new_columns = self.columns
    else:
        new_index = self.index
        new_columns = pandas.RangeIndex(len(self.columns))
    new_modin_frame = self._modin_frame.apply_full_axis(
        axis, mode_builder, new_index=new_index, new_columns=new_columns
    )
    # the padding added inside the kernel is removed again here
    return self.__constructor__(new_modin_frame).dropna(axis=axis, how="all")

def fillna(self, **kwargs):
    squeeze_self = kwargs.pop("squeeze_self", False)
    squeeze_value = kwargs.pop("squeeze_value", False)
    axis = kwargs.get("axis", 0)
    value = kwargs.pop("value")
    method = kwargs.get("method", None)
    limit = kwargs.get("limit", None)
    # `method` and `limit` need to see the whole axis at once
    full_axis = method is not None or limit is not None
    new_dtypes = None
    if isinstance(value, BaseQueryCompiler):
        # This code assumes that the operation occurs with the same query compiler
        assert isinstance(value, PandasQueryCompiler)
        if squeeze_self:
            # Self is a Series type object
            if full_axis:
                value = value.to_pandas().squeeze(axis=1)

                def fillna_builder(series):  # pragma: no cover
                    # `limit` parameter works only on `Series` type, so we have to squeeze both objects to get
                    # correct behavior.
                    return series.squeeze(axis=1).fillna(value=value, **kwargs)

                new_modin_frame = self._modin_frame.apply_full_axis(
                    0, fillna_builder
                )
            else:

                def fillna_builder(df, value_arg):
                    if isinstance(value_arg, pandas.DataFrame):
                        value_arg = value_arg.squeeze(axis=1)
                    res = df.squeeze(axis=1).fillna(value=value_arg, **kwargs)
                    return pandas.DataFrame(res)

                new_modin_frame = self._modin_frame.n_ary_op(
                    fillna_builder,
                    [value._modin_frame],
                    join_type="left",
                    copartition_along_columns=False,
                )

            return self.__constructor__(new_modin_frame)
        else:
            # Self is a DataFrame type object
            if squeeze_value:
                # Value is Series type object
                value = value.to_pandas().squeeze(axis=1)

                def fillna(df):
                    return df.fillna(value=value, **kwargs)

                # Continue to end of this function

            else:
                # Value is a DataFrame type object
                def fillna_builder(df, right):
                    return df.fillna(value=right, **kwargs)

                new_modin_frame = self._modin_frame.broadcast_apply(
                    0, fillna_builder, value._modin_frame
                )
                return self.__constructor__(new_modin_frame)

    elif isinstance(value, dict):
        if squeeze_self:
            # For Series dict works along the index.
            def fillna(df):
                return pandas.DataFrame(
                    df.squeeze(axis=1).fillna(value=value, **kwargs)
                )

        else:
            # For DataFrames dict works along columns, all columns have to be present.
            def fillna(df):
                func_dict = {
                    col: val for (col, val) in value.items() if col in df.columns
                }
                return df.fillna(value=func_dict, **kwargs)

            if self.frame_has_materialized_dtypes:
                dtypes = self.dtypes
                value_dtypes = pandas.DataFrame(
                    {k: [v] for (k, v) in value.items()}
                ).dtypes
                # dtypes are carried over only if no fill value widens its column
                if all(
                    find_common_type([dtypes[col], dtype]) == dtypes[col]
                    for (col, dtype) in value_dtypes.items()
                    if col in dtypes
                ):
                    new_dtypes = dtypes

    else:
        if self.frame_has_materialized_dtypes:
            dtype = pandas.Series(value).dtype
            if all(find_common_type([t, dtype]) == t for t in self.dtypes):
                new_dtypes = self.dtypes

        def fillna(df):
            return df.fillna(value=value, **kwargs)

    if full_axis:
        new_modin_frame = self._modin_frame.fold(axis, fillna, shape_preserved=True)
    else:
        new_modin_frame = self._modin_frame.map(fillna, dtypes=new_dtypes)
    return self.__constructor__(new_modin_frame)

def quantile_for_list_of_values(self, **kwargs):
    axis = kwargs.get("axis", 0)
    q = kwargs.get("q")
    numeric_only = kwargs.get("numeric_only", True)
    assert isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list, tuple))

    if numeric_only:
        new_columns = self._modin_frame.numeric_columns()
    else:
        new_columns = [
            col
            for col, dtype in zip(self.columns, self.dtypes)
            if (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM"))
        ]
    if axis == 1:
        query_compiler = self.getitem_column_array(new_columns)
        new_columns = self.index
    else:
        query_compiler = self

    def quantile_builder(df, **kwargs):
        result = df.quantile(**kwargs)
        return result.T if kwargs.get("axis", 0) == 1 else result

    # This took a long time to debug, so here is the rundown of why this is needed.
    # Previously, we were operating on select indices, but that was broken. We were
    # not correctly setting the columns/index. Because of how we compute `to_pandas`
    # and because of the static nature of the index for `axis=1` it is easier to
    # just handle this as the transpose (see `quantile_builder` above for the
    # transpose within the partition) than it is to completely rework other
    # internal methods. Basically we are returning the transpose of the object for
    # correctness and cleanliness of the code.
    if axis == 1:
        q_index = new_columns
        new_columns = pandas.Index(q)
    else:
        q_index = pandas.Index(q)
    new_modin_frame = query_compiler._modin_frame.apply_full_axis(
        axis,
        lambda df: quantile_builder(df, **kwargs),
        new_index=q_index,
        new_columns=new_columns,
        dtypes=np.float64,
    )
    result = self.__constructor__(new_modin_frame)
    return result.transpose() if axis == 1 else result

def rank(self, **kwargs):
    axis = kwargs.get("axis", 0)
    numeric_only = True if axis else kwargs.get("numeric_only", False)
    new_modin_frame = self._modin_frame.apply_full_axis(
        axis,
        lambda df: df.rank(**kwargs),
        new_index=self._modin_frame.copy_index_cache(copy_lengths=True),
        # with `numeric_only` the column labels are not reused from the cache
        new_columns=(
            self._modin_frame.copy_columns_cache(copy_lengths=True)
            if not numeric_only
            else None
        ),
        dtypes=np.float64,
        sync_labels=False,
    )
    return self.__constructor__(new_modin_frame)

def sort_index(self, **kwargs):
    axis = kwargs.pop("axis", 0)
    level = kwargs.pop("level", None)
    sort_remaining = kwargs.pop("sort_remaining", True)
    kwargs["inplace"] = False

    if level is not None or self.has_multiindex(axis=axis):
        return self.default_to_pandas(
            pandas.DataFrame.sort_index,
            axis=axis,
            level=level,
            sort_remaining=sort_remaining,
            **kwargs,
        )

    # sort_index can have ascending be None and behaves as if it is False.
    # sort_values cannot have ascending be None. Thus, the following logic is to
    # convert the ascending argument to one that works with sort_values
    ascending = kwargs.pop("ascending", True)
    if ascending is None:
        ascending = False
    kwargs["ascending"] = ascending
    if axis:
        new_columns = self.columns.to_frame().sort_index(**kwargs).index
        new_index = self.index
    else:
        new_index = self.index.to_frame().sort_index(**kwargs).index
        new_columns = self.columns
    new_modin_frame = self._modin_frame.apply_full_axis(
        axis,
        lambda df: df.sort_index(
            axis=axis, level=level, sort_remaining=sort_remaining, **kwargs
        ),
        new_index,
        new_columns,
        dtypes="copy" if axis == 0 else None,
    )
    return self.__constructor__(new_modin_frame)

def melt(
    self,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index=True,
):
    ErrorMessage.mismatch_with_pandas(
        operation="melt", message="Order of rows could be different from pandas"
    )

    if var_name is None:
        var_name = "variable"

    def _convert_to_list(x):
        """Convert passed object to a list."""
        if is_list_like(x):
            x = [*x]
        elif x is not None:
            x = [x]
        else:
            x = []
        return x

    id_vars, value_vars = map(_convert_to_list, [id_vars, value_vars])

    if len(value_vars) == 0:
        value_vars = self.columns.drop(id_vars)

    if len(id_vars) != 0:
        to_broadcast = self.getitem_column_array(id_vars)._modin_frame
    else:
        to_broadcast = None

    def applyier(df, internal_indices, other=[], internal_other_indices=[]):
        """
        Apply `melt` function to a single partition.

        Parameters
        ----------
        df : pandas.DataFrame
            Partition of the self frame.
        internal_indices : list of ints
            Positional indices of columns in this particular partition which
            represents `value_vars` columns in the source frame.
        other : pandas.DataFrame
            Broadcasted partition which contains `id_vars` columns of the
            source frame.
        internal_other_indices : list of ints
            Positional indices of columns in `other` partition which
            represents `id_vars` columns in the source frame.

        Returns
        -------
        pandas.DataFrame
            The result of the `melt` function for this particular partition.
        """
        if len(other):
            other = pandas.concat(other, axis=1)
            columns_to_add = other.columns.difference(df.columns)
            df = pandas.concat([df, other[columns_to_add]], axis=1)
        return df.melt(
            id_vars=id_vars,
            value_vars=df.columns[internal_indices],
            var_name=var_name,
            value_name=value_name,
            col_level=col_level,
        )

    # we have no able to calculate correct indices here, so making it `dummy_index`
    inconsistent_frame = self._modin_frame.broadcast_apply_select_indices(
        axis=0,
        apply_indices=value_vars,
        func=applyier,
        other=to_broadcast,
        new_index=["dummy_index"] * len(id_vars),
        new_columns=["dummy_index"] * len(id_vars),
    )
    # after applying `melt` for selected indices we will get partitions like this:
    #     id_vars   vars   value |     id_vars   vars   value
    #  0  foo       col3   1     |  0  foo       col5   a        so stacking it into
    #  1  fiz       col3   2     |  1  fiz       col5   b        `new_parts` to get
    #  2  bar       col3   3     |  2  bar       col5   c        correct answer
    #  3  zoo       col3   4     |  3  zoo       col5   d
    new_parts = np.array(
        [np.array([x]) for x in np.concatenate(inconsistent_frame._partitions.T)]
    )
    new_index = pandas.RangeIndex(len(self.index) * len(value_vars))
    new_modin_frame = self._modin_frame.__constructor__(
        new_parts,
        index=new_index,
        columns=id_vars + [var_name, value_name],
    )
    result = self.__constructor__(new_modin_frame)
    # this assigment needs to propagate correct indices into partitions
    result.index = new_index
    return result

# END Map across rows/columns

# __getitem__ methods
__getitem_bool = Binary.register(
    lambda df, r: df[[r]] if is_scalar(r) else df[r],
    join_type="left",
    labels="drop",
)

# __setitem__ methods
def setitem_bool(self, row_loc: PandasQueryCompiler, col_loc, item):
    def _set_item(df, row_loc):  # pragma: no cover
        df = df.copy()
        df.loc[row_loc.squeeze(axis=1), col_loc] = item
        return df

    # Resulting dtypes can only be predicted for a scalar `item` and when the
    # current dtypes are already known.
    if self.frame_has_materialized_dtypes and is_scalar(item):
        new_dtypes = self.dtypes.copy()
        old_dtypes = new_dtypes[col_loc]
        item_type = extract_dtype(item)
        if isinstance(old_dtypes, pandas.Series):
            new_dtypes[col_loc] = [
                find_common_type([dtype, item_type]) for dtype in old_dtypes.values
            ]
        else:
            new_dtypes[col_loc] = find_common_type([old_dtypes, item_type])
    else:
        new_dtypes = None

    new_modin_frame = self._modin_frame.broadcast_apply_full_axis(
        axis=1,
        func=_set_item,
        other=row_loc._modin_frame,
        new_index=self._modin_frame.copy_index_cache(copy_lengths=True),
        new_columns=self._modin_frame.copy_columns_cache(),
        keep_partitioning=False,
        dtypes=new_dtypes,
    )
    return self.__constructor__(new_modin_frame)

# END __setitem__ methods

def __validate_bool_indexer(self, indexer):
    # Checks a boolean indexer against the row labels of this frame: raises
    # ValueError on a length mismatch and warns when a pandas Series key
    # carries labels different from `self.index`.
    if len(indexer) != len(self.index):
        raise ValueError(
            f"Item wrong length {len(indexer)} instead of {len(self.index)}."
        )
    # The labels of the key have to be compared with the labels of the frame;
    # comparing the Series itself with an Index is never equal and made the
    # warning fire for every boolean Series key.
    if isinstance(indexer, pandas.Series) and not indexer.index.equals(self.index):
        warnings.warn(
            "Boolean Series key will be reindexed to match DataFrame index.",
            PendingDeprecationWarning,
            stacklevel=4,
        )

def getitem_array(self, key):
    if isinstance(key, type(self)):
        # here we check for a subset of bool indexers only to simplify the code;
        # there could (potentially) be more of those, but we assume the most frequent
        # ones are just of bool dtype
        if len(key.dtypes) == 1 and is_bool_dtype(key.dtypes.iloc[0]):
            self.__validate_bool_indexer(key.index)
            return self.__getitem_bool(key, broadcast=True, dtypes="copy")

        key = key.to_pandas().squeeze(axis=1)

    if is_bool_indexer(key):
        self.__validate_bool_indexer(key)
        key = check_bool_indexer(self.index, key)
        # We convert to a RangeIndex because getitem_row_array is expecting a list
        # of indices, and RangeIndex will give us the exact indices of each boolean
        # requested.
        key = pandas.RangeIndex(len(self.index))[key]
        if len(key):
            return self.getitem_row_array(key)
        else:
            return self.from_pandas(
                pandas.DataFrame(columns=self.columns), type(self._modin_frame)
            )
    else:
        if any(k not in self.columns for k in key):
            raise KeyError(
                "{} not index".format(
                    str([k for k in key if k not in self.columns]).replace(",", "")
                )
            )
        return self.getitem_column_array(key)

def getitem_column_array(
    self, key, numeric=False, ignore_order=False
) -> PandasQueryCompiler:
    shape_hint = "column" if len(key) == 1 else None
    if numeric:
        if ignore_order and is_list_like(key):
            key = np.sort(key)
        new_modin_frame = self._modin_frame.take_2d_labels_or_positional(
            col_positions=key
        )
    else:
        if ignore_order and is_list_like(key):
            # keep the requested labels in the order they have in `self.columns`
            key_set = frozenset(key)
            key = [col for col in self.columns if col in key_set]
        new_modin_frame = self._modin_frame.take_2d_labels_or_positional(
            col_labels=key
        )
    return self.__constructor__(new_modin_frame, shape_hint=shape_hint)

def getitem_row_array(self, key):
    return self.__constructor__(
        self._modin_frame.take_2d_labels_or_positional(row_positions=key)
    )

def setitem(self, axis, key, value):
    # Default to pandas for empty frames to avoid complex partitioning issues
    if axis == 0 and not self.lazy_row_count and self.get_axis_len(0) == 0:

        def do_setitem(df: pandas.DataFrame, key, value) -> pandas.DataFrame:
            df[key] = value
            return df

        return self.default_to_pandas(do_setitem, key=key, value=value)
    if axis == 0:
        value = self._wrap_column_data(value)
    return self._setitem(axis=axis, key=key, value=value, how=None)

def _setitem(self, axis, key, value, how="inner"):
    """
    Set the row/column defined by `key` to the `value` provided.

    In contrast with `setitem` with this function you can specify how
    to handle non-aligned `self` and `value`.

    Parameters
    ----------
    axis : {0, 1}
        Axis to set `value` along. 0 means set row, 1 means set column.
    key : scalar
        Row/column label to set `value` in.
    value : PandasQueryCompiler (1xN), list-like or scalar
        Define new row/column value.
    how : {"inner", "outer", "left", "right", None}, default: "inner"
        Type of join to perform if specified axis of `self` and `value` are not
        equal. If `how` is `None`, reindex `value` with `self` labels without joining.

    Returns
    -------
    BaseQueryCompiler
        New QueryCompiler with updated `key` value.
    """

    def setitem_builder(df, internal_indices=[]):  # pragma: no cover
        """
        Set the row/column to the `value` in a single partition.

        Parameters
        ----------
        df : pandas.DataFrame
            Partition of the self frame.
        internal_indices : list of ints
            Positional indices of rows/columns in this particular partition
            which represents `key` in the source frame.

        Returns
        -------
        pandas.DataFrame
            Partition data with updated values.
        """
        df = df.copy()
        if len(internal_indices) == 1:
            if axis == 0:
                df[df.columns[internal_indices[0]]] = value
            else:
                df.iloc[internal_indices[0]] = value
        else:
            if axis == 0:
                df[df.columns[internal_indices]] = value
            else:
                df.iloc[internal_indices] = value
        return df

    if isinstance(value, type(self)):
        value.columns = [key]
        if axis == 1:
            value = value.transpose()
        idx = self.get_axis(axis ^ 1).get_indexer_for([key])[0]
        return self.insert_item(axis ^ 1, idx, value, how, replace=True)

    if axis == 0:
        value_dtype = extract_dtype(value)

        old_columns = self.columns.difference(pandas.Index([key]))
        old_dtypes = ModinDtypes(self._modin_frame._dtypes).lazy_get(old_columns)
        new_dtypes = ModinDtypes.concat(
            [
                old_dtypes,
                DtypesDescriptor({key: value_dtype}, cols_with_unknown_dtypes=[]),
            ]
            # get dtypes in a proper order
        ).lazy_get(self.columns)
    else:
        # TODO: apply 'find_common_dtype' to the value's dtype and old column dtypes
        new_dtypes = None

    # TODO: rework by passing list-like values to `apply_select_indices`
    # as an item to distribute
    if is_list_like(value):
        new_modin_frame = self._modin_frame.apply_full_axis_select_indices(
            axis,
            setitem_builder,
            [key],
            new_index=self.index,
            new_columns=self.columns,
            keep_remaining=True,
            new_dtypes=new_dtypes,
        )
    else:
        new_modin_frame = self._modin_frame.apply_select_indices(
            axis,
            setitem_builder,
            [key],
            new_index=self.index,
            new_columns=self.columns,
            new_dtypes=new_dtypes,
            keep_remaining=True,
        )
    return self.__constructor__(new_modin_frame)

# END __getitem__ methods

# Drop/Dropna
# This will change the shape of the resulting data.
def dropna(self, **kwargs):
    is_column_wise = kwargs.get("axis", 0) == 1
    no_thresh_passed = kwargs.get("thresh", lib.no_default) in (
        lib.no_default,
        None,
    )
    # The map reduce approach works well for frames with few columnar partitions
    processable_amount_of_partitions = (
        self._modin_frame.num_parts < CpuCount.get() * 32
    )

    if is_column_wise and no_thresh_passed and processable_amount_of_partitions:
        how = kwargs.get("how", "any")
        subset = kwargs.get("subset")
        how = "any" if how in (lib.no_default, None) else how
        condition = lambda df: getattr(df, how)()  # noqa: E731 (lambda assignment)

        def mapper(df: pandas.DataFrame):
            """Compute a mask indicating whether there are all/any NaN values in each column."""
            if subset is not None:
                subset_mask = condition(
                    df.loc[df.index.intersection(subset)].isna()
                )
                # we have to keep other columns so setting their mask
                # values with `False`
                mask = pandas.Series(
                    np.zeros(df.shape[1], dtype=bool), index=df.columns
                )
                mask.update(subset_mask)
            else:
                mask = condition(df.isna())
            # for proper partitioning at the 'reduce' phase each partition has to
            # represent a one-row frame rather than a one-column frame, so calling `.T` here
            return mask.to_frame().T

        masks = self._modin_frame.apply_full_axis(
            func=mapper, axis=1, keep_partitioning=True
        )

        def reduce(df: pandas.DataFrame, mask: pandas.DataFrame):
            """Drop columns from `df` that satisfy the NaN `mask`."""
            # `mask` here consists of several rows each representing the masks result
            # for a certain row partition:
            #       col1   col2   col3
            #  0    True   True  False                           col1   True
            #  1   False   True  False  ---> mask.any() --->     col2   True
            #  2    True   True  False                           col3  False
            # in order to get the proper 1D mask we have to reduce the partition's
            # results by applying the condition one more time
            to_take_mask = ~condition(mask)

            to_take = []
            for col, value in to_take_mask.items():
                if value and col in df:
                    to_take.append(col)

            return df[to_take]

        result = self._modin_frame.broadcast_apply(
            # 'masks' have identical partitioning as we specified 'keep_partitioning=True' before,
            # this means that we can safely skip the 'co-partitioning' stage
            axis=1,
            func=reduce,
            other=masks,
            copartition=False,
            labels="drop",
        )
        return self.__constructor__(result, shape_hint=self._shape_hint)

    return self.__constructor__(
        self._modin_frame.filter(
            kwargs.get("axis", 0) ^ 1,
            lambda df: pandas.DataFrame.dropna(df, **kwargs),
        ),
        shape_hint=self._shape_hint,
    )

def drop(
    self, index=None, columns=None, errors: str = "raise"
) -> PandasQueryCompiler:
    # `errors` parameter needs to be part of the function signature because
    # other query compilers may not take care of error handling at the API
    # layer. This query compiler assumes there won't be any errors due to
    # invalid keys.
    # The labels to drop are turned into sorted positions of the labels to keep.
    if index is not None:
        index = np.sort(self.index.get_indexer_for(self.index.difference(index)))
    if columns is not None:
        columns = np.sort(
            self.columns.get_indexer_for(self.columns.difference(columns))
        )
    new_modin_frame = self._modin_frame.take_2d_labels_or_positional(
        row_positions=index, col_positions=columns
    )
    return self.__constructor__(new_modin_frame)

# END Drop/Dropna

def duplicated(self, **kwargs):
    def _compute_hash(df):
        result = df.apply(
            lambda s: hashlib.new("md5", str(tuple(s)).encode()).hexdigest(), axis=1
        )
        if isinstance(result, pandas.Series):
            result = result.to_frame(
                result.name
                if result.name is not None
                else MODIN_UNNAMED_SERIES_LABEL
            )
        return result

    def _compute_duplicated(df):  # pragma: no cover
        result = df.duplicated(**kwargs)
        if isinstance(result, pandas.Series):
            result = result.to_frame(
                result.name
                if result.name is not None
                else MODIN_UNNAMED_SERIES_LABEL
            )
        return result

    if self._modin_frame._partitions.shape[1] > 1:
        # if the number of columns (or column partitions) we are checking for duplicates is larger than 1,
        # we must first hash them to generate a single value that can be compared across rows.
        hashed_modin_frame = self._modin_frame.reduce(
            axis=1,
            function=_compute_hash,
            dtypes=pandas.api.types.pandas_dtype("O"),
        )
    else:
        hashed_modin_frame = self._modin_frame
    new_modin_frame = hashed_modin_frame.apply_full_axis(
        axis=0,
        func=_compute_duplicated,
        new_index=self._modin_frame.copy_index_cache(),
        new_columns=[MODIN_UNNAMED_SERIES_LABEL],
        dtypes=np.bool_,
        keep_partitioning=True,
    )
    return self.__constructor__(new_modin_frame, shape_hint="column")

# Insert
# This method changes the shape of the resulting data. In Pandas, this
# operation is always inplace, but this object is immutable, so we just
# return a new one from here and let the front end handle the inplace
# update.
def insert(self, loc, column, value):
    # Two paths: list-like values become a one-column query compiler and are
    # inserted through `insert_item`; anything else is written into the
    # partitions directly via `pandas.DataFrame.insert`.
    wrapped = self._wrap_column_data(value)
    if isinstance(wrapped, type(self)):
        wrapped.columns = [column]
        return self.insert_item(axis=1, loc=loc, value=wrapped, how=None)

    def insert_into_partition(df, internal_indices=[]):  # pragma: no cover
        """
        Insert the new column into a single partition.

        Parameters
        ----------
        df : pandas.DataFrame
            Partition of the self frame.
        internal_indices : list of ints
            Positional indices inside this partition; the first one is the
            position handed to ``df.insert``.

        Returns
        -------
        pandas.DataFrame
            The same partition object with the column added.
        """
        df.insert(int(internal_indices[0]), column, wrapped)
        return df

    value_dtype = extract_dtype(wrapped)
    new_columns = self.columns.insert(loc, column)
    known_and_new_dtypes = ModinDtypes.concat(
        [
            self._modin_frame._dtypes,
            DtypesDescriptor({column: value_dtype}, cols_with_unknown_dtypes=[]),
        ]
    )
    # re-order the dtypes so that they follow the new column order
    new_dtypes = known_and_new_dtypes.lazy_get(new_columns)

    # TODO: rework by passing list-like values to `apply_select_indices`
    # as an item to distribute
    new_modin_frame = self._modin_frame.apply_full_axis_select_indices(
        0,
        insert_into_partition,
        numeric_indices=[loc],
        keep_remaining=True,
        new_index=self.index,
        new_columns=new_columns,
        new_dtypes=new_dtypes,
    )
    return self.__constructor__(new_modin_frame)

def _wrap_column_data(self, data):
    """
    Turn list-like data into a single-column query compiler.

    Parameters
    ----------
    data : any
        Value to wrap. Non list-like values are returned untouched.

    Returns
    -------
    data or PandasQueryCompiler
        A query compiler built from ``data`` aligned on ``self.index`` when
        ``data`` is list-like, otherwise ``data`` itself.
    """
    if not is_list_like(data):
        return data
    column_frame = pandas.DataFrame(pandas.Series(data, index=self.index))
    return self.from_pandas(column_frame, data_cls=type(self._modin_frame))

# END Insert

def explode(self, column):
    # Delegate to the frame's `explode`, running `pandas.DataFrame.explode`
    # for `column` inside the partitions.
    def explode_partition(df):
        return df.explode(column)

    exploded_frame = self._modin_frame.explode(1, explode_partition)
    return self.__constructor__(exploded_frame)

# UDF (apply and agg) methods
# There is a wide range of behaviors that are supported, so a lot of the
# logic can get a bit convoluted.
def apply(self, func, axis, *args, **kwargs): # if any of args contain modin object, we should # convert it to pandas args = try_cast_to_pandas(args) kwargs = try_cast_to_pandas(kwargs) _, func, _, _ = reconstruct_func(func, **kwargs) if isinstance(func, dict): return self._dict_func(func, axis, *args, **kwargs) elif is_list_like(func): return self._list_like_func(func, axis, *args, **kwargs) else: return self._callable_func(func, axis, *args, **kwargs) def apply_on_series(self, func, *args, **kwargs): args = try_cast_to_pandas(args) kwargs = try_cast_to_pandas(kwargs) assert self.is_series_like() # We use apply_full_axis here instead of map since the latter assumes that the # shape of the DataFrame does not change. However, it is possible for functions # applied to Series objects to end up creating DataFrames. It is possible that # using apply_full_axis is much less performant compared to using a variant of # map. return self.__constructor__( self._modin_frame.apply_full_axis( 1, lambda df: df.squeeze(axis=1).apply(func, *args, **kwargs) ) ) def _dict_func(self, func, axis, *args, **kwargs): """ Apply passed functions to the specified rows/columns. Parameters ---------- func : dict(label) -> [callable, str] Dictionary that maps axis labels to the function to apply against them. axis : {0, 1} Target axis to apply functions along. 0 means apply to columns, 1 means apply to rows. *args : args Arguments to pass to the specified functions. **kwargs : kwargs Arguments to pass to the specified functions. Returns ------- PandasQueryCompiler New QueryCompiler containing the results of passed functions. """ if "axis" not in kwargs: kwargs["axis"] = axis func = {k: wrap_udf_function(v) if callable(v) else v for k, v in func.items()} def dict_apply_builder(df, internal_indices=[]): # pragma: no cover # Sometimes `apply` can return a `Series`, but we require that internally # all objects are `DataFrame`s. 
# It looks like it doesn't need to use `internal_indices` option internally # for the case since `apply` use labels from dictionary keys in `func` variable. return pandas.DataFrame(df.apply(func, *args, **kwargs)) labels = list(func.keys()) return self.__constructor__( self._modin_frame.apply_full_axis_select_indices( axis, dict_apply_builder, labels, new_index=labels if axis == 1 else None, new_columns=labels if axis == 0 else None, keep_remaining=False, ) ) def _list_like_func(self, func, axis, *args, **kwargs): """ Apply passed functions to each row/column. Parameters ---------- func : list of callable List of functions to apply against each row/column. axis : {0, 1} Target axis to apply functions along. 0 means apply to columns, 1 means apply to rows. *args : args Arguments to pass to the specified functions. **kwargs : kwargs Arguments to pass to the specified functions. Returns ------- PandasQueryCompiler New QueryCompiler containing the results of passed functions. """ # When the function is list-like, the function names become the index/columns new_index = ( [f if isinstance(f, str) else f.__name__ for f in func] if axis == 0 else self.index ) new_columns = ( [f if isinstance(f, str) else f.__name__ for f in func] if axis == 1 else self.columns ) func = [wrap_udf_function(f) if callable(f) else f for f in func] new_modin_frame = self._modin_frame.apply_full_axis( axis, lambda df: pandas.DataFrame(df.apply(func, axis, *args, **kwargs)), new_index=new_index, new_columns=new_columns, ) return self.__constructor__(new_modin_frame) def rowwise_query(self, expr, **kwargs): """ Query the columns of a ``PandasQueryCompiler`` with a boolean row-wise expression. Basically, in row-wise expressions we only allow column names, constants and other variables captured using the '@' symbol. No function/method cannot be called inside such expressions. Parameters ---------- expr : str Row-wise boolean expression. 
**kwargs : dict Arguments to pass to the ``pandas.DataFrame.query()``. Returns ------- PandasQueryCompiler Raises ------ NotImplementedError In case the passed expression cannot be executed row-wise. """ # Walk through the AST and verify it doesn't contain any nodes that # prevent us from executing the query row-wise (we're basically # looking for 'ast.Call') nodes = ast.parse(expr.replace("@", "")).body is_row_wise_query = True while nodes: node = nodes.pop() if isinstance(node, ast.Expr): node = getattr(node, "value", node) if isinstance(node, ast.UnaryOp): nodes.append(node.operand) elif isinstance(node, ast.BinOp): nodes.extend([node.left, node.right]) elif isinstance(node, ast.BoolOp): nodes.extend(node.values) elif isinstance(node, ast.Compare): nodes.extend([node.left] + node.comparators) elif isinstance(node, (ast.Name, ast.Constant)): pass else: # if we end up here then the expression is no longer simple # enough to run it row-wise, so exiting is_row_wise_query = False break if not is_row_wise_query: raise NotImplementedError("A non row-wise query was passed.") def query_builder(df, **modin_internal_kwargs): return df.query(expr, inplace=False, **kwargs, **modin_internal_kwargs) return self.__constructor__(self._modin_frame.filter(1, query_builder)) def _callable_func(self, func, axis, *args, **kwargs): """ Apply passed function to each row/column. Parameters ---------- func : callable or str Function to apply. axis : {0, 1} Target axis to apply function along. 0 means apply to columns, 1 means apply to rows. *args : args Arguments to pass to the specified function. **kwargs : kwargs Arguments to pass to the specified function. Returns ------- PandasQueryCompiler New QueryCompiler containing the results of passed function for each row/column. 
""" if callable(func): func = wrap_udf_function(func) new_modin_frame = self._modin_frame.apply_full_axis( axis, lambda df: df.apply(func, axis=axis, *args, **kwargs) ) return self.__constructor__(new_modin_frame) # END UDF # Manual Partitioning methods (e.g. merge, groupby) # These methods require some sort of manual partitioning due to their # nature. They require certain data to exist on the same partition, and # after the shuffle, there should be only a local map required. def _groupby_separate_by(self, by, drop): """ Separate internal and external groupers in `by` argument of groupby. Parameters ---------- by : BaseQueryCompiler, column or index label, Grouper or list drop : bool Indicates whether or not by data came from self frame. True, by data came from self. False, external by data. Returns ------- external_by : list of BaseQueryCompiler and arrays Values to group by. internal_by : list of str List of column names from `self` to group by. by_positions : list of ints Specifies the order of grouping by `internal_by` and `external_by` columns. Each element in `by_positions` specifies an index from either `external_by` or `internal_by`. Indices for `external_by` are positive and start from 0. Indices for `internal_by` are negative and start from -1 (so in order to convert them to a valid indices one should do ``-idx - 1``) ''' by_positions = [0, -1, 1, -2, 2, 3] internal_by = ["col1", "col2"] external_by = [sr1, sr2, sr3, sr4] df.groupby([sr1, "col1", sr2, "col2", sr3, sr4]) '''. 
""" if isinstance(by, type(self)): if drop: internal_by = by.columns.tolist() external_by = [] by_positions = [-i - 1 for i in range(len(internal_by))] else: internal_by = [] external_by = [by] by_positions = [i for i in range(len(external_by[0].columns))] else: if not isinstance(by, list): by = [by] if by is not None else [] internal_by = [] external_by = [] external_by_counter = 0 by_positions = [] for o in by: if isinstance(o, pandas.Grouper) and o.key in self.columns: internal_by.append(o.key) by_positions.append(-len(internal_by)) elif hashable(o) and o in self.columns: internal_by.append(o) by_positions.append(-len(internal_by)) else: external_by.append(o) for _ in range(len(o.columns) if isinstance(o, type(self)) else 1): by_positions.append(external_by_counter) external_by_counter += 1 return external_by, internal_by, by_positions groupby_all = GroupbyReduceImpl.build_qc_method("all") groupby_any = GroupbyReduceImpl.build_qc_method("any") groupby_count = GroupbyReduceImpl.build_qc_method("count") groupby_max = GroupbyReduceImpl.build_qc_method("max") groupby_min = GroupbyReduceImpl.build_qc_method("min") groupby_prod = GroupbyReduceImpl.build_qc_method("prod") groupby_sum = GroupbyReduceImpl.build_qc_method("sum") groupby_skew = GroupbyReduceImpl.build_qc_method("skew") def groupby_nth( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): result = super().groupby_nth( by, axis, groupby_kwargs, agg_args, agg_kwargs, drop ) if not groupby_kwargs.get("as_index", True): # pandas keeps order of columns intact, follow suit return result.getitem_column_array(self.columns) return result def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): if RangePartitioning.get(): try: return self._groupby_shuffle( by=by, agg_func="mean", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) except NotImplementedError as e: ErrorMessage.warn( f"Can't use range-partitioning groupby 
implementation because of: {e}" + "\nFalling back to a TreeReduce implementation." ) _, internal_by, _ = self._groupby_separate_by(by, drop) numeric_only = agg_kwargs.get("numeric_only", False) datetime_cols = ( { col: dtype for col, dtype in zip(self.dtypes.index, self.dtypes) if is_datetime64_any_dtype(dtype) and col not in internal_by } if not numeric_only else dict() ) if len(datetime_cols) > 0: datetime_qc = self.getitem_array(datetime_cols) if datetime_qc.isna().any().any(axis=1).to_pandas().squeeze(): return super().groupby_mean( by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) qc_with_converted_datetime_cols = ( self.astype({col: "int64" for col in datetime_cols.keys()}) if len(datetime_cols) > 0 else self ) result = GroupbyReduceImpl.build_qc_method("mean")( query_compiler=qc_with_converted_datetime_cols, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) if len(datetime_cols) > 0: result = result.astype({col: dtype for col, dtype in datetime_cols.items()}) return result def groupby_size( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): if RangePartitioning.get(): try: return self._groupby_shuffle( by=by, agg_func="size", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) except NotImplementedError as e: ErrorMessage.warn( f"Can't use range-partitioning groupby implementation because of: {e}" + "\nFalling back to a TreeReduce implementation." 
) result = self._groupby_dict_reduce( by=by, axis=axis, agg_func={self.columns[0]: [("__size_col__", "size")]}, agg_args=agg_args, agg_kwargs=agg_kwargs, groupby_kwargs=groupby_kwargs, drop=drop, method="size", default_to_pandas_func=lambda grp: grp.size(), ) if groupby_kwargs.get("as_index", True): result.columns = [MODIN_UNNAMED_SERIES_LABEL] elif isinstance(result.columns, pandas.MultiIndex): # Dropping one extra-level which was added because of renaming aggregation result.columns = ( result.columns[:-1].droplevel(-1).append(pandas.Index(["size"])) ) return result def _groupby_dict_reduce( self, by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, **kwargs, ): """ Group underlying data and apply aggregation functions to each group of the specified column/row. This method is responsible of performing dictionary groupby aggregation for such functions, that can be implemented via TreeReduce approach. Parameters ---------- by : PandasQueryCompiler, column or index label, Grouper or list of such Object that determine groups. agg_func : dict(label) -> str Dictionary that maps row/column labels to the function names. **Note:** specified functions have to be supported by ``modin.core.dataframe.algebra.GroupByReduce``. Supported functions are listed in the ``modin.core.dataframe.algebra.GroupByReduce.groupby_reduce_functions`` dictionary. axis : {0, 1} Axis to group and apply aggregation function along. 0 is for index, when 1 is for columns. groupby_kwargs : dict GroupBy parameters in the format of ``modin.pandas.DataFrame.groupby`` signature. agg_args : list-like Serves the compatibility purpose. Does not affect the result. agg_kwargs : dict Arguments to pass to the aggregation functions. drop : bool, default: False If `by` is a QueryCompiler indicates whether or not by-data came from the `self`. **kwargs : dict Additional parameters to pass to the ``modin.core.dataframe.algebra.GroupByReduce.register``. 
Returns ------- PandasQueryCompiler New QueryCompiler containing the result of groupby dictionary aggregation. """ map_dict = {} reduce_dict = {} kwargs.setdefault( "default_to_pandas_func", lambda grp, *args, **kwargs: grp.agg(agg_func, *args, **kwargs), ) rename_columns = any( not isinstance(fn, str) and isinstance(fn, Iterable) for fn in agg_func.values() ) for col, col_funcs in agg_func.items(): if not rename_columns: map_dict[col], reduce_dict[col], _ = GroupbyReduceImpl.get_impl( col_funcs ) continue if isinstance(col_funcs, str): col_funcs = [col_funcs] map_fns = [] for i, fn in enumerate(col_funcs): if not isinstance(fn, str) and isinstance(fn, Iterable): new_col_name, func = fn elif isinstance(fn, str): new_col_name, func = fn, fn else: raise TypeError map_fn, reduce_fn, _ = GroupbyReduceImpl.get_impl(func) map_fns.append((new_col_name, map_fn)) reduced_col_name = ( (*col, new_col_name) if isinstance(col, tuple) else (col, new_col_name) ) reduce_dict[reduced_col_name] = reduce_fn map_dict[col] = map_fns return GroupByReduce.register(map_dict, reduce_dict, **kwargs)( query_compiler=self, by=by, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) def groupby_dtypes( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, axis=axis, agg_func=lambda df: df.dtypes, how="group_wise", agg_args=agg_args, agg_kwargs=agg_kwargs, groupby_kwargs=groupby_kwargs, drop=drop, ) @_inherit_docstrings(BaseQueryCompiler.groupby_agg) def _groupby_shuffle( self, by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, how="axis_wise", series_groupby=False, ): # Defaulting to pandas in case of an empty frame as we can't process it properly. # Higher API level won't pass empty data here unless the frame has delayed # computations. FIXME: We apparently lose some laziness here (due to index access) # because of the inability to process empty groupby natively. 
if len(self.columns) == 0 or len(self._modin_frame) == 0: return super().groupby_agg( by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop ) grouping_on_level = groupby_kwargs.get("level") is not None if any( isinstance(obj, pandas.Grouper) for obj in (by if isinstance(by, list) else [by]) ): raise NotImplementedError( "Grouping on a pandas.Grouper with range-partitioning groupby is not yet supported: " + "https://github.com/modin-project/modin/issues/5926" ) if grouping_on_level: external_by, internal_by, by_positions = [], [], [] else: external_by, internal_by, by_positions = self._groupby_separate_by(by, drop) all_external_are_qcs = all(isinstance(obj, type(self)) for obj in external_by) if not all_external_are_qcs: raise NotImplementedError( "Grouping on an external grouper with range-partitioning groupby is only supported with Series'es: " + "https://github.com/modin-project/modin/issues/5926" ) is_transform = how == "transform" or GroupBy.is_transformation_kernel(agg_func) if is_transform: # https://github.com/modin-project/modin/issues/5924 ErrorMessage.mismatch_with_pandas( operation="range-partitioning groupby", message="the order of rows may be shuffled for the result", ) # This check materializes dtypes for 'by' columns if not is_transform and groupby_kwargs.get("observed", False) in ( False, lib.no_default, ): # The following 'dtypes' check materializes dtypes for 'by' columns internal_dtypes = pandas.Series() external_dtypes = pandas.Series() if len(internal_by) > 0: internal_dtypes = ( self._modin_frame._dtypes.lazy_get(internal_by).get() if isinstance(self._modin_frame._dtypes, ModinDtypes) else self.dtypes[internal_by] ) if len(external_by) > 0: dtypes_list = [] for obj in external_by: if not isinstance(obj, type(self)): # we're only interested in categorical dtypes here, which can only # appear in modin objects continue dtypes_list.append(obj.dtypes) external_dtypes = pandas.concat(dtypes_list) by_dtypes = 
pandas.concat([internal_dtypes, external_dtypes]) add_missing_cats = any( isinstance(dtype, pandas.CategoricalDtype) for dtype in by_dtypes ) else: add_missing_cats = False if add_missing_cats and not groupby_kwargs.get("as_index", True): raise NotImplementedError( "Range-partitioning groupby is not implemented for grouping on categorical columns with " + "the following set of parameters {'as_index': False, 'observed': False}. Change either 'as_index' " + "or 'observed' to True and try again. " + "https://github.com/modin-project/modin/issues/5926" ) if isinstance(agg_func, dict): assert ( how == "axis_wise" ), f"Only 'axis_wise' aggregation is supported with dictionary functions, got: {how}" subset = internal_by + list(agg_func.keys()) # extracting unique values; no we can't use np.unique here as it would # convert a list of tuples to a 2D matrix and so mess up the result subset = list(dict.fromkeys(subset)) obj = self.getitem_column_array(subset) else: obj = self agg_method = ( SeriesGroupByDefault if series_groupby else GroupByDefault ).get_aggregation_method(how) original_agg_func = agg_func def agg_func(grp, *args, **kwargs): result = agg_method(grp, original_agg_func, *args, **kwargs) # Convert Series to DataFrame if result.ndim == 1: result = result.to_frame( MODIN_UNNAMED_SERIES_LABEL if result.name is None else result.name ) return result result = obj._modin_frame.groupby( axis=axis, internal_by=internal_by, external_by=[ obj._modin_frame if isinstance(obj, type(self)) else obj for obj in external_by ], by_positions=by_positions, series_groupby=series_groupby, operator=lambda grp: agg_func(grp, *agg_args, **agg_kwargs), # UDFs passed to '.apply()' are allowed to produce results with arbitrary shapes, # that's why we have to align the partition's shapes/labeling across different # row partitions align_result_columns=how == "group_wise", add_missing_cats=add_missing_cats, **groupby_kwargs, ) result_qc: PandasQueryCompiler = self.__constructor__(result) if 
not is_transform and not groupby_kwargs.get("as_index", True): return result_qc.reset_index(drop=True) return result_qc def groupby_corr( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): ErrorMessage.default_to_pandas("`GroupBy.corr`") # TODO(https://github.com/modin-project/modin/issues/1323) implement this. # Right now, using this class's groupby_agg method, even with how="group_wise", # produces a result with the wrong index, so default to pandas by using the # super class's groupby_agg method. return super().groupby_agg( by=by, agg_func="corr", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) def groupby_cov( self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): ErrorMessage.default_to_pandas("`GroupBy.cov`") # TODO(https://github.com/modin-project/modin/issues/1322) implement this. # Right now, using this class's groupby_agg method, even with how="group_wise", # produces a result with the wrong index, so default to pandas by using the # super class's groupby_agg method. 
return super().groupby_agg( by=by, agg_func="cov", axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, ) def groupby_rolling( self, by, agg_func, axis, groupby_kwargs, rolling_kwargs, agg_args, agg_kwargs, drop=False, ): # 'corr' and 'cov' require knowledge about the whole row axis (all columns have # to be available in the same partitions), this requirement is not being satisfied # in the current groupby implementation unsupported_groupby = ( agg_func in ("corr", "cov") or rolling_kwargs.get("on") is not None ) if isinstance(agg_func, str): str_func = agg_func def agg_func(window, *args, **kwargs): return getattr(window, str_func)(*args, **kwargs) else: assert callable(agg_func) kwargs = { "by": by, "agg_func": lambda grp, *args, **kwargs: agg_func( grp.rolling(**rolling_kwargs), *args, **kwargs ), "axis": axis, "groupby_kwargs": groupby_kwargs, "agg_args": agg_args, "agg_kwargs": agg_kwargs, "how": "direct", "drop": drop, } if unsupported_groupby: return super(PandasQueryCompiler, self).groupby_agg(**kwargs) try: return self._groupby_shuffle(**kwargs) except NotImplementedError as e: get_logger().info( f"Can't use range-partitioning groupby implementation because of: {e}" + "\nFalling back to a full-axis implementation." ) return self.groupby_agg(**kwargs) def groupby_agg( self, by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how="axis_wise", drop=False, series_groupby=False, ): # Defaulting to pandas in case of an empty frame as we can't process it properly. # Higher API level won't pass empty data here unless the frame has delayed # computations. So we apparently lose some laziness here (due to index access) # because of the inability to process empty groupby natively. if len(self.columns) == 0 or len(self._modin_frame) == 0: return super().groupby_agg( by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop ) # 'group_wise' means 'groupby.apply()'. 
We're certain that range-partitioning groupby # always works better for '.apply()', so we're using it regardless of the 'RangePartitioning' # value if how == "group_wise" or RangePartitioning.get(): try: return self._groupby_shuffle( by=by, agg_func=agg_func, axis=axis, groupby_kwargs=groupby_kwargs, agg_args=agg_args, agg_kwargs=agg_kwargs, drop=drop, how=how, series_groupby=series_groupby, ) except NotImplementedError as e: # if a user wants to use range-partitioning groupby explicitly, then we should print a visible # warning to them on a failure, otherwise we're only logging it message = ( f"Can't use range-partitioning groupby implementation because of: {e}" + "\nFalling back to a full-axis implementation." ) get_logger().info(message) if RangePartitioning.get(): ErrorMessage.warn(message) if isinstance(agg_func, dict) and GroupbyReduceImpl.has_impl_for(agg_func): return self._groupby_dict_reduce( by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, drop ) is_transform_method = how == "transform" or ( isinstance(agg_func, str) and agg_func in transformation_kernels ) original_agg_func = agg_func if isinstance(agg_func, dict): assert ( how == "axis_wise" ), f"Only 'axis_wise' aggregation is supported with dictionary functions, got: {how}" else: agg_method = ( SeriesGroupByDefault if series_groupby else GroupByDefault ).get_aggregation_method(how) def agg_func(grp, *args, **kwargs): return agg_method(grp, original_agg_func, *args, **kwargs) # since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`, # we want to copy it to not propagate these changes into source dict, in case # of unsuccessful end of function groupby_kwargs = groupby_kwargs.copy() as_index = groupby_kwargs.get("as_index", True) external_by, internal_by, _ = self._groupby_separate_by(by, drop) internal_qc = ( [self.getitem_column_array(internal_by)] if len(internal_by) else [] ) by = internal_qc + external_by broadcastable_by = [o._modin_frame for o in by if isinstance(o, 
type(self))] not_broadcastable_by = [o for o in by if not isinstance(o, type(self))] def groupby_agg_builder(df, by=None, drop=False, partition_idx=None): """ Compute groupby aggregation for a single partition. Parameters ---------- df : pandas.DataFrame Partition of the self frame. by : pandas.DataFrame, optional Broadcasted partition which contains `by` columns. drop : bool, default: False Indicates whether `by` partition came from the `self` frame. partition_idx : int, optional Positional partition index along groupby axis. Returns ------- pandas.DataFrame DataFrame containing the result of groupby aggregation for this particular partition. """ # Set `as_index` to True to track the metadata of the grouping object # It is used to make sure that between phases we are constructing the # right index and placing columns in the correct order. groupby_kwargs["as_index"] = True # We have to filter func-dict BEFORE inserting broadcasted 'by' columns # to avoid multiple aggregation results for 'by' cols in case they're # present in the func-dict: partition_agg_func = GroupByReduce.get_callable(agg_func, df) internal_by_cols = pandas.Index([]) missed_by_cols = pandas.Index([]) if by is not None: internal_by_df = by[internal_by] if isinstance(internal_by_df, pandas.Series): internal_by_df = internal_by_df.to_frame() missed_by_cols = internal_by_df.columns.difference(df.columns) if len(missed_by_cols) > 0: df = pandas.concat( [df, internal_by_df[missed_by_cols]], axis=1, copy=False, ) internal_by_cols = internal_by_df.columns external_by = by.columns.difference(internal_by).unique() external_by_df = by[external_by].squeeze(axis=1) if isinstance(external_by_df, pandas.DataFrame): external_by_cols = [o for _, o in external_by_df.items()] else: external_by_cols = [external_by_df] by = internal_by_cols.tolist() + external_by_cols else: by = [] by += not_broadcastable_by level = groupby_kwargs.get("level", None) if level is not None and not by: by = None by_length = len(level) if 
is_list_like(level) else 1 else: by_length = len(by) def compute_groupby(df, drop=False, partition_idx=0): """Compute groupby aggregation for a single partition.""" target_df = df.squeeze(axis=1) if series_groupby else df grouped_df = target_df.groupby(by=by, axis=axis, **groupby_kwargs) try: result = partition_agg_func(grouped_df, *agg_args, **agg_kwargs) except DataError: # This happens when the partition is filled with non-numeric data and a # numeric operation is done. We need to build the index here to avoid # issues with extracting the index. result = pandas.DataFrame(index=grouped_df.size().index) if isinstance(result, pandas.Series): result = result.to_frame( result.name if result.name is not None else MODIN_UNNAMED_SERIES_LABEL ) selection = agg_func.keys() if isinstance(agg_func, dict) else None if selection is None: # Some pandas built-in aggregation functions aggregate 'by' columns # (for example 'apply', 'dtypes', maybe more...). Since we make sure # that all of the 'by' columns are presented in every partition by # inserting the missed ones, we will end up with all of the 'by' # columns being aggregated in every partition. To avoid duplications # in the result we drop all of the 'by' columns that were inserted # in this partition AFTER handling 'as_index' parameter. The order # is important for proper naming-conflicts handling. 
misaggregated_cols = missed_by_cols.intersection(result.columns) else: misaggregated_cols = [] if not as_index: GroupBy.handle_as_index_for_dataframe( result, internal_by_cols, by_cols_dtypes=df[internal_by_cols].dtypes.values, by_length=by_length, selection=selection, partition_idx=partition_idx, drop=drop, inplace=True, method="transform" if is_transform_method else None, ) else: new_index_names = tuple( ( None if isinstance(name, str) and name.startswith(MODIN_UNNAMED_SERIES_LABEL) else name ) for name in result.index.names ) result.index.names = new_index_names if len(misaggregated_cols) > 0: result.drop(columns=misaggregated_cols, inplace=True) return result try: return compute_groupby(df, drop, partition_idx) except (ValueError, KeyError): # This will happen with Arrow buffer read-only errors. We don't want to copy # all the time, so this will try to fast-path the code first. return compute_groupby(df.copy(), drop, partition_idx) if isinstance(original_agg_func, dict): apply_indices = list(agg_func.keys()) elif isinstance(original_agg_func, list): apply_indices = self.columns.difference(internal_by).tolist() else: apply_indices = None if ( # For now handling only simple cases, where 'by' columns are described by a single query compiler agg_kwargs.get("as_index", True) and len(not_broadcastable_by) == 0 and len(broadcastable_by) == 1 and broadcastable_by[0].has_materialized_dtypes ): new_index = ModinIndex( # actual value will be assigned on a parent update value=None, axis=0, dtypes=broadcastable_by[0].dtypes, ) else: new_index = None new_modin_frame = self._modin_frame.broadcast_apply_full_axis( axis=axis, func=lambda df, by=None, partition_idx=None: groupby_agg_builder( df, by, drop, partition_idx ), other=broadcastable_by, new_index=new_index, apply_indices=apply_indices, enumerate_partitions=True, ) result = self.__constructor__(new_modin_frame) # that means that exception in `compute_groupby` was raised # in every partition, so we also should raise it if 
( len(result.columns) == 0 and len(self.columns) != 0 and agg_kwargs.get("numeric_only", False) ): raise TypeError("No numeric types to aggregate.") return result # END Manual Partitioning methods def pivot(self, index, columns, values): from pandas.core.reshape.pivot import _convert_by def __convert_by(by): """Convert passed value to a list.""" if isinstance(by, pandas.Index): by = list(by) by = _convert_by(by) if ( len(by) > 0 and (not is_list_like(by[0]) or isinstance(by[0], tuple)) and not all([key in self.columns for key in by]) ): by = [by] return by index, columns, values = map(__convert_by, [index, columns, values]) is_custom_index = ( len(index) == 1 and is_list_like(index[0]) and not isinstance(index[0], tuple) ) if is_custom_index or len(index) == 0: to_reindex = columns else: to_reindex = index + columns if len(values) != 0: obj = self.getitem_column_array(to_reindex + values) else: obj = self if is_custom_index: obj.index = index reindexed = self.__constructor__( obj._modin_frame.apply_full_axis( 1, lambda df: df.set_index(to_reindex, append=(len(to_reindex) == 1)), new_columns=obj.columns.drop(to_reindex), ) ) unstacked = reindexed.unstack(level=columns, fill_value=None) if len(reindexed.columns) == 1 and unstacked.columns.nlevels > 1: unstacked.columns = unstacked.columns.droplevel(0) return unstacked def pivot_table( self, index, values, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort, ): ErrorMessage.mismatch_with_pandas( operation="pivot_table", message="Order of columns could be different from pandas", ) from pandas.core.reshape.pivot import _convert_by def __convert_by(by): """Convert passed value to a list.""" if isinstance(by, pandas.Index): return list(by) return _convert_by(by) is_1d_values = values is not None and not is_list_like(values) index, columns = map(__convert_by, [index, columns]) if len(index) + len(columns) == 0: raise ValueError("No group keys passed!") if is_1d_values and len(index) > 0 and 
len(columns) > 0: drop_column_level = 1 if isinstance(aggfunc, list) else 0 else: drop_column_level = None # if the value is 'None' it will be converted to an empty list (no columns to aggregate), # which is invalid for 'values', as 'None' means aggregate ALL columns instead if values is not None: values = __convert_by(values) # using 'pandas.unique' instead of 'numpy' as it guarantees to not change the original order unique_keys = pandas.Series(index + columns).unique() kwargs = { "qc": self, "unique_keys": unique_keys, "drop_column_level": drop_column_level, "pivot_kwargs": { "index": index, "values": values, "columns": columns, "aggfunc": aggfunc, "fill_value": fill_value, "margins": margins, "dropna": dropna, "margins_name": margins_name, "observed": observed, "sort": sort, }, } try: return PivotTableImpl.map_reduce_impl(**kwargs) except NotImplementedError as e: message = ( f"Can't use MapReduce 'pivot_table' implementation because of: {e}" + "\nFalling back to a range-partitioning implementation." ) get_logger().info(message) try: return PivotTableImpl.range_partition_impl(**kwargs) except NotImplementedError as e: message = ( f"Can't use range-partitioning 'pivot_table' implementation because of: {e}" + "\nFalling back to a full-axis implementation." ) get_logger().info(message) return PivotTableImpl.full_axis_impl(**kwargs) # Get_dummies def get_dummies(self, columns, **kwargs): # `columns` as None does not mean all columns, by default it means only # non-numeric columns. if columns is None: columns = [c for c in self.columns if not is_numeric_dtype(self.dtypes[c])] # If we aren't computing any dummies, there is no need for any # remote compute. if len(columns) == 0: return self.copy() elif not is_list_like(columns): columns = [columns] def map_fn(df): # pragma: no cover cols_to_encode = df.columns.intersection(columns) return pandas.get_dummies(df, columns=cols_to_encode, **kwargs) # In some cases, we are mapping across all of the data. 
def take_2d_positional(self, index=None, columns=None):
    # `index` and `columns` are forwarded as the row and column positions of the
    # underlying frame's selection; the selected frame is wrapped into a new
    # query compiler.
    # NOTE(review): no docstring added because the original has none, presumably
    # inherited from the base query compiler — confirm.
    selected_frame = self._modin_frame.take_2d_labels_or_positional(
        row_positions=index,
        col_positions=columns,
    )
    return self.__constructor__(selected_frame)
""" partition = partition.copy() try: partition.iloc[row_internal_indices, col_internal_indices] = item except ValueError: # `copy` is needed to avoid "ValueError: buffer source array is read-only" for `item` # because the item may be converted to the type that is in the dataframe. # TODO: in the future we will need to convert to the correct type manually according # to the following warning. Example: "FutureWarning: Setting an item of incompatible # dtype is deprecated and will raise in a future error of pandas. Value '[1.38629436]' # has dtype incompatible with int64, please explicitly cast to a compatible dtype first." partition.iloc[row_internal_indices, col_internal_indices] = item.copy() return partition if not is_scalar(item): ( broadcasted_item, broadcasted_dtypes, row_numeric_index, col_numeric_index, ) = broadcast_item( self, row_numeric_index, col_numeric_index, item, need_columns_reindex=need_columns_reindex, ) else: broadcasted_item, broadcasted_dtypes = item, pandas.Series( [extract_dtype(item)] * len(col_numeric_index) ) new_dtypes = None if ( # compute dtypes only if assigning entire columns isinstance(row_numeric_index, slice) and row_numeric_index == slice(None) and self.frame_has_materialized_dtypes ): new_dtypes = self.dtypes.copy() new_dtypes.iloc[col_numeric_index] = broadcasted_dtypes.values new_modin_frame = self._modin_frame.apply_select_indices( axis=None, func=iloc_mut, row_labels=row_numeric_index, col_labels=col_numeric_index, new_index=self.index, new_columns=self.columns, new_dtypes=new_dtypes, keep_remaining=True, item_to_distribute=broadcasted_item, ) return self.__constructor__(new_modin_frame) def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): new_modin_frame = self._modin_frame.sort_by( 0, columns, ascending=ascending, **kwargs ) return self.__constructor__(new_modin_frame) def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): if not is_list_like(rows): rows = [rows] 
def case_when(self, caselist):
    # Every element of every case tuple that is a query compiler of the same
    # type as `self` is replaced by its underlying frame; all other elements
    # are passed through untouched.
    # NOTE(review): no docstring added because the original has none, presumably
    # inherited from the base query compiler — confirm.
    own_type = type(self)

    def _unwrap(operand):
        if isinstance(operand, own_type):
            return operand._modin_frame
        return operand

    frame_level_cases = [
        tuple(_unwrap(operand) for operand in case) for case in caselist
    ]
    new_frame = self._modin_frame.case_when(frame_level_cases)
    # The result keeps the shape hint of the original query compiler.
    return self.__constructor__(new_frame, shape_hint=self._shape_hint)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains ``QueryCompilerCaster`` class. ``QueryCompilerCaster`` is used for automatically casting query compiler arguments to the type of the current query compiler for query compiler class functions. This ensures compatibility between different query compiler classes. """ import functools import inspect import random from abc import ABC, abstractmethod from collections import defaultdict, namedtuple from types import FunctionType, MappingProxyType, MethodType from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, Union, ValuesView import pandas from pandas.core.indexes.frozen import FrozenList from typing_extensions import Self from modin.config import AutoSwitchBackend, Backend, BackendMergeCastInPlace from modin.config import context as config_context from modin.core.storage_formats.base.query_compiler import ( BaseQueryCompiler, QCCoercionCost, ) from modin.core.storage_formats.base.query_compiler_calculator import ( BackendCostCalculator, all_switchable_backends, ) from modin.error_message import ErrorMessage from modin.logging import disable_logging, get_logger from modin.logging.metrics import emit_metric from modin.utils import _inherit_docstrings, sentinel Fn = TypeVar("Fn", bound=Any) # Constant for the default class name when class_of_wrapped_fn is None # (represents functions in the modin.pandas module) MODIN_PANDAS_MODULE_NAME = "modin.pandas" def _normalize_class_name(class_of_wrapped_fn: Optional[str]) -> str: """ Normalize class name for logging and operation tracking. 
Parameters ---------- class_of_wrapped_fn : Optional[str] The name of the class that the function belongs to. `None` for functions in the modin.pandas module. Returns ------- str The normalized class name. Returns "modin.pandas" if input is None. """ return ( class_of_wrapped_fn if class_of_wrapped_fn is not None else MODIN_PANDAS_MODULE_NAME ) # This type describes a defaultdict that maps backend name (or `None` for # method implementation and not bound to any one extension) to the dictionary of # extensions for that backend. The keys of the inner dictionary are the names of # the extensions, and the values are the extensions themselves. EXTENSION_DICT_TYPE = defaultdict[Optional[str], dict[str, Any]] _NON_EXTENDABLE_ATTRIBUTES = { # we use these attributes to implement casting and backend dispatching, so # we can't allow extensions to override them. "__getattribute__", "__setattr__", "__delattr__", "__getattr__", "_getattribute__from_extension_impl", "_getattr__from_extension_impl", "get_backend", "move_to", "set_backend", "_get_extension", "_query_compiler", "_get_query_compiler", "_copy_into", "_update_inplace", "is_backend_pinned", "_set_backend_pinned", "pin_backend", "unpin_backend", "__dict__", } # Do not look up these attributes when searching for extensions. We use them # to implement the extension lookup itself. 
def _get_empty_qc_for_default_backend() -> BaseQueryCompiler:
    """
    Build a query compiler wrapping an empty pandas DataFrame.

    The query compiler is produced by the IO class of the factory that
    ``FactoryDispatcher.get_factory()`` returns.

    Returns
    -------
    BaseQueryCompiler
        An empty query compiler for the default backend.
    """
    # NOTE(review): function-scope import, presumably to avoid a circular
    # import with the dispatcher module — confirm before hoisting it.
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    io_cls = FactoryDispatcher.get_factory().io_cls
    empty_frame = pandas.DataFrame()
    return io_cls.from_pandas(empty_frame)
""" pass @abstractmethod def is_backend_pinned(self) -> bool: """ Get whether this object's data is pinned to a particular backend. Returns ------- bool True if the data is pinned. """ pass @abstractmethod def _set_backend_pinned(self, pinned: bool, inplace: bool) -> Optional[Self]: """ Update whether this object's data is pinned to a particular backend. Parameters ---------- pinned : bool Whether the data is pinned. inplace : bool, default: False Whether to update the object in place. Returns ------- Optional[Self] The object with the new pin state, if `inplace` is False. Otherwise, None. """ pass def pin_backend(self, inplace: bool = False) -> Optional[Self]: """ Pin the object's underlying data, preventing Modin from automatically moving it to another backend. Parameters ---------- inplace : bool, default: False Whether to update the object in place. Returns ------- Optional[Self] The newly-pinned object, if `inplace` is False. Otherwise, None. """ return self._set_backend_pinned(True, inplace) def unpin_backend(self, inplace: bool = False) -> Optional[Self]: """ Unpin the object's underlying data, allowing Modin to automatically move it to another backend. Parameters ---------- inplace : bool, default: False Whether to update the object in place. Returns ------- Optional[Self] The newly-unpinned object, if `inplace` is False. Otherwise, None. """ return self._set_backend_pinned(False, inplace) @abstractmethod def get_backend(self) -> str: """ Get the backend of this object. Returns ------- str The backend of this object. The backend name must be title-cased. """ pass @abstractmethod def set_backend( self, backend: str, inplace: bool = False, *, switch_operation: Optional[str] = None, ) -> Optional[Self]: """ Set the backend of this object. Parameters ---------- backend : str The new backend. inplace : bool, default: False Whether to update the object in place. 
@disable_logging
def _getattribute__from_extension_impl(
    self, item: str, extensions: EXTENSION_DICT_TYPE
):
    """
    Implement ``__getattribute__()`` for extensions.

    Python calls ``__getattribute__()`` every time an attribute of an object
    is accessed. This helper resolves `item` against the given extensions and
    hands back the extension only when it is a non-callable attribute.

    Parameters
    ----------
    item : str
        The name of the attribute to get.
    extensions : EXTENSION_DICT_TYPE
        The set of extensions.

    Returns
    -------
    Any
        The attribute from the extension (bound through ``__get__`` when the
        extension defines it), or `sentinel` if there is no non-callable
        extension for `item`.
    """
    # An extension property is only accessible if the backend supports it.
    extension = self._get_extension(item, extensions)
    if extension is sentinel:
        return sentinel
    if callable(extension):
        # Callable extensions are implemented by wrapping them in methods that
        # dispatch to the correct backend. Those wrappers are found through the
        # usual object.__getattribute__() lookup instead of here. For example,
        # if sort_values() has been extended, __getattribute__('sort_values')
        # should return a wrapper that picks the right extension once invoked.
        return sentinel
    if hasattr(extension, "__get__"):
        return extension.__get__(self)
    return extension
def visit_nested_args(arguments, fn: callable):
    """
    Visit each argument recursively, calling fn on each one.

    Lists and dicts are updated in place and returned as the same object.
    Immutable containers (tuples, ``FrozenList``, dict value views and
    ``pandas.NamedAgg``) cannot be updated in place, so they are rebuilt and
    the rebuilt container is stored back into its parent.

    Parameters
    ----------
    arguments : tuple or dict
        The (possibly nested) arguments to visit.
    fn : Callable to apply to matching arguments
        Called on every leaf value; its return value replaces the leaf.

    Returns
    -------
    tuple or dict
        Returns args and kwargs with all query compilers casted to current_qc.
        Any other object is returned unchanged.
    """
    if isinstance(arguments, pandas.NamedAgg):
        # NamedAgg needs special treatment because it's an immutable subclass
        # of tuple that can't be constructed from another tuple.
        return pandas.NamedAgg(
            column=fn(arguments.column), aggfunc=fn(arguments.aggfunc)
        )

    immutable_types = (FrozenList, tuple, ValuesView)
    if isinstance(arguments, immutable_types):
        args_type = type(arguments)
        # ValuesView, which we might get from dict.values(), is immutable,
        # but not constructable, so we convert it to a tuple. Otherwise,
        # we return an object of the same type as the input.
        rebuilt_type = tuple if issubclass(args_type, ValuesView) else args_type
        return rebuilt_type(visit_nested_args(list(arguments), fn))

    types_to_recursively_visit = (list, dict, *immutable_types)

    def _visit(value):
        """Recurse into containers; apply `fn` to anything else."""
        if isinstance(value, types_to_recursively_visit):
            # The result must be kept: for immutable containers the recursive
            # call returns a NEW object, and dropping it would silently lose
            # every value `fn` produced inside that container. For lists and
            # dicts the same (mutated) object comes back, so re-assigning it
            # is a no-op.
            return visit_nested_args(value, fn)
        return fn(value)

    if isinstance(arguments, list):
        for position, value in enumerate(arguments):
            arguments[position] = _visit(value)
    elif isinstance(arguments, dict):
        # Only values are replaced, so the dict never changes size while it
        # is being iterated.
        for key, value in arguments.items():
            arguments[key] = _visit(value)
    return arguments
def apply_argument_cast_to_class(klass: type) -> type:
    """
    Wrap every function-like attribute of a class in the argument caster.

    Plain functions, classmethods and staticmethods found on `klass`
    (including inherited ones) are replaced by their casting wrappers. The
    unwrapped implementation is registered as the default extension when no
    default exists yet for that name.

    Parameters
    ----------
    klass : type
        The class to apply argument casting to.

    Returns
    -------
    type
        The class with argument casting applied to all functions.
    """
    all_attrs = dict(inspect.getmembers(klass))
    # This is required because inspect converts class methods to member functions
    all_attrs.update(vars(klass))

    for attr_name, attr_value in all_attrs.items():
        if attr_name in _NON_EXTENDABLE_ATTRIBUTES:
            continue
        if not isinstance(attr_value, (FunctionType, classmethod, staticmethod)):
            continue

        if isinstance(attr_value, classmethod):
            wrapping_function_type = classmethod
        elif isinstance(attr_value, staticmethod):
            wrapping_function_type = staticmethod
        else:
            wrapping_function_type = MethodType
        is_descriptor = wrapping_function_type is not MethodType

        # classmethod/staticmethod objects keep the real function in __func__.
        implementation_function = (
            attr_value.__func__ if is_descriptor else attr_value
        )

        default_extensions = klass._extensions[None]
        if attr_name not in default_extensions:
            # Register the original implementation as the default
            # extension. We fall back to this implementation if the
            # object's backend does not have an implementation for this
            # method.
            default_extensions[attr_name] = implementation_function

        casting_implementation = wrap_function_in_argument_caster(
            klass=klass,
            f=implementation_function,
            wrapping_function_type=wrapping_function_type,
            extensions=klass._extensions,
            name=attr_name,
        )
        # Restore the original descriptor kind around the casting wrapper.
        if is_descriptor:
            wrapped = wrapping_function_type(casting_implementation)
        else:
            wrapped = casting_implementation

        if attr_name not in klass.__dict__:
            # If this class's method comes from a superclass (i.e.
            # it's not in klass.__dict__), mark it so that
            # modin.utils._inherit_docstrings knows that the method
            # must get its docstrings from its superclass.
            wrapped._wrapped_superclass_method = attr_value
        setattr(klass, attr_name, wrapped)
    return klass
""" input_backend = input_qc.get_backend() if ( function_name in _CLASS_AND_BACKEND_TO_PRE_OP_SWITCH_METHODS[ BackendAndClassName( backend=input_qc.get_backend(), class_name=class_of_wrapped_fn ) ] ): result_backend = _get_backend_for_auto_switch( input_qc=input_qc, class_of_wrapped_fn=class_of_wrapped_fn, function_name=function_name, arguments=arguments, ) else: result_backend = input_backend def cast_to_qc(arg: Any) -> Any: if not ( isinstance(arg, QueryCompilerCaster) and arg._get_query_compiler() is not None and arg.get_backend() != result_backend ): return arg arg.set_backend( result_backend, inplace=True, switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{function_name}", ) return arg return result_backend, cast_to_qc def _maybe_switch_backend_post_op( result: Any, function_name: str, qc_list: list[BaseQueryCompiler], starting_backend: str, class_of_wrapped_fn: Optional[str], pin_backend: bool, arguments: MappingProxyType[str, Any], ) -> Any: """ Possibly switch the backend of the result of a function. Use cost-based optimization to determine whether to switch the backend of the result of a function. If the function returned a QueryCompilerCaster and the cost of switching is less than the cost of staying on the current backend, we switch. If there are multiple backends we can switch to, we choose the one that minimizes cost_to_move - cost_to_stay. Parameters ---------- result : Any The result of the function. function_name : str The name of the function. qc_list : list[BaseQueryCompiler] The list of query compilers that were arguments to the function. starting_backend : str The backend used to run the function. class_of_wrapped_fn : Optional[str] The name of the class that the function belongs to. `None` for functions in the modin.pandas module. pin_backend : bool Whether the result should have its backend pinned, and therefore not moved. arguments : MappingProxyType[str, Any] Mapping from operation argument names to their values. 
Returns ------- Any The result of the function, possibly with its backend switched. """ # If any input QC was pinned, then the output should be as well. if pin_backend: if isinstance(result, QueryCompilerCaster): result.pin_backend(inplace=True) return result if ( # only apply post-operation switch to nullary and unary methods len(qc_list) in (0, 1) and function_name in _CLASS_AND_BACKEND_TO_POST_OP_SWITCH_METHODS[ BackendAndClassName( backend=( qc_list[0].get_backend() if len(qc_list) == 1 else starting_backend ), class_name=class_of_wrapped_fn, ) ] # if the operation did not return a query compiler, we can't switch the # backend of the result. and isinstance(result, QueryCompilerCaster) and (input_qc := result._get_query_compiler()) is not None ): return result.move_to( _get_backend_for_auto_switch( input_qc=input_qc, class_of_wrapped_fn=class_of_wrapped_fn, function_name=function_name, arguments=arguments, ), switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{function_name}", ) return result def _get_backend_for_auto_switch( input_qc: BaseQueryCompiler, class_of_wrapped_fn: str, function_name: str, arguments: MappingProxyType[str, Any], ) -> str: """ Get the best backend to switch to. Use cost-based optimization to determine whether to switch the backend of the arguments to a function. If the cost of switching is less than the cost of staying on the current backend, we switch. If there are multiple backends we can switch to, we choose the one that minimizes cost_to_move - cost_to_stay. Parameters ---------- input_qc : BaseQueryCompiler The query compiler representing the starting backend. class_of_wrapped_fn : Optional[str] The name of the class that the function belongs to. `None` for functions in the modin.pandas module. function_name : str The name of the function. arguments : MappingProxyType[str, Any] Mapping from operation argument names to their values. Returns ------- str The name of the best backend to switch to. 
""" # TODO(https://github.com/modin-project/modin/issues/7503): Make costing # methods take backend instead of query compiler type so that we don't # have to use the dispatcher to figure out the appropriate type for each # backend. from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher # Does not need to be secure, should not use system entropy metrics_group = "%04x" % random.randrange(16**4) starting_backend = input_qc.get_backend() min_move_stay_delta = None best_backend = starting_backend stay_cost = input_qc.stay_cost( api_cls_name=class_of_wrapped_fn, operation=function_name, arguments=arguments, ) data_max_shape = input_qc._max_shape() emit_metric( f"hybrid.auto.api.{class_of_wrapped_fn}.{function_name}.group.{metrics_group}", 1, ) emit_metric( f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.stay_cost", stay_cost, ) emit_metric( f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.rows", data_max_shape[0], ) emit_metric( f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.cols", data_max_shape[1], ) for backend in all_switchable_backends(): if backend == starting_backend: continue move_to_class = FactoryDispatcher._get_prepared_factory_for_backend( backend=backend ).io_cls.query_compiler_cls move_to_cost = input_qc.move_to_cost( move_to_class, api_cls_name=class_of_wrapped_fn, operation=function_name, arguments=arguments, ) other_execute_cost = move_to_class.move_to_me_cost( input_qc, api_cls_name=class_of_wrapped_fn, operation=function_name, arguments=arguments, ) if ( move_to_cost is not None and stay_cost is not None and other_execute_cost is not None ): if stay_cost >= QCCoercionCost.COST_IMPOSSIBLE: # We cannot execute the workload on the current engine # disregard the move_to_cost and just consider whether # the other engine can execute the workload move_stay_delta = other_execute_cost - stay_cost else: # We can execute this workload if we need to, consider # move_to_cost/transfer time in 
def _get_extension_for_method(
    name: str,
    extensions: EXTENSION_DICT_TYPE,
    backend: str,
    args: tuple,
    wrapping_function_type: Optional[
        Union[type[classmethod], type[staticmethod], type[MethodType]]
    ],
) -> callable:
    """
    Pick the implementation of a method for a given backend.

    The backend-specific extension wins; otherwise the default extension
    (registered under the `None` key) is used.

    Parameters
    ----------
    name : str
        The name of the method.
    extensions : EXTENSION_DICT_TYPE
        The extension dictionary for the modin-API-level object (e.g. class
        DataFrame or module modin.pandas) that the method belongs to.
    backend : str
        The backend to use for this method call.
    args : tuple
        The arguments to the method. Only ``args[0]`` is read, and only to
        build the error message for regular methods.
    wrapping_function_type : Union[type[classmethod], type[staticmethod], type[MethodType]]
        The type of the original function that `f` implements.
        - `None` means we are wrapping a free function, e.g. pd.concat()
        - `classmethod` means we are wrapping a classmethod.
        - `staticmethod` means we are wrapping a staticmethod.
        - `MethodType` means we are wrapping a regular method of a class.

    Returns
    -------
    callable
        The implementation of the method for the given backend.

    Raises
    ------
    AttributeError
        If neither the backend nor the default extensions define `name`.
    """
    backend_extensions = extensions[backend]
    if name in backend_extensions:
        return backend_extensions[name]

    default_extensions = extensions[None]
    if name in default_extensions:
        return default_extensions[name]

    if wrapping_function_type is MethodType:
        # When python invokes a method on an object, it passes the object as
        # the first positional argument.
        owner = f"{(type(args[0]).__name__)} object"
    else:
        owner = "module 'modin.pandas'"
    raise AttributeError(owner + f" has no attribute {name}")
""" @functools.wraps(f) def f_with_argument_casting(*args: Tuple, **kwargs: Dict) -> Any: """ Add casting for query compiler arguments. Parameters ---------- *args : tuple The function arguments. **kwargs : dict The function keyword arguments. Returns ------- Any """ if wrapping_function_type in (classmethod, staticmethod): # TODO: currently we don't support any kind of casting or extension # for classmethod or staticmethod. return f(*args, **kwargs) # f() may make in-place updates to some of its arguments. If we cast # an argument and then f() updates it in place, the updates will not # be reflected in the original object. As a fix, we keep track of all # the in-place updates that f() makes, and once f() is finished, we # copy the updates back into the original objects. The query compiler # interface is mostly immutable (the only exceptions being the mutable # index and column properties), so to check for an in-place update, we # check whether an input's query compiler has changed its identity. 
InplaceUpdateTracker = namedtuple( "InplaceUpdateTracker", ["input_castable", "original_query_compiler", "new_castable"], ) inplace_update_trackers: list[InplaceUpdateTracker] = [] # The function name and class name of the function are passed to the calculator as strings class_of_wrapped_fn = klass.__name__ if klass is not None else None input_query_compilers: list[BaseQueryCompiler] = [] pin_target_backend = None input_backends: set[str] = set() def register_query_compilers(arg): nonlocal pin_target_backend if ( isinstance(arg, QueryCompilerCaster) and (qc := arg._get_query_compiler()) is not None ): arg_backend = arg.get_backend() input_backends.add(arg_backend) if pin_target_backend is not None: if arg.is_backend_pinned() and arg_backend != pin_target_backend: raise ValueError( f"Cannot combine arguments that are pinned to conflicting backends ({pin_target_backend}, {arg_backend})" ) elif arg.is_backend_pinned(): pin_target_backend = arg_backend input_query_compilers.append(qc) elif isinstance(arg, BaseQueryCompiler): # We might get query compiler arguments in __init__() input_query_compilers.append(arg) return arg visit_nested_args(args, register_query_compilers) visit_nested_args(kwargs, register_query_compilers) # Before determining any automatic switches, we perform the following checks: # 1. If the global AutoSwitchBackend configuration variable is set to False, do not switch. # 2. If there's only one query compiler and it's pinned, do not switch. # 3. If there are multiple query compilers, and at least one is pinned to a particular # backend, then switch to that backend. # 4. If there are multiple query compilers, at least two of which are pinned to distinct # backends, raise a ValueError. if len(input_query_compilers) == 0: input_backend = Backend.get() # For nullary functions, we need to create a dummy query compiler # to calculate the cost of switching backends. We should only # create the dummy query compiler once per backend. 
input_qc_for_pre_op_switch = _BACKEND_TO_EMPTY_QC[input_backend] else: input_qc_for_pre_op_switch = input_query_compilers[0] input_backend = input_qc_for_pre_op_switch.get_backend() # Skip the casting code if there are < 2 input backends and either # auto-switching is disabled or the inputs are pinned to the input # backend. if len(input_backends) < 2 and ( not AutoSwitchBackend.get() or pin_target_backend is not None ): f_to_apply = _get_extension_for_method( name=name, extensions=extensions, backend=( pin_target_backend if pin_target_backend is not None else input_backend ), args=args, wrapping_function_type=wrapping_function_type, ) result = f_to_apply(*args, **kwargs) if ( isinstance(result, QueryCompilerCaster) and pin_target_backend is not None ): result._set_backend_pinned(True, inplace=True) return result # Bind the arguments using the function implementation for the input # backend. TODO(https://github.com/modin-project/modin/issues/7525): # Ideally every implementation would have the same signature. bound_arguments = inspect.signature( _get_extension_for_method( name=name, extensions=extensions, backend=input_backend, args=args, wrapping_function_type=wrapping_function_type, ), ).bind(*args, **kwargs) bound_arguments.apply_defaults() args_dict = MappingProxyType(bound_arguments.arguments) if len(input_query_compilers) < 2: # No need to check should_pin_result() again, since we have already done so above. 
result_backend, cast_to_qc = _maybe_switch_backend_pre_op( name, input_qc=input_qc_for_pre_op_switch, class_of_wrapped_fn=class_of_wrapped_fn, arguments=args_dict, ) else: preop_switch = ( name in _CLASS_AND_BACKEND_TO_PRE_OP_SWITCH_METHODS[ BackendAndClassName( backend=input_backend, class_name=class_of_wrapped_fn, ) ] ) calculator: BackendCostCalculator = BackendCostCalculator( operation_arguments=args_dict, api_cls_name=class_of_wrapped_fn, operation=name, query_compilers=input_query_compilers, preop_switch=preop_switch, ) if pin_target_backend is None: result_backend = calculator.calculate() else: result_backend = pin_target_backend def cast_to_qc(arg): if not ( isinstance(arg, QueryCompilerCaster) and arg._get_query_compiler() is not None and arg.get_backend() != result_backend ): return arg if BackendMergeCastInPlace.get(): arg.set_backend( result_backend, switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{name}", inplace=True, ) assert arg.get_backend() == result_backend cast = arg else: cast = arg.set_backend( result_backend, switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{name}", inplace=False, ) inplace_update_trackers.append( InplaceUpdateTracker( input_castable=arg, original_query_compiler=cast._get_query_compiler(), new_castable=cast, ) ) return cast args = visit_nested_args(args, cast_to_qc) kwargs = visit_nested_args(kwargs, cast_to_qc) # `result_backend` may be different from `input_backend`, so we have to # look up the correct implementation based on `result_backend`. f_to_apply = _get_extension_for_method( name=name, extensions=extensions, backend=result_backend, args=args, wrapping_function_type=wrapping_function_type, ) # We have to set the global Backend correctly for I/O methods like # read_json() to use the correct backend. 
with config_context(Backend=result_backend): result = f_to_apply(*args, **kwargs) for ( original_castable, original_qc, new_castable, ) in inplace_update_trackers: new_qc = new_castable._get_query_compiler() if BackendMergeCastInPlace.get() or original_qc is not new_qc: new_castable._copy_into(original_castable) return _maybe_switch_backend_post_op( result, function_name=name, qc_list=input_query_compilers, starting_backend=result_backend, class_of_wrapped_fn=class_of_wrapped_fn, pin_backend=pin_target_backend is not None, arguments=args_dict, ) f_with_argument_casting._wrapped_method_for_casting = f return f_with_argument_casting _GENERAL_EXTENSIONS: EXTENSION_DICT_TYPE = defaultdict(dict) def wrap_free_function_in_argument_caster(name: str) -> callable: """ Get a wrapper for a free function that casts all castable arguments to a consistent query compiler. Parameters ---------- name : str The name of the function. Returns ------- callable A wrapper for a free function that casts all castable arguments to a consistent query compiler. """ def wrapper(f): if name not in _GENERAL_EXTENSIONS[None]: _GENERAL_EXTENSIONS[None][name] = f return wrap_function_in_argument_caster( klass=None, f=f, wrapping_function_type=None, extensions=_GENERAL_EXTENSIONS, name=name, ) return wrapper def register_function_for_post_op_switch( class_name: Optional[str], backend: str, method: str ) -> None: """ Register a function for post-operation backend switch. Parameters ---------- class_name : Optional[str] The name of the class that the function belongs to. `None` for functions in the modin.pandas module. backend : str Only consider switching when the starting backend is this one. method : str The name of the method to register. 
""" _CLASS_AND_BACKEND_TO_POST_OP_SWITCH_METHODS[ BackendAndClassName(backend=backend, class_name=class_name) ].add(method) def register_function_for_pre_op_switch( class_name: Optional[str], backend: str, method: str ) -> None: """ Register a function for pre-operation backend switch. Parameters ---------- class_name : Optional[str] The name of the class that the function belongs to. `None` for functions in the modin.pandas module. backend : str Only consider switching when the starting backend is this one. method : str The name of the method to register. """ _CLASS_AND_BACKEND_TO_PRE_OP_SWITCH_METHODS[ BackendAndClassName(backend=backend, class_name=class_name) ].add(method) ================================================ FILE: modin/core/storage_formats/pandas/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Contains utility functions for frame partitioning.""" from __future__ import annotations import re from math import ceil from typing import Generator, Hashable, List, Optional import numpy as np import pandas from modin.config import MinColumnPartitionSize, MinRowPartitionSize, NPartitions def compute_chunksize(axis_len: int, num_splits: int, min_block_size: int) -> int: """ Compute the number of elements (rows/columns) to include in each partition. Chunksize is defined the same for both axes. Parameters ---------- axis_len : int Element count in an axis. num_splits : int The number of splits. min_block_size : int Minimum number of rows/columns in a single split. Returns ------- int Integer number of rows/columns to split the DataFrame will be returned. """ if not isinstance(min_block_size, int) or min_block_size <= 0: raise ValueError( f"'min_block_size' should be int > 0, passed: {min_block_size=}" ) chunksize = axis_len // num_splits if axis_len % num_splits: chunksize += 1 # chunksize shouldn't be less than `min_block_size` to avoid a # large amount of small partitions. return max(chunksize, min_block_size) def split_result_of_axis_func_pandas( axis: int, num_splits: int, result: pandas.DataFrame, min_block_size: int, length_list: Optional[list] = None, ) -> list[pandas.DataFrame]: """ Split pandas DataFrame evenly based on the provided number of splits. Parameters ---------- axis : {0, 1} Axis to split across. 0 means index axis when 1 means column axis. num_splits : int Number of splits to separate the DataFrame into. This parameter is ignored if `length_list` is specified. result : pandas.DataFrame DataFrame to split. min_block_size : int Minimum number of rows/columns in a single split. length_list : list of ints, optional List of slice lengths to split DataFrame into. This is used to return the DataFrame to its original partitioning schema. Returns ------- list of pandas.DataFrames Splitted dataframe represented by list of frames. 
""" return list( generate_result_of_axis_func_pandas( axis, num_splits, result, min_block_size, length_list ) ) def generate_result_of_axis_func_pandas( axis: int, num_splits: int, result: pandas.DataFrame, min_block_size: int, length_list: Optional[list] = None, ) -> Generator: """ Generate pandas DataFrame evenly based on the provided number of splits. Parameters ---------- axis : {0, 1} Axis to split across. 0 means index axis when 1 means column axis. num_splits : int Number of splits to separate the DataFrame into. This parameter is ignored if `length_list` is specified. result : pandas.DataFrame DataFrame to split. min_block_size : int Minimum number of rows/columns in a single split. length_list : list of ints, optional List of slice lengths to split DataFrame into. This is used to return the DataFrame to its original partitioning schema. Yields ------ Generator Generates 'num_splits' dataframes as a result of axis function. """ if num_splits == 1: yield result else: if length_list is None: length_list = get_length_list( result.shape[axis], num_splits, min_block_size ) # Inserting the first "zero" to properly compute cumsum indexing slices length_list = np.insert(length_list, obj=0, values=[0]) sums = np.cumsum(length_list) axis = 0 if isinstance(result, pandas.Series) else axis for i in range(len(sums) - 1): # We do this to restore block partitioning if axis == 0: chunk = result.iloc[sums[i] : sums[i + 1]] else: chunk = result.iloc[:, sums[i] : sums[i + 1]] # Sliced MultiIndex still stores all encoded values of the original index, explicitly # asking it to drop unused values in order to save memory. if isinstance(chunk.axes[axis], pandas.MultiIndex): chunk = chunk.set_axis( chunk.axes[axis].remove_unused_levels(), axis=axis, copy=False ) yield chunk def get_length_list(axis_len: int, num_splits: int, min_block_size: int) -> list: """ Compute partitions lengths along the axis with the specified number of splits. 
Parameters ---------- axis_len : int Element count in an axis. num_splits : int Number of splits along the axis. min_block_size : int Minimum number of rows/columns in a single split. Returns ------- list of ints List of integer lengths of partitions. """ chunksize = compute_chunksize(axis_len, num_splits, min_block_size) return [ ( chunksize if (i + 1) * chunksize <= axis_len else max(0, axis_len - i * chunksize) ) for i in range(num_splits) ] def length_fn_pandas(df): """ Compute number of rows of passed `pandas.DataFrame`. Parameters ---------- df : pandas.DataFrame Returns ------- int """ assert isinstance(df, pandas.DataFrame) return len(df) if len(df) > 0 else 0 def width_fn_pandas(df): """ Compute number of columns of passed `pandas.DataFrame`. Parameters ---------- df : pandas.DataFrame Returns ------- int """ assert isinstance(df, pandas.DataFrame) return len(df.columns) if len(df.columns) > 0 else 0 def get_group_names(regex: "re.Pattern") -> "List[Hashable]": """ Get named groups from compiled regex. Unnamed groups are numbered. Parameters ---------- regex : compiled regex Returns ------- list of column labels """ names = {v: k for k, v in regex.groupindex.items()} return [names.get(1 + i, i) for i in range(regex.groups)] def merge_partitioning(left, right, axis=1): """ Get the number of splits across the `axis` for the two dataframes being concatenated. 
Parameters ---------- left : PandasDataframe right : PandasDataframe axis : int, default: 1 Returns ------- int """ lshape = left._row_lengths_cache if axis == 0 else left._column_widths_cache rshape = right._row_lengths_cache if axis == 0 else right._column_widths_cache if lshape is not None and rshape is not None: res_shape = sum(lshape) + sum(rshape) chunk_size = compute_chunksize( axis_len=res_shape, num_splits=NPartitions.get(), min_block_size=( MinRowPartitionSize.get() if axis == 0 else MinColumnPartitionSize.get() ), ) return ceil(res_shape / chunk_size) else: lsplits = left._partitions.shape[axis] rsplits = right._partitions.shape[axis] return min(lsplits + rsplits, NPartitions.get()) ================================================ FILE: modin/db_conn.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module houses `ModinDatabaseConnection` class. `ModinDatabaseConnection` lets a single process make its own connection to a database to read from it. Whereas it's possible in pandas to pass an open connection directly to `read_sql`, the open connection is not pickleable in Modin, so each worker must open its own connection. 
`ModinDatabaseConnection` saves the arguments that would normally be used to make a db connection. It can make and provide a connection whenever the Modin driver or a worker wants one. """ from typing import Any, Dict, Optional, Sequence _PSYCOPG_LIB_NAME = "psycopg2" _SQLALCHEMY_LIB_NAME = "sqlalchemy" class UnsupportedDatabaseException(Exception): """Modin can't create a particular kind of database connection.""" pass class ModinDatabaseConnection: """ Creates a SQL database connection. Parameters ---------- lib : str The library for the SQL connection. *args : iterable Positional arguments to pass when creating the connection. **kwargs : dict Keyword arguments to pass when creating the connection. """ lib: str args: Sequence kwargs: Dict _dialect_is_microsoft_sql_cache: Optional[bool] def __init__(self, lib: str, *args: Any, **kwargs: Any) -> None: lib = lib.lower() if lib not in (_PSYCOPG_LIB_NAME, _SQLALCHEMY_LIB_NAME): raise UnsupportedDatabaseException(f"Unsupported database library {lib}") self.lib = lib self.args = args self.kwargs = kwargs self._dialect_is_microsoft_sql_cache = None def _dialect_is_microsoft_sql(self) -> bool: """ Tell whether this connection requires Microsoft SQL dialect. If this is a sqlalchemy connection, create an engine from args and kwargs. If that engine's driver is pymssql or pyodbc, this connection requires Microsoft SQL. Otherwise, it doesn't. Returns ------- bool """ if self._dialect_is_microsoft_sql_cache is None: self._dialect_is_microsoft_sql_cache = False if self.lib == _SQLALCHEMY_LIB_NAME: from sqlalchemy import create_engine self._dialect_is_microsoft_sql_cache = create_engine( *self.args, **self.kwargs ).driver in ("pymssql", "pyodbc") return self._dialect_is_microsoft_sql_cache def get_connection(self) -> Any: """ Make the database connection and get it. For psycopg2, pass all arguments to psycopg2.connect() and return the result of psycopg2.connect(). 
For sqlalchemy, pass all arguments to sqlalchemy.create_engine() and return the result of calling connect() on the engine. Returns ------- Any The open database connection. """ if self.lib == _PSYCOPG_LIB_NAME: import psycopg2 return psycopg2.connect(*self.args, **self.kwargs) if self.lib == _SQLALCHEMY_LIB_NAME: from sqlalchemy import create_engine return create_engine(*self.args, **self.kwargs).connect() raise UnsupportedDatabaseException("Unsupported database library") def get_string(self) -> str: """ Get input connection string. Returns ------- str """ return self.args[0] def column_names_query(self, query: str) -> str: """ Get a query that gives the names of columns that `query` would produce. Parameters ---------- query : str The SQL query to check. Returns ------- str """ # This query looks odd, but it works in both PostgreSQL and Microsoft # SQL, which doesn't let you use a "limit" clause to select 0 rows. return f"SELECT * FROM ({query}) AS _MODIN_COUNT_QUERY WHERE 1 = 0" def row_count_query(self, query: str) -> str: """ Get a query that gives the names of rows that `query` would produce. Parameters ---------- query : str The SQL query to check. Returns ------- str """ return f"SELECT COUNT(*) FROM ({query}) AS _MODIN_COUNT_QUERY" def partition_query(self, query: str, limit: int, offset: int) -> str: """ Get a query that partitions the original `query`. Parameters ---------- query : str The SQL query to get a partition. limit : int The size of the partition. offset : int Where the partition begins. 
Returns ------- str """ return ( ( f"SELECT * FROM ({query}) AS _MODIN_COUNT_QUERY ORDER BY(SELECT NULL)" + f" OFFSET {offset} ROWS FETCH NEXT {limit} ROWS ONLY" ) if self._dialect_is_microsoft_sql() else f"SELECT * FROM ({query}) AS _MODIN_COUNT_QUERY LIMIT " + f"{limit} OFFSET {offset}" ) ================================================ FILE: modin/distributed/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """API to operate on distributed objects.""" ================================================ FILE: modin/distributed/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """API to operate on distributed DataFrame objects.""" ================================================ FILE: modin/distributed/dataframe/pandas/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """API to operate on distributed pandas DataFrame objects.""" from .partitions import from_partitions, unwrap_partitions __all__ = ["unwrap_partitions", "from_partitions"] ================================================ FILE: modin/distributed/dataframe/pandas/partitions.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses API to operate on Modin DataFrame partitions that are pandas DataFrame(s).""" from typing import TYPE_CHECKING, Optional, Union import numpy as np from pandas._typing import Axes from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler from modin.pandas.dataframe import DataFrame, Series if TYPE_CHECKING: from modin.core.execution.dask.implementations.pandas_on_dask.partitioning import ( PandasOnDaskDataframeColumnPartition, PandasOnDaskDataframePartition, PandasOnDaskDataframeRowPartition, ) from modin.core.execution.ray.implementations.pandas_on_ray.partitioning import ( PandasOnRayDataframeColumnPartition, PandasOnRayDataframePartition, PandasOnRayDataframeRowPartition, ) from modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning import ( PandasOnUnidistDataframeColumnPartition, PandasOnUnidistDataframePartition, PandasOnUnidistDataframeRowPartition, ) PartitionUnionType = Union[ PandasOnRayDataframePartition, PandasOnDaskDataframePartition, PandasOnUnidistDataframePartition, PandasOnRayDataframeColumnPartition, PandasOnRayDataframeRowPartition, PandasOnDaskDataframeColumnPartition, PandasOnDaskDataframeRowPartition, PandasOnUnidistDataframeColumnPartition, PandasOnUnidistDataframeRowPartition, ] else: from typing import Any PartitionUnionType = Any def unwrap_partitions( api_layer_object: Union[DataFrame, Series], axis: 
Optional[int] = None, get_ip: bool = False, ) -> list: """ Unwrap partitions of the ``api_layer_object``. Parameters ---------- api_layer_object : DataFrame or Series The API layer object. axis : {None, 0, 1}, default: None The axis to unwrap partitions for (0 - row partitions, 1 - column partitions). If ``axis is None``, the partitions are unwrapped as they are currently stored. get_ip : bool, default: False Whether to get node ip address to each partition or not. Returns ------- list A list of Ray.ObjectRef/Dask.Future to partitions of the ``api_layer_object`` if Ray/Dask is used as an engine. Notes ----- If ``get_ip=True``, a list of tuples of Ray.ObjectRef/Dask.Future to node ip addresses and partitions of the ``api_layer_object``, respectively, is returned if Ray/Dask is used as an engine (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``). """ if not hasattr(api_layer_object, "_query_compiler"): raise ValueError( f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead." ) modin_frame = api_layer_object._query_compiler._modin_frame modin_frame._propagate_index_objs(None) if axis is None: def _unwrap_partitions() -> list: [p.drain_call_queue() for p in modin_frame._partitions.flatten()] def get_block(partition: PartitionUnionType) -> np.ndarray: if hasattr(partition, "force_materialization"): blocks = partition.force_materialization().list_of_blocks else: blocks = partition.list_of_blocks assert ( len(blocks) == 1 ), f"Implementation assumes that partition contains a single block, but {len(blocks)} received." 
def from_partitions(
    partitions: list,
    axis: Optional[int],
    index: Optional[Axes] = None,
    columns: Optional[Axes] = None,
    row_lengths: Optional[list] = None,
    column_widths: Optional[list] = None,
) -> DataFrame:
    """
    Assemble a Modin DataFrame on top of already existing remote partitions.

    Parameters
    ----------
    partitions : list
        Remote references to partitions (Ray.ObjectRef/Dask.Future, depending
        on the engine). Every element may alternatively be a
        ``(node ip reference, partition reference)`` tuple. Whether tuples
        are expected is decided by looking at the very first element only.
    axis : {None, 0 or 1}
        Describes the layout of `partitions`:

        * ``axis=0`` - a flat list of row partitions
        * ``axis=1`` - a flat list of column partitions
        * ``axis=None`` - a 2D list of block partitions
    index : sequence, optional
        Row labels of the resulting frame. Computed from the partitions
        when omitted.
    columns : sequence, optional
        Column labels of the resulting frame. Computed from the partitions
        when omitted.
    row_lengths : list, optional
        Heights of the block partitions. Derived while computing `index`
        when omitted.
    column_widths : list, optional
        Widths of the block partitions. Derived while computing `columns`
        when omitted.

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame backed by the passed partitions.

    Raises
    ------
    ValueError
        If `axis` is not one of ``None``, ``0`` or ``1``.

    Notes
    -----
    Supplying `index`, `columns`, `row_lengths` and `column_widths` up front
    avoids extra metadata computations when the DataFrame is created.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()
    # TODO(https://github.com/modin-project/modin/issues/5127):
    # Remove these assertions once the dependencies of this function all have types.
    assert factory is not None
    assert factory.io_cls is not None
    assert factory.io_cls.frame_cls is not None
    assert factory.io_cls.frame_cls._partition_mgr_cls is not None  # type: ignore[unreachable]

    frame_cls = factory.io_cls.frame_cls
    mgr_cls = frame_cls._partition_mgr_cls
    part_cls = mgr_cls._partition_class

    def wrap(item, has_ip):
        # Turn one user-supplied entry into an engine partition object.
        if has_ip:
            ip, remote = item
            return part_cls(remote, ip=ip)
        return part_cls(item)

    # Modin keeps partitions in a 2D NumPy array, so whatever layout was passed
    # has to be arranged into a 2D grid first.
    if axis is None:
        # 2D list of blocks: keep the layout as it is.
        has_ip = isinstance(partitions[0][0], tuple)
        grid = [[wrap(item, has_ip) for item in row] for row in partitions]
    elif axis == 0:
        # Row partitions: every partition becomes its own row of the grid.
        has_ip = isinstance(partitions[0], tuple)
        grid = [[wrap(item, has_ip)] for item in partitions]
    elif axis == 1:
        # Column partitions: all partitions share the single row of the grid.
        has_ip = isinstance(partitions[0], tuple)
        grid = [[wrap(item, has_ip) for item in partitions]]
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )
    parts = np.array(grid)

    index_was_missing = index is None
    columns_were_missing = columns is None

    if index_was_missing:
        index, per_block_rows = mgr_cls.get_indices(0, parts)
        if row_lengths is None:
            row_lengths = [len(labels) for labels in per_block_rows]
    if columns_were_missing:
        columns, per_block_cols = mgr_cls.get_indices(1, parts)
        if column_widths is None:
            column_widths = [len(labels) for labels in per_block_cols]

    frame = frame_cls(
        parts,
        index,
        columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )

    # When both label sets were just computed from the partitions no
    # synchronization is requested. Otherwise synchronize along the axis whose
    # labels were supplied by the caller (``None`` stands for both axes).
    if not (index_was_missing and columns_were_missing):
        if index_was_missing:
            axis_to_sync = 1
        elif columns_were_missing:
            axis_to_sync = 0
        else:
            axis_to_sync = None
        frame.synchronize_labels(axis=axis_to_sync)

    return DataFrame(query_compiler=PandasQueryCompiler(frame))
class ErrorMessage(object):
    """
    Collection of class-level helpers that emit Modin warnings and errors.

    The class is never instantiated; the state needed to de-duplicate
    warnings is stored in class attributes.
    """

    # Only print full ``default to pandas`` warning one time.
    printed_default_to_pandas = False
    printed_warnings: Set[int] = set()  # Set of hashes of printed warnings

    @classmethod
    def not_implemented(cls, message: str = "") -> NoReturn:
        """
        Log and raise ``NotImplementedError``.

        Parameters
        ----------
        message : str, default: ""
            Text of the error; a generic sentence is used when empty.
        """
        text = (
            "This functionality is not yet available in Modin."
            if message == ""
            else message
        )
        get_logger().info(f"Modin Error: NotImplementedError: {text}")
        raise NotImplementedError(
            f"{text}\n"
            "To request implementation, file an issue at "
            "https://github.com/modin-project/modin/issues or, if that's "
            "not possible, send an email to feature_requests@modin.org."
        )

    @classmethod
    def single_warning(
        cls, message: str, category: Optional[type[Warning]] = None
    ) -> None:
        """
        Emit `message` as a warning unless the same text was emitted before.

        Parameters
        ----------
        message : str
            Text of the warning.
        category : type[Warning], optional
            Warning category forwarded to ``warnings.warn``.
        """
        # note that there should not be identical messages with different categories since
        # only the message is used as the hash key.
        key = hash(message)
        log = get_logger()
        if key not in cls.printed_warnings:
            log.debug(f"Modin Warning: Single Warning: {message} was raised.")
            warnings.warn(message, category=category)
            cls.printed_warnings.add(key)
        else:
            log.debug(
                f"Modin Warning: Single Warning: {message} was raised and suppressed."
            )

    @classmethod
    def default_to_pandas(cls, message: str = "", reason: str = "") -> None:
        """
        Warn that an operation is executed by the pandas implementation.

        Parameters
        ----------
        message : str, default: ""
            Name of the unsupported functionality; a generic sentence is
            used when empty.
        reason : str, default: ""
            Optional explanation appended to the warning.
        """
        # TODO(https://github.com/modin-project/modin/issues/7429): Use
        # frame-level engine config.
        if message == "":
            text = "Defaulting to pandas implementation."
        else:
            execution_str = get_current_execution()
            text = (
                f"{message} is not currently supported by {execution_str}, "
                "defaulting to pandas implementation."
            )

        # The link to the documentation is attached to the first warning only.
        if not cls.printed_default_to_pandas:
            text = (
                f"{text}\n"
                "Please refer to "
                "https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation."
            )
            cls.printed_default_to_pandas = True

        if reason:
            text += f"\nReason: {reason}"

        get_logger().debug(f"Modin Warning: Default to pandas: {text}")
        warnings.warn(text)

    @classmethod
    def catch_bugs_and_request_email(
        cls, failure_condition: bool, extra_log: str = ""
    ) -> None:
        """
        Raise an internal error when `failure_condition` holds.

        Parameters
        ----------
        failure_condition : bool
            The error is raised only if this is truthy.
        extra_log : str, default: ""
            Additional details that are logged and appended to the error.
        """
        if not failure_condition:
            return
        get_logger().info(f"Modin Error: Internal Error: {extra_log}")
        raise Exception(
            "Internal Error. "
            "Please visit https://github.com/modin-project/modin/issues "
            "to file an issue with the traceback and the command that "
            "caused this error. If you can't file a GitHub issue, "
            f"please email bug_reports@modin.org.\n{extra_log}"
        )

    @classmethod
    def non_verified_udf(cls) -> None:
        """Warn that the supplied user-defined function was not verified."""
        get_logger().debug("Modin Warning: Non Verified UDF")
        warnings.warn(
            "User-defined function verification is still under development in Modin. "
            "The function provided is not verified."
        )

    @classmethod
    def bad_type_for_numpy_op(cls, function_name: str, operand_type: type) -> None:
        """
        Warn once that a Modin NumPy function received an unsupported operand.

        Parameters
        ----------
        function_name : str
            Name of the function that was called.
        operand_type : type
            Type of the offending operand.
        """
        text = f"Modin NumPy only supports objects of modin.numpy.array types for {function_name}, not {operand_type}. Defaulting to NumPy."
        cls.single_warning(text)

    @classmethod
    def mismatch_with_pandas(cls, operation: str, message: str) -> None:
        """
        Warn once that `operation` behaves differently from pandas.

        Parameters
        ----------
        operation : str
            Name of the operation.
        message : str
            Description of the mismatch.
        """
        get_logger().debug(
            f"Modin Warning: {operation} mismatch with pandas: {message}"
        )
        text = f"`{operation}` implementation has mismatches with pandas:\n{message}."
        cls.single_warning(text)

    @classmethod
    def warn(cls, message: str) -> None:
        """
        Emit `message` through ``warnings.warn``.

        Parameters
        ----------
        message : str
            Text of the warning.
        """
        warnings.warn(message)

    @classmethod
    def not_initialized(cls, engine: str, code: str) -> None:
        """
        Warn that the execution environment of `engine` is being initialized.

        Parameters
        ----------
        engine : str
            Name of the engine.
        code : str
            Python snippet that the user may run to avoid this warning.
        """
        get_logger().debug(f"Modin Warning: Not Initialized: {engine}")
        warnings.warn(
            f"{engine} execution environment not yet initialized. Initializing...\n"
            "To remove this warning, run the following python code before doing dataframe operations:\n"
            f"{code}"
        )
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. from .pipeline import PandasQueryPipeline __all__ = [ "PandasQueryPipeline", ] ================================================ FILE: modin/experimental/batch/pipeline.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses ``PandasQueryPipeline`` and ``PandasQuery`` classes, that implement a batch pipeline protocol for Modin Dataframes.""" from typing import Callable, Optional import numpy as np import modin.pandas as pd from modin.config import NPartitions from modin.core.execution.ray.implementations.pandas_on_ray.dataframe.dataframe import ( PandasOnRayDataframe, ) from modin.core.storage_formats.pandas import PandasQueryCompiler from modin.error_message import ErrorMessage from modin.utils import get_current_execution class PandasQuery(object): """ Internal representation of a single query in a pipeline. This object represents a single function to be pipelined in a batch pipeline. Parameters ---------- func : Callable The function to apply to the dataframe. is_output : bool, default: False Whether this query is an output query and should be passed both to the next query, and directly to postprocessing. repartition_after : bool, default: False Whether to repartition after this query is computed. Currently, repartitioning is only supported if there is 1 partition prior to repartitioning. fan_out : bool, default: False Whether to fan out this node. If True and only 1 partition is passed as input, the partition is replicated `PandasQueryPipeline.num_partitions` (default: `NPartitions.get`) times, and the function is called on each. The `reduce_fn` must also be specified. pass_partition_id : bool, default: False Whether to pass the numerical partition id to the query. reduce_fn : Callable, default: None The reduce function to apply if `fan_out` is set to True. This takes the `PandasQueryPipeline.num_partitions` (default: `NPartitions.get`) partitions that result from this query, and combines them into 1 partition. output_id : int, default: None An id to assign to this node if it is an output. Notes ----- `func` must be a function that is applied along an axis of the dataframe. 
Use `pandas` for any module level functions inside `func` since it operates directly on partitions. """ def __init__( self, func: Callable, is_output: bool = False, repartition_after: bool = False, fan_out: bool = False, pass_partition_id: bool = False, reduce_fn: Optional[Callable] = None, output_id: Optional[int] = None, ): self.func = func self.is_output = is_output self.repartition_after = repartition_after self.fan_out = fan_out self.pass_partition_id = pass_partition_id self.reduce_fn = reduce_fn self.output_id = output_id # List of sub-queries to feed into this query, if this query is an output node. self.operators = None class PandasQueryPipeline(object): """ Internal representation of a query pipeline. This object keeps track of the functions that compose to form a query pipeline. Parameters ---------- df : modin.pandas.Dataframe The dataframe to perform this pipeline on. num_partitions : int, optional The number of partitions to maintain for the batched dataframe. If not specified, the value is assumed equal to ``NPartitions.get()``. Notes ----- Only row-parallel pipelines are supported. All queries will be applied along the row axis. """ def __init__(self, df, num_partitions: Optional[int] = None): if get_current_execution() != "PandasOnRay" or ( not isinstance(df._query_compiler._modin_frame, PandasOnRayDataframe) ): # pragma: no cover ErrorMessage.not_implemented( "Batch Pipeline API is only implemented for `PandasOnRay` execution." ) ErrorMessage.single_warning( "The Batch Pipeline API is an experimental feature and still under development in Modin." ) self.df = df self.num_partitions = num_partitions if num_partitions else NPartitions.get() self.outputs = [] # List of output queries. self.query_list = [] # List of all queries. self.is_output_id_specified = ( False # Flag to indicate that `output_id` has been specified for a node. ) def update_df(self, df): """ Update the dataframe to perform this pipeline on. 
Parameters ---------- df : modin.pandas.DataFrame The new dataframe to perform this pipeline on. """ if get_current_execution() != "PandasOnRay" or ( not isinstance(df._query_compiler._modin_frame, PandasOnRayDataframe) ): # pragma: no cover ErrorMessage.not_implemented( "Batch Pipeline API is only implemented for `PandasOnRay` execution." ) self.df = df def add_query( self, func: Callable, is_output: bool = False, repartition_after: bool = False, fan_out: bool = False, pass_partition_id: bool = False, reduce_fn: Optional[Callable] = None, output_id: Optional[int] = None, ): """ Add a query to the current pipeline. Parameters ---------- func : Callable DataFrame query to perform. is_output : bool, default: False Whether this query should be designated as an output query. If `True`, the output of this query is passed both to the next query and directly to postprocessing. repartition_after : bool, default: False Whether the dataframe should be repartitioned after this query. Currently, repartitioning is only supported if there is 1 partition prior. fan_out : bool, default: False Whether to fan out this node. If True and only 1 partition is passed as input, the partition is replicated `self.num_partitions` (default: `NPartitions.get`) times, and the function is called on each. The `reduce_fn` must also be specified. pass_partition_id : bool, default: False Whether to pass the numerical partition id to the query. reduce_fn : Callable, default: None The reduce function to apply if `fan_out` is set to True. This takes the `self.num_partitions` (default: `NPartitions.get`) partitions that result from this query, and combines them into 1 partition. output_id : int, default: None An id to assign to this node if it is an output. Notes ----- Use `pandas` for any module level functions inside `func` since it operates directly on partitions. 
""" if not is_output and output_id is not None: raise ValueError("Output ID cannot be specified for non-output node.") if is_output: if not self.is_output_id_specified and output_id is not None: if len(self.outputs) != 0: raise ValueError("Output ID must be specified for all nodes.") if output_id is None and self.is_output_id_specified: raise ValueError("Output ID must be specified for all nodes.") self.query_list.append( PandasQuery( func, is_output, repartition_after, fan_out, pass_partition_id, reduce_fn, output_id, ) ) if is_output: self.outputs.append(self.query_list[-1]) if output_id is not None: self.is_output_id_specified = True self.outputs[-1].operators = self.query_list[:-1] self.query_list = [] def _complete_nodes(self, list_of_nodes, partitions): """ Run a sub-query end to end. Parameters ---------- list_of_nodes : list of PandasQuery The functions that compose this query. partitions : list of PandasOnRayDataframeVirtualPartition The partitions that compose the dataframe that is input to this sub-query. Returns ------- list of PandasOnRayDataframeVirtualPartition The partitions that result from computing the functions represented by `list_of_nodes`. """ for node in list_of_nodes: if node.fan_out: if len(partitions) > 1: ErrorMessage.not_implemented( "Fan out is only supported with DataFrames with 1 partition." 
) partitions[0] = partitions[0].force_materialization() partition_list = partitions[0].list_of_block_partitions partitions[0] = partitions[0].add_to_apply_calls(node.func, 0) partitions[0].drain_call_queue(num_splits=1) new_dfs = [] for i in range(1, self.num_partitions): new_dfs.append( type(partitions[0])( partition_list, full_axis=partitions[0].full_axis, ).add_to_apply_calls(node.func, i) ) new_dfs[-1].drain_call_queue(num_splits=1) def reducer(df): df_inputs = [df] for df in new_dfs: df_inputs.append(df.to_pandas()) return node.reduce_fn(df_inputs) partitions = [partitions[0].add_to_apply_calls(reducer)] elif node.repartition_after: if len(partitions) > 1: ErrorMessage.not_implemented( "Dynamic repartitioning is currently only supported for DataFrames with 1 partition." ) partitions[0] = ( partitions[0].add_to_apply_calls(node.func).force_materialization() ) new_dfs = [] def mask_partition(df, i): # pragma: no cover new_length = len(df.index) // self.num_partitions if i == self.num_partitions - 1: return df.iloc[i * new_length :] return df.iloc[i * new_length : (i + 1) * new_length] for i in range(self.num_partitions): new_dfs.append( type(partitions[0])( partitions[0].list_of_block_partitions, full_axis=partitions[0].full_axis, ).add_to_apply_calls(mask_partition, i) ) partitions = new_dfs else: if node.pass_partition_id: partitions = [ part.add_to_apply_calls(node.func, i) for i, part in enumerate(partitions) ] else: partitions = [ part.add_to_apply_calls(node.func) for part in partitions ] return partitions def compute_batch( self, postprocessor: Optional[Callable] = None, pass_partition_id: Optional[bool] = False, pass_output_id: Optional[bool] = False, ): """ Run the completed pipeline + any postprocessing steps end to end. Parameters ---------- postprocessor : Callable, default: None A postprocessing function to be applied to each output partition. 
The order of arguments passed is `df` (the partition), `output_id` (if `pass_output_id=True`), and `partition_id` (if `pass_partition_id=True`). pass_partition_id : bool, default: False Whether or not to pass the numerical partition id to the postprocessing function. pass_output_id : bool, default: False Whether or not to pass the output ID associated with output queries to the postprocessing function. Returns ------- list or dict or DataFrame If output ids are specified, a dictionary mapping output id to the resulting dataframe is returned, otherwise, a list of the resulting dataframes is returned. """ if len(self.outputs) == 0: ErrorMessage.single_warning( "No outputs to compute. Returning an empty list. Please specify outputs by calling `add_query` with `is_output=True`." ) return [] if not self.is_output_id_specified and pass_output_id: raise ValueError( "`pass_output_id` is set to True, but output ids have not been specified. " + "To pass output ids, please specify them using the `output_id` kwarg with pipeline.add_query" ) if self.is_output_id_specified: outs = {} else: outs = [] modin_frame = self.df._query_compiler._modin_frame partitions = modin_frame._partition_mgr_cls.row_partitions( modin_frame._partitions ) for node in self.outputs: partitions = self._complete_nodes(node.operators + [node], partitions) for part in partitions: part.drain_call_queue(num_splits=1) if postprocessor: output_partitions = [] for partition_id, partition in enumerate(partitions): args = [] if pass_output_id: args.append(node.output_id) if pass_partition_id: args.append(partition_id) output_partitions.append( partition.add_to_apply_calls(postprocessor, *args) ) else: output_partitions = [ part.add_to_apply_calls(lambda df: df) for part in partitions ] [ part.drain_call_queue(num_splits=self.num_partitions) for part in output_partitions ] # Ensures our result df is block partitioned. 
if not self.is_output_id_specified: outs.append(output_partitions) else: outs[node.output_id] = output_partitions if self.is_output_id_specified: final_results = {} id_df_iter = outs.items() else: final_results = [None] * len(outs) id_df_iter = enumerate(outs) for id, df in id_df_iter: partitions = [] for row_partition in df: partitions.append(row_partition.list_of_block_partitions) partitions = np.array(partitions) partition_mgr_class = PandasOnRayDataframe._partition_mgr_cls index, internal_rows = partition_mgr_class.get_indices(0, partitions) columns, internal_cols = partition_mgr_class.get_indices(1, partitions) result_modin_frame = PandasOnRayDataframe( partitions, index, columns, row_lengths=list(map(len, internal_rows)), column_widths=list(map(len, internal_cols)), ) query_compiler = PandasQueryCompiler(result_modin_frame) result_df = pd.DataFrame(query_compiler=query_compiler) final_results[id] = result_df return final_results ================================================ FILE: modin/experimental/core/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Experimental Modin's core functionality.""" ================================================ FILE: modin/experimental/core/execution/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental Modin's functionality related to execution engines supported.""" ================================================ FILE: modin/experimental/core/execution/dask/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Experimental Modin's functionality related to Dask execution engine.""" ================================================ FILE: modin/experimental/core/execution/dask/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental functionality related to Dask execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/experimental/core/execution/dask/implementations/pandas_on_dask/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language # governing permissions and limitations under the License. """Experimental functionality related to Dask execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/experimental/core/execution/ray/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental Modin's functionality related to Ray execution engine.""" ================================================ FILE: modin/experimental/core/execution/ray/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental functionality related to Ray execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/experimental/core/execution/unidist/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental Modin's functionality related to unidist execution engine.""" ================================================ FILE: modin/experimental/core/execution/unidist/implementations/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental Modin's functionality related to unidist execution engine and optimized for specific storage formats.""" ================================================ FILE: modin/experimental/core/execution/unidist/implementations/pandas_on_unidist/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Experimental functionality related to unidist execution engine and optimized for pandas storage format.""" ================================================ FILE: modin/experimental/core/io/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental IO functions implementations.""" from .glob.glob_dispatcher import ExperimentalGlobDispatcher from .sql.sql_dispatcher import ExperimentalSQLDispatcher from .text.csv_glob_dispatcher import ExperimentalCSVGlobDispatcher from .text.custom_text_dispatcher import ExperimentalCustomTextDispatcher __all__ = [ "ExperimentalCSVGlobDispatcher", "ExperimentalSQLDispatcher", "ExperimentalGlobDispatcher", "ExperimentalCustomTextDispatcher", ] ================================================ FILE: modin/experimental/core/io/glob/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental module that allows to work with various formats using glob syntax.""" ================================================ FILE: modin/experimental/core/io/glob/glob_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Module houses ``ExperimentalGlobDispatcher`` class that is used to read/write files of different formats in parallel.""" import glob import warnings import pandas from pandas.io.common import stringify_path from modin.config import NPartitions from modin.core.io.file_dispatcher import FileDispatcher from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler class ExperimentalGlobDispatcher(FileDispatcher): """Class implements reading/writing different formats, parallelizing by the number of files.""" @classmethod def _read(cls, **kwargs): """ Read data from `filepath_or_buffer` according to `kwargs` parameters. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of `read_*` function. **kwargs : dict Parameters of `read_*` function. Returns ------- new_query_compiler : BaseQueryCompiler Query compiler with imported data for further processing. Notes ----- The number of partitions is equal to the number of input files. """ if "filepath_or_buffer" in kwargs: path_key = "filepath_or_buffer" elif "path" in kwargs: path_key = "path" elif "path_or_buf" in kwargs: path_key = "path_or_buf" elif "path_or_buffer" in kwargs: path_key = "path_or_buffer" filepath_or_buffer = kwargs.pop(path_key) filepath_or_buffer = stringify_path(filepath_or_buffer) if not (isinstance(filepath_or_buffer, str) and "*" in filepath_or_buffer): return cls.single_worker_read( filepath_or_buffer, single_worker_read=True, reason="Buffers and single files are not supported", **kwargs, ) filepath_or_buffer = sorted(glob.glob(filepath_or_buffer)) if len(filepath_or_buffer) == 0: raise ValueError( f"There are no files matching the pattern: {filepath_or_buffer}" ) partition_ids = [None] * len(filepath_or_buffer) lengths_ids = [None] * len(filepath_or_buffer) widths_ids = [None] * len(filepath_or_buffer) if len(filepath_or_buffer) != NPartitions.get(): # do we need to do a repartitioning? 
warnings.warn("can be inefficient partitioning") for idx, file_name in enumerate(filepath_or_buffer): *partition_ids[idx], lengths_ids[idx], widths_ids[idx] = cls.deploy( func=cls.parse, f_kwargs={ "fname": file_name, **kwargs, }, num_returns=3, ) lengths = cls.materialize(lengths_ids) widths = cls.materialize(widths_ids) # while num_splits is 1, need only one value partition_ids = cls.build_partition(partition_ids, lengths, [widths[0]]) new_index, _ = cls.frame_cls._partition_mgr_cls.get_indices(0, partition_ids) new_columns, _ = cls.frame_cls._partition_mgr_cls.get_indices(1, partition_ids) return cls.query_compiler_cls( cls.frame_cls(partition_ids, new_index, new_columns) ) @classmethod def write(cls, qc, **kwargs): """ When `*` is in the filename, all partitions are written to their own separate file. The filenames is determined as follows: - if `*` is in the filename, then it will be replaced by the ascending sequence 0, 1, 2, … - if `*` is not in the filename, then the default implementation will be used. Parameters ---------- qc : BaseQueryCompiler The query compiler of the Modin dataframe that we want to run ``to__glob`` on. **kwargs : dict Parameters for ``pandas.to_(**kwargs)``. """ if "filepath_or_buffer" in kwargs: path_key = "filepath_or_buffer" elif "path" in kwargs: path_key = "path" elif "path_or_buf" in kwargs: path_key = "path_or_buf" elif "path_or_buffer" in kwargs: path_key = "path_or_buffer" filepath_or_buffer = kwargs.pop(path_key) filepath_or_buffer = stringify_path(filepath_or_buffer) if not ( isinstance(filepath_or_buffer, str) and "*" in filepath_or_buffer ) or not isinstance(qc, PandasQueryCompiler): warnings.warn("Defaulting to Modin core implementation") cls.base_write(qc, filepath_or_buffer, **kwargs) return # Be careful, this is a kind of limitation, but at the time of the first implementation, # getting a name in this way is quite convenient. # We can use this attribute because the names of the BaseIO's methods match pandas API. 
write_func_name = cls.base_write.__name__ def func(df, **kw): # pragma: no cover idx = str(kw["partition_idx"]) path = filepath_or_buffer.replace("*", idx) getattr(df, write_func_name)(path, **kwargs) return pandas.DataFrame() result = qc._modin_frame.apply_full_axis( 1, func, new_index=[], new_columns=[], enumerate_partitions=True ) cls.materialize( [part.list_of_blocks[0] for row in result._partitions for part in row] ) ================================================ FILE: modin/experimental/core/io/sql/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental SQL format type IO functions implementations.""" ================================================ FILE: modin/experimental/core/io/sql/sql_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses `ExperimentalSQLDispatcher` class.""" import warnings import numpy as np import pandas from modin.config import NPartitions from modin.core.io import SQLDispatcher class ExperimentalSQLDispatcher(SQLDispatcher): """Class handles experimental utils for reading SQL queries or database tables.""" __read_sql_with_offset = None @classmethod def preprocess_func(cls): # noqa: RT01 """Prepare a function for transmission to remote workers.""" if cls.__read_sql_with_offset is None: # sql deps are optional, so import only when needed from modin.experimental.core.io.sql.utils import read_sql_with_offset cls.__read_sql_with_offset = cls.put(read_sql_with_offset) return cls.__read_sql_with_offset @classmethod def _read( cls, sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize, dtype_backend, dtype, partition_column, lower_bound, upper_bound, max_sessions, ): # noqa: PR01 """ Read SQL query or database table into a DataFrame. Documentation for parameters can be found at `modin.read_sql`. Returns ------- BaseQueryCompiler A new query compiler with imported data for further processing. 
""" # sql deps are optional, so import only when needed from modin.experimental.core.io.sql.utils import get_query_info, is_distributed if not is_distributed(partition_column, lower_bound, upper_bound): message = "Defaulting to Modin core implementation; \ 'partition_column', 'lower_bound', 'upper_bound' must be different from None" warnings.warn(message) return cls.base_read( sql, con, index_col, coerce_float=coerce_float, params=params, parse_dates=parse_dates, columns=columns, chunksize=chunksize, dtype_backend=dtype_backend, dtype=dtype, ) # starts the distributed alternative cols_names, query = get_query_info(sql, con, partition_column) num_parts = min(NPartitions.get(), max_sessions if max_sessions else 1) num_splits = min(len(cols_names), num_parts) diff = (upper_bound - lower_bound) + 1 min_size = diff // num_parts rest = diff % num_parts partition_ids = [] index_ids = [] end = lower_bound - 1 func = cls.preprocess_func() for part in range(num_parts): if rest: size = min_size + 1 rest -= 1 else: size = min_size start = end + 1 end = start + size - 1 partition_id = cls.deploy( func, f_args=( partition_column, start, end, num_splits, query, con, index_col, coerce_float, params, parse_dates, columns, chunksize, dtype_backend, dtype, ), num_returns=num_splits + 1, ) partition_ids.append( [cls.frame_partition_cls(obj) for obj in partition_id[:-1]] ) index_ids.append(partition_id[-1]) new_index = pandas.RangeIndex(sum(cls.materialize(index_ids))) new_query_compiler = cls.query_compiler_cls( cls.frame_cls(np.array(partition_ids), new_index, cols_names) ) new_query_compiler._modin_frame.synchronize_labels(axis=0) return new_query_compiler ================================================ FILE: modin/experimental/core/io/sql/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Utilities for experimental SQL format type IO functions implementations.""" import pandas import pandas._libs.lib as lib from sqlalchemy import MetaData, Table, create_engine, inspect, text from modin.core.storage_formats.pandas.parsers import _split_result_for_readers def is_distributed(partition_column, lower_bound, upper_bound): """ Check if is possible to distribute a query with the given args. Parameters ---------- partition_column : str Column name used for data partitioning between the workers. lower_bound : int The minimum value to be requested from the `partition_column`. upper_bound : int The maximum value to be requested from the `partition_column`. Returns ------- bool Whether the given query is distributable or not. """ if ( (partition_column is not None) and (lower_bound is not None) and (upper_bound is not None) ): if upper_bound > lower_bound: return True raise InvalidArguments("upper_bound must be greater than lower_bound.") elif (partition_column is None) and (lower_bound is None) and (upper_bound is None): return False else: raise InvalidArguments( "Invalid combination of partition_column, lower_bound, upper_bound." + "All these arguments should be passed (distributed) or none of them (standard pandas)." 
) def is_table(engine, sql): """ Check if given `sql` parameter is a table name. Parameters ---------- engine : sqlalchemy.engine.base.Engine SQLAlchemy connection engine. sql : str SQL query to be executed or a table name. Returns ------- bool Whether `sql` a table name or not. """ return inspect(engine).has_table(sql) def get_table_metadata(engine, table): """ Extract all useful data from the given table. Parameters ---------- engine : sqlalchemy.engine.base.Engine SQLAlchemy connection engine. table : str Table name. Returns ------- sqlalchemy.sql.schema.Table Extracted metadata. """ metadata = MetaData() metadata.reflect(bind=engine, only=[table]) table_metadata = Table(table, metadata, autoload=True) return table_metadata def get_table_columns(metadata): """ Extract columns names and python types from the `metadata`. Parameters ---------- metadata : sqlalchemy.sql.schema.Table Table metadata. Returns ------- dict Dictionary with columns names and python types. """ cols = dict() for col in metadata.c: name = str(col).rpartition(".")[2] cols[name] = col.type.python_type.__name__ return cols def build_query_from_table(name): """ Create a query from the given table name. Parameters ---------- name : str Table name. Returns ------- str Query string. """ return "SELECT * FROM {0}".format(name) def check_query(query): """ Check query sanity. Parameters ---------- query : str Query string. """ q = query.lower() if "select " not in q: raise InvalidQuery("SELECT word not found in the query: {0}".format(query)) if " from " not in q: raise InvalidQuery("FROM word not found in the query: {0}".format(query)) def get_query_columns(engine, query): """ Extract columns names and python types from the `query`. Parameters ---------- engine : sqlalchemy.engine.base.Engine SQLAlchemy connection engine. query : str SQL query. Returns ------- dict Dictionary with columns names and python types. 
""" con = engine.connect() result = con.execute(text(query)) cols_names = list(result.keys()) values = list(result.first()) cols = dict() for i in range(len(cols_names)): cols[cols_names[i]] = type(values[i]).__name__ return cols def check_partition_column(partition_column, cols): """ Check `partition_column` existence and it's type. Parameters ---------- partition_column : str Column name used for data partitioning between the workers. cols : dict Dictionary with columns names and python types. """ for k, v in cols.items(): if k == partition_column: if v == "int": return raise InvalidPartitionColumn(f"partition_column must be int, and not {v}") raise InvalidPartitionColumn( f"partition_column {partition_column} not found in the query" ) def get_query_info(sql, con, partition_column): """ Compute metadata needed for query distribution. Parameters ---------- sql : str SQL query to be executed or a table name. con : SQLAlchemy connectable or str Database connection or url string. partition_column : str Column name used for data partitioning between the workers. Returns ------- list Columns names list. str Query string. """ engine = create_engine(con) if is_table(engine, sql): table_metadata = get_table_metadata(engine, sql) query = build_query_from_table(sql) cols = get_table_columns(table_metadata) else: check_query(sql) query = sql.replace(";", "") cols = get_query_columns(engine, query) # TODO allow validation that takes into account edge cases of pandas e.g. "[index]" # check_partition_column(partition_column, cols) # TODO partition_column isn't used; we need to use it; return list(cols.keys()), query def query_put_bounders(query, partition_column, start, end): # pragma: no cover """ Put partition boundaries into the query. Parameters ---------- query : str SQL query string. partition_column : str Column name used for data partitioning between the workers. start : int Lowest value to request from the `partition_column`. 
end : int Highest value to request from the `partition_column`. Returns ------- str Query string with boundaries. """ where = " WHERE TMP_TABLE.{0} >= {1} AND TMP_TABLE.{0} <= {2}".format( partition_column, start, end ) query_with_bounders = "SELECT * FROM ({0}) AS TMP_TABLE {1}".format(query, where) return query_with_bounders class InvalidArguments(Exception): """Exception that should be raised if invalid arguments combination was found.""" class InvalidQuery(Exception): """Exception that should be raised if invalid query statement was found.""" class InvalidPartitionColumn(Exception): """Exception that should be raised if `partition_column` doesn't satisfy predefined requirements.""" def read_sql_with_offset( partition_column, start, end, num_splits, sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, columns=None, chunksize=None, dtype_backend=lib.no_default, dtype=None, ): # pragma: no cover """ Read a chunk of SQL query or table into a pandas DataFrame. Parameters ---------- partition_column : str Column name used for data partitioning between the workers. start : int Lowest value to request from the `partition_column`. end : int Highest value to request from the `partition_column`. num_splits : int The number of partitions to split the column into. sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. con : SQLAlchemy connectable or str Connection to database (sqlite3 connections are not supported). index_col : str or list of str, optional Column(s) to set as index(MultiIndex). coerce_float : bool, default: True Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. params : list, tuple or dict, optional List of parameters to pass to ``execute`` method. The syntax used to pass parameters is database driver dependent. 
Check your database driver documentation for which of the five syntax styles, described in PEP 249's paramstyle, is supported. parse_dates : list or dict, optional The behavior is as follows: - List of column names to parse as dates. - Dict of `{column_name: format string}` where format string is strftime compatible in case of parsing string times, or is one of (D, s, ns, ms, us) in case of parsing integer timestamps. - Dict of `{column_name: arg dict}`, where the arg dict corresponds to the keyword arguments of ``pandas.to_datetime`` Especially useful with databases without native Datetime support, such as SQLite. columns : list, optional List of column names to select from SQL table (only used when reading a table). chunksize : int, optional If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. dtype_backend : {"numpy_nullable", "pyarrow"}, default: NumPy backed DataFrames Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays, nullable dtypes are used for all dtypes that have a nullable implementation when "numpy_nullable" is set, PyArrow is used for all dtypes if "pyarrow" is set. The dtype_backends are still experimential. dtype : Type name or dict of columns, optional Data type for data or columns. E.g. np.float64 or {'a': np.float64, 'b': np.int32, 'c': 'Int64'}. The argument is ignored if a table is passed instead of a query. Returns ------- list List with split read results and it's metadata (index, dtypes, etc.). 
""" query_with_bounders = query_put_bounders(sql, partition_column, start, end) pandas_df = pandas.read_sql( query_with_bounders, con, index_col=index_col, coerce_float=coerce_float, params=params, parse_dates=parse_dates, columns=columns, chunksize=chunksize, dtype_backend=dtype_backend, dtype=dtype, ) index = len(pandas_df) return _split_result_for_readers(1, num_splits, pandas_df) + [index] ================================================ FILE: modin/experimental/core/io/text/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental text format type IO functions implementations.""" ================================================ FILE: modin/experimental/core/io/text/csv_glob_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses `ExperimentalCSVGlobDispatcher` class, that is used for reading multiple `.csv` files simultaneously.""" import csv import glob import os import warnings from contextlib import ExitStack from typing import List, Tuple import fsspec import pandas import pandas._libs.lib as lib from pandas.io.common import is_fsspec_url, is_url, stringify_path from modin.config import NPartitions from modin.core.io.file_dispatcher import OpenFile from modin.core.io.text.csv_dispatcher import CSVDispatcher class ExperimentalCSVGlobDispatcher(CSVDispatcher): """Class contains utils for reading multiple `.csv` files simultaneously.""" @classmethod def _read(cls, filepath_or_buffer, **kwargs): """ Read data from multiple `.csv` files passed with `filepath_or_buffer` simultaneously. Parameters ---------- filepath_or_buffer : str, path object or file-like object `filepath_or_buffer` parameter of ``read_csv`` function. **kwargs : dict Parameters of ``read_csv`` function. Returns ------- new_query_compiler : BaseQueryCompiler Query compiler with imported data for further processing. """ # Ensures that the file is a string file path. Otherwise, default to pandas. filepath_or_buffer = cls.get_path_or_buffer(stringify_path(filepath_or_buffer)) if isinstance(filepath_or_buffer, str): # os.altsep == None on Linux is_folder = any( filepath_or_buffer.endswith(sep) for sep in (os.sep, os.altsep) if sep ) if "*" not in filepath_or_buffer and not is_folder: warnings.warn( "Shell-style wildcard '*' must be in the filename pattern in order to read multiple " + f"files at once. 
Did you forget it? Passed filename: '{filepath_or_buffer}'" ) if not cls.file_exists(filepath_or_buffer, kwargs.get("storage_options")): return cls.single_worker_read( filepath_or_buffer, reason=cls._file_not_found_msg(filepath_or_buffer), **kwargs, ) filepath_or_buffer = cls.get_path( filepath_or_buffer, kwargs.get("storage_options") ) elif not cls.pathlib_or_pypath(filepath_or_buffer): return cls.single_worker_read( filepath_or_buffer, reason=cls.BUFFER_UNSUPPORTED_MSG, **kwargs, ) # We read multiple csv files when the file path is a list of absolute file paths. We assume that all of the files will be essentially replicas of the # first file but with different data values. glob_filepaths = filepath_or_buffer filepath_or_buffer = filepath_or_buffer[0] compression_type = cls.infer_compression( filepath_or_buffer, kwargs.get("compression") ) chunksize = kwargs.get("chunksize") if chunksize is not None: return cls.single_worker_read( filepath_or_buffer, reason="`chunksize` parameter is not supported", **kwargs, ) skiprows = kwargs.get("skiprows") if skiprows is not None and not isinstance(skiprows, int): return cls.single_worker_read( filepath_or_buffer, reason="Non-integer `skiprows` value not supported", **kwargs, ) nrows = kwargs.pop("nrows", None) names = kwargs.get("names", lib.no_default) index_col = kwargs.get("index_col", None) usecols = kwargs.get("usecols", None) encoding = kwargs.get("encoding", None) if names in [lib.no_default, None]: # For the sake of the empty df, we assume no `index_col` to get the correct # column names before we build the index. Because we pass `names` in, this # step has to happen without removing the `index_col` otherwise it will not # be assigned correctly. 
names = pandas.read_csv( filepath_or_buffer, **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None), ).columns elif index_col is None and not usecols: # When names is set to some list that is smaller than the number of columns # in the file, the first columns are built as a hierarchical index. empty_pd_df = pandas.read_csv( filepath_or_buffer, nrows=0, encoding=encoding ) num_cols = len(empty_pd_df.columns) if num_cols > len(names): index_col = list(range(num_cols - len(names))) if len(index_col) == 1: index_col = index_col[0] kwargs["index_col"] = index_col pd_df_metadata = pandas.read_csv( filepath_or_buffer, **dict(kwargs, nrows=1, skipfooter=0) ) column_names = pd_df_metadata.columns skipfooter = kwargs.get("skipfooter", None) skiprows = kwargs.pop("skiprows", None) usecols_md = cls._validate_usecols_arg(usecols) if usecols is not None and usecols_md[1] != "integer": del kwargs["usecols"] all_cols = pandas.read_csv( filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0), ).columns usecols = all_cols.get_indexer_for(list(usecols_md[0])) parse_dates = kwargs.pop("parse_dates", False) partition_kwargs = dict( kwargs, header=None, names=names, skipfooter=0, skiprows=None, parse_dates=parse_dates, usecols=usecols, ) encoding = kwargs.get("encoding", None) quotechar = kwargs.get("quotechar", '"').encode( encoding if encoding is not None else "UTF-8" ) is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE with ExitStack() as stack: files = [ stack.enter_context( OpenFile( fname, "rb", compression_type, **(kwargs.get("storage_options", None) or {}), ) ) for fname in glob_filepaths ] # Skip the header since we already have the header information and skip the # rows we are told to skip. 
if isinstance(skiprows, int) or skiprows is None: if skiprows is None: skiprows = 0 header = kwargs.get("header", "infer") if header == "infer" and kwargs.get("names", lib.no_default) in [ lib.no_default, None, ]: skip_header = 1 elif isinstance(header, int): skip_header = header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skip_header = max(header) + 1 else: skip_header = 0 if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions column_widths, num_splits = cls._define_metadata( pd_df_metadata, column_names ) args = { "num_splits": num_splits, **partition_kwargs, } splits = cls.partitioned_file( files, glob_filepaths, num_partitions=NPartitions.get(), nrows=nrows, skiprows=skiprows, skip_header=skip_header, quotechar=quotechar, is_quoting=is_quoting, ) partition_ids = [None] * len(splits) index_ids = [None] * len(splits) dtypes_ids = [None] * len(splits) for idx, chunks in enumerate(splits): args.update({"chunks": chunks}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx] = cls.deploy( func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) # Compute the index based on a sum of the lengths of each partition (by default) # or based on the column(s) that were requested. if index_col is None: row_lengths = cls.materialize(index_ids) new_index = pandas.RangeIndex(sum(row_lengths)) else: index_objs = cls.materialize(index_ids) row_lengths = [len(o) for o in index_objs] new_index = index_objs[0].append(index_objs[1:]) new_index.name = pd_df_metadata.index.name partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths) # Compute dtypes by getting collecting and combining all of the partitions. The # reported dtypes from differing rows can be different based on the inference in # the limited data seen by each worker. We use pandas to compute the exact dtype # over the whole column for each column. The index is set below. 
dtypes = cls.get_dtypes(dtypes_ids, column_names) new_frame = cls.frame_cls( partition_ids, new_index, column_names, row_lengths, column_widths, dtypes=dtypes, ) new_query_compiler = cls.query_compiler_cls(new_frame) if skipfooter: new_query_compiler = new_query_compiler.drop( new_query_compiler.index[-skipfooter:] ) if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1: return new_query_compiler[new_query_compiler.columns[0]] if index_col is None: new_query_compiler._modin_frame.synchronize_labels(axis=0) return new_query_compiler @classmethod def file_exists(cls, file_path: str, storage_options=None) -> bool: """ Check if the `file_path` is valid. Parameters ---------- file_path : str String representing a path. storage_options : dict, optional Keyword from `read_*` functions. Returns ------- bool True if the path is valid. """ if is_url(file_path): raise NotImplementedError("`read_csv_glob` does not support urllib paths.") if not is_fsspec_url(file_path): return len(glob.glob(file_path)) > 0 try: from botocore.exceptions import ( ConnectTimeoutError, EndpointConnectionError, NoCredentialsError, ) credential_error_type = ( NoCredentialsError, PermissionError, EndpointConnectionError, ConnectTimeoutError, ) except ModuleNotFoundError: credential_error_type = (PermissionError,) if storage_options is not None: new_storage_options = dict(storage_options) new_storage_options.pop("anon", None) else: new_storage_options = {} fs, _ = fsspec.core.url_to_fs(file_path, **new_storage_options) exists = False try: exists = fs.exists(file_path) except credential_error_type: fs, _ = fsspec.core.url_to_fs(file_path, anon=True, **new_storage_options) exists = fs.exists(file_path) return exists or len(fs.glob(file_path)) > 0 @classmethod def get_path(cls, file_path: str, storage_options=None) -> list: """ Return the path of the file(s). Parameters ---------- file_path : str String representing a path. 
@classmethod
def partitioned_file(
    cls,
    files,
    fnames: List[str],
    num_partitions: int = None,
    nrows: int = None,
    skiprows: int = None,
    skip_header: int = None,
    quotechar: bytes = b'"',
    is_quoting: bool = True,
) -> List[List[Tuple[str, int, int]]]:
    """
    Compute chunk sizes in bytes for every partition.

    Parameters
    ----------
    files : file or list of files
        File(s) to be partitioned.
    fnames : str or list of str
        File name(s) to be partitioned; paired positionally with `files`.
    num_partitions : int, optional
        For what number of partitions split a file.
        If not specified grabs the value from `modin.config.NPartitions.get()`.
    nrows : int, optional
        Number of rows of file to read.
    skiprows : int, optional
        Specifies rows to skip.
    skip_header : int, optional
        Specifies header rows to skip.
    quotechar : bytes, default: b'"'
        Indicate quote in a file.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    list
        List, where each element of the list is a list of tuples. The inner lists
        of tuples contains the data file name of the chunk, chunk start offset, and
        chunk end offsets for its corresponding file.

    Notes
    -----
    The logic gets really complicated if we try to use the
    `TextFileDispatcher.partitioned_file`.
    """
    # Normalize both inputs: a lone file and a lone name must stay paired.
    # Without wrapping `fnames`, zipping a single file with a `str` name would
    # pair it with the first character of the name.
    if not isinstance(files, (list, tuple)):
        files = [files]
    if isinstance(fnames, str):
        fnames = [fnames]

    if num_partitions is None:
        num_partitions = NPartitions.get()

    file_sizes = [cls.file_size(f) for f in files]
    # Rows are the unit when `nrows` is given, bytes otherwise.
    partition_size = max(
        1, num_partitions, (nrows if nrows else sum(file_sizes)) // num_partitions
    )

    result = []
    split_result = []
    split_size = 0
    read_rows_counter = 0
    for f, fname, f_size in zip(files, fnames, file_sizes):
        if skiprows or skip_header:
            skip_amount = (skiprows if skiprows else 0) + (
                skip_header if skip_header else 0
            )

            # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df.
            outside_quotes, read_rows = cls._read_rows(
                f,
                nrows=skip_amount,
                quotechar=quotechar,
                is_quoting=is_quoting,
            )
            if skiprows:
                skiprows -= read_rows
                if skiprows > 0:
                    # We have more rows to skip than the amount read in the file.
                    continue

        start = f.tell()

        while f.tell() < f_size:
            if split_size >= partition_size:
                # Create a new split when the split has reached partition_size.
                # This is mainly used when we are reading row-wise partitioned files.
                result.append(split_result)
                split_result = []
                split_size = 0

            # We calculate the amount that we need to read based off of how much
            # of the split we have already read.
            read_size = partition_size - split_size

            if nrows:
                if read_rows_counter >= nrows:
                    # Finish when we have read enough rows.
                    if len(split_result) > 0:
                        # Add last split into the result.
                        result.append(split_result)
                    return result
                elif read_rows_counter + read_size > nrows:
                    # Ensure that we will not read more than nrows.
                    read_size = nrows - read_rows_counter

                outside_quotes, read_rows = cls._read_rows(
                    f,
                    nrows=read_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )
                split_size += read_rows
                read_rows_counter += read_rows
            else:
                outside_quotes = cls.offset(
                    f,
                    offset_size=read_size,
                    quotechar=quotechar,
                    is_quoting=is_quoting,
                )

            split_result.append((fname, start, f.tell()))
            split_size += f.tell() - start
            start = f.tell()

            # Add outside_quotes.
            if is_quoting and not outside_quotes:
                warnings.warn("File has mismatched quotes")

    # Add last split into the result.
    if len(split_result) > 0:
        result.append(split_result)

    return result
class ExperimentalCustomTextDispatcher(TextFileDispatcher):
    """Dispatcher that reads custom text files through a user-supplied parser."""

    @classmethod
    def _read(cls, filepath_or_buffer, columns, custom_parser, **kwargs):
        r"""
        Read data from `filepath_or_buffer` using the `read_custom_text` arguments.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of `read_custom_text` function.
        columns : list or callable(file-like object, \*\*kwargs) -> list
            Column names, or a callable that produces them from the opened
            file and the passed `kwargs`.
        custom_parser : callable(file-like object, \*\*kwargs) -> pandas.DataFrame
            Function applied on the workers to each part of the file, which is
            handed over as a file-like object.
        **kwargs : dict
            Parameters of `read_custom_text` function; ``compression``,
            ``is_quoting`` and ``nrows`` are read here.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if isinstance(filepath_or_buffer, str):
            source = cls.get_path(filepath_or_buffer)
        else:
            source = cls.get_path_or_buffer(filepath_or_buffer)
        compression = cls.infer_compression(filepath_or_buffer, kwargs["compression"])

        # Compute the byte ranges each task will read.
        with OpenFile(source, "rb", compression) as opened:
            partition_count = NPartitions.get()
            # `is_quoting` is removed from kwargs here, so it is not among the
            # options forwarded to `columns` or to the workers below.
            quoting_flag = kwargs.pop("is_quoting")
            splits, _ = cls.partitioned_file(
                opened,
                num_partitions=partition_count,
                is_quoting=quoting_flag,
                nrows=kwargs["nrows"],
            )

        # Column labels may have to be derived from the file itself.
        if callable(columns):
            with OpenFile(source, "rb", compression) as opened:
                columns = columns(opened, **kwargs)
        if not isinstance(columns, pandas.Index):
            columns = pandas.Index(columns)

        template_df = pandas.DataFrame(columns=columns)
        index_name = template_df.index.name
        column_widths, num_splits = cls._define_metadata(template_df, columns)

        # kwargs that will be passed to the workers
        worker_kwargs = {
            **kwargs,
            "fname": source,
            "num_splits": num_splits,
            "nrows": None,
            "compression": compression,
        }
        partition_ids, index_ids, dtypes_ids = cls._launch_tasks(
            splits, callback=custom_parser, **worker_kwargs
        )

        return cls._get_new_qc(
            partition_ids=partition_ids,
            index_ids=index_ids,
            dtypes_ids=dtypes_ids,
            index_col=None,
            index_name=index_name,
            column_widths=column_widths,
            column_names=columns,
            nrows=kwargs["nrows"],
        )
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Experimental functionality related to storage formats supported.""" ================================================ FILE: modin/experimental/core/storage_formats/pandas/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module represents the query compiler level for the pandas storage format (experimental).""" ================================================ FILE: modin/experimental/core/storage_formats/pandas/parsers.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
# NOTE: no literal docstrings on the class or on `parse`; both are decorated
# with `doc`, which is given the shared documentation templates.
@doc(_doc_pandas_parser_class, data_type="multiple CSV files simultaneously")
class ExperimentalPandasCSVGlobParser(PandasCSVParser):
    @staticmethod
    @doc(
        _doc_parse_func,
        parameters="""chunks : list
    List, where each element of the list is a list of tuples. The inner lists
    of tuples contains the data file name of the chunk, chunk start offset, and
    chunk end offsets for its corresponding file.""",
    )
    def parse(chunks, **kwargs):
        warnings.filterwarnings("ignore")
        num_splits = kwargs.pop("num_splits", None)
        index_col = kwargs.get("index_col", None)

        # `single_worker_read` just pass filename via chunks; need check
        if isinstance(chunks, str):
            return pandas.read_csv(chunks, **kwargs)

        # pop `compression` from kwargs because the in-memory buffer built
        # below is already uncompressed
        compression = kwargs.pop("compression", "infer")
        storage_options = kwargs.pop("storage_options", None) or {}
        # When an encoding is requested, the file's first line is prepended
        # to every chunk.
        prepend_first_line = kwargs.get("encoding", None) is not None

        frames = []
        for fname, start, end in chunks:
            if start is None or end is None:
                # This only happens when we are reading with only one worker (Default)
                return pandas.read_csv(
                    fname,
                    compression=compression,
                    storage_options=storage_options,
                    **kwargs,
                )

            with OpenFile(fname, "rb", compression, **storage_options) as bio:
                header = (b"" + bio.readline()) if prepend_first_line else b""
                bio.seek(start)
                payload = header + bio.read(end - start)
            frames.append(pandas.read_csv(BytesIO(payload), **kwargs))

        # Combine read in data.
        if not frames:
            pandas_df = pandas.DataFrame()
        elif len(frames) == 1:
            pandas_df = frames[0]
        else:
            pandas_df = pandas.concat(frames)

        # Set internal index: the real index when `index_col` was given,
        # otherwise just the length (it will become the RangeIndex).
        index = len(pandas_df) if index_col is None else pandas_df.index
        pieces = _split_result_for_readers(1, num_splits, pandas_df)
        return pieces + [index, pandas_df.dtypes]
# NOTE: documentation for the class and for `parse` is supplied through the
# `doc` decorator arguments, so no literal docstrings are written here.
@doc(_doc_pandas_parser_class, data_type="custom text")
class ExperimentalCustomTextParser(PandasParser):
    @staticmethod
    @doc(_doc_parse_func, parameters=_doc_parse_parameters_common)
    def parse(fname, **kwargs):
        # Nothing format-specific happens here: the work is delegated to the
        # generic routine of the base parser with the arguments untouched.
        parsed = PandasParser.generic_parse(fname, **kwargs)
        return parsed
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ The main module through which interaction with the experimental API takes place. See `Experimental API Reference` for details. Notes ----- * Some of experimental APIs deviate from pandas in order to provide improved performance. * Although the use of experimental storage formats and engines is available through the `modin.pandas` module when defining environment variable `MODIN_EXPERIMENTAL=true`, the use of experimental I/O functions is available only through the `modin.experimental.pandas` module. Examples -------- >>> import modin.experimental.pandas as pd >>> df = pd.read_csv_glob("data*.csv") """ from modin.pandas import * # noqa F401, F403 from .io import ( # noqa F401 read_csv_glob, read_custom_text, read_json_glob, read_parquet_glob, read_pickle_glob, read_sql, read_xml_glob, ) ================================================ FILE: modin/experimental/pandas/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    dtype_backend=lib.no_default,
    dtype=None,
    partition_column: Optional[str] = None,
    lower_bound: Optional[int] = None,
    upper_bound: Optional[int] = None,
    max_sessions: Optional[int] = None,
) -> Union[DataFrame, Iterator[DataFrame]]:
    """
    General documentation is available in `modin.pandas.read_sql`.

    This experimental feature provides distributed reading from a SQL source.
    The function is extended with Spark-like parameters such as
    ``partition_column``, ``lower_bound`` and ``upper_bound``, which let the
    user specify how the imported data is partitioned.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, or sqlite3 connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. If a DBAPI2 object, only sqlite3 is supported. The user is
        responsible for engine disposal and connection closure for the
        SQLAlchemy connectable; str connections are closed automatically.
    index_col : str or list of str, optional
        Column(s) to set as index(MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects (like
        decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to execute method. The syntax used to pass
        parameters is database driver dependent. Check your database driver
        documentation for which of the five syntax styles, described in
        PEP 249's paramstyle, is supported. Eg. for psycopg2, uses %(name)s so
        use params={'name' : 'value'}.
    parse_dates : list or dict, optional
        - List of column names to parse as dates.
        - Dict of ``{column_name: format string}`` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
          to the keyword arguments of :func:`pandas.to_datetime`.
          Especially useful with databases without native Datetime support,
          such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading
        a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number of
        rows to include in each chunk.
    dtype_backend : {"numpy_nullable", "pyarrow"}, default: NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, PyArrow is used for all
        dtypes if "pyarrow" is set. The dtype_backends are still experimental.
    dtype : Type name or dict of columns, optional
        Data type for data or columns. E.g. np.float64 or
        {'a': np.float64, 'b': np.int32, 'c': 'Int64'}. The argument is
        ignored if a table is passed instead of a query.
    partition_column : str, optional
        Column used to share the data between the workers (MUST be an INTEGER
        column).
    lower_bound : int, optional
        The minimum value to be requested from the partition_column.
    upper_bound : int, optional
        The maximum value to be requested from the partition_column.
    max_sessions : int, optional
        The maximum number of simultaneous connections allowed to use.

    Returns
    -------
    modin.DataFrame or Iterator[modin.DataFrame]
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Every argument is forwarded to the dispatcher under its own name.
    result = FactoryDispatcher.read_sql_distributed(
        sql=sql,
        con=con,
        index_col=index_col,
        coerce_float=coerce_float,
        params=params,
        parse_dates=parse_dates,
        columns=columns,
        chunksize=chunksize,
        dtype_backend=dtype_backend,
        dtype=dtype,
        partition_column=partition_column,
        lower_bound=lower_bound,
        upper_bound=upper_bound,
        max_sessions=max_sessions,
    )
    if not isinstance(result, BaseQueryCompiler):
        # Anything that is not a single query compiler is treated as an
        # iterable of them; wrap each one lazily.
        return (DataFrame(query_compiler=qc) for qc in result)
    return DataFrame(query_compiler=result)
def _make_parser_func(sep: str, funcname: str) -> Callable:
    """
    Build a `read_csv`-like parser function that dispatches to `_read`.

    Parameters
    ----------
    sep : str
        The separator default to use for the parser.
        NOTE(review): the generated function declares its own ``sep`` keyword,
        which shadows this argument, so this value is not read anywhere in
        the body below.
    funcname : str
        The name of the generated parser function.

    Returns
    -------
    Callable
        The parser function, wrapped by ``expanduser_path_arg`` for its
        ``filepath_or_buffer`` argument.
    """

    def parser_func(
        filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
        *,
        sep=lib.no_default,
        delimiter=None,
        header="infer",
        names=lib.no_default,
        index_col=None,
        usecols=None,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        skipfooter=0,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=lib.no_default,
        skip_blank_lines=True,
        parse_dates=None,
        infer_datetime_format=lib.no_default,
        keep_date_col=lib.no_default,
        date_parser=lib.no_default,
        date_format=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal: str = ".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        encoding_errors="strict",
        dialect=None,
        on_bad_lines="error",
        doublequote=True,
        delim_whitespace=lib.no_default,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options: StorageOptions = None,
        dtype_backend=lib.no_default,
    ) -> DataFrame:
        # The docstring is assigned after the definition (copied from `_read`),
        # so none is written here.
        # ISSUE #2408: parse parameter shared with pandas read_csv and read_table
        # and update with provided args
        frame_locals = inspect.getargvalues(inspect.currentframe()).locals
        accepted = set(inspect.signature(pandas.read_csv).parameters)
        # A literal `False` separator is replaced by a tab.
        if frame_locals.get("sep", sep) is False:
            frame_locals["sep"] = "\t"
        # Only names that `pandas.read_csv` itself accepts are forwarded; this
        # also filters out the helper locals defined in this function.
        forwarded = {
            name: value for name, value in frame_locals.items() if name in accepted
        }
        return _read(**forwarded)

    parser_func.__doc__ = _read.__doc__
    parser_func.__name__ = funcname
    wrap = expanduser_path_arg("filepath_or_buffer")
    return wrap(parser_func)
@expanduser_path_arg("filepath_or_buffer")
def read_pickle_glob(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled pandas object from files.

    This experimental feature provides parallel reading from multiple pickle
    files which are defined by glob pattern. The files must contain parts of
    one dataframe, which can be obtained, for example, by
    `DataFrame.modin.to_pickle_glob` function.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path, URL, or buffer where the pickled object will be loaded from.
        Accept URL. URL is not limited to S3 and GCS.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and 'path_or_url' is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and 'path_or_url' is not path-like, then use
        None (= no decompression).
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed
        by fsspec, e.g., starting "s3://", "gcs://". An error will be raised if
        providing this argument with a non-fsspec URL. See the fsspec and backend
        storage implementation docs for the set of allowed keys and values.

    Returns
    -------
    DataFrame
        Frame wrapping the query compiler produced by the dispatcher.

    Notes
    -----
    The number of partitions is equal to the number of input files.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Each argument is passed on to the dispatcher under its own name.
    query_compiler = FactoryDispatcher.read_pickle_glob(
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        storage_options=storage_options,
    )
    return DataFrame(query_compiler=query_compiler)
@expanduser_path_arg("path")
def read_parquet_glob(
    path,
    engine: str = "auto",
    columns: list[str] | None = None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = lib.no_default,
    dtype_backend=lib.no_default,
    filesystem=None,
    filters=None,
    **kwargs,
) -> DataFrame:  # noqa: PR01
    """
    Load a parquet object from the file path, returning a DataFrame.

    This experimental feature provides parallel reading from multiple parquet
    files which are defined by glob pattern. The files must contain parts of one
    dataframe, which can be obtained, for example, by
    `DataFrame.modin.to_parquet_glob` function.

    Returns
    -------
    DataFrame

    Notes
    -----
    * Only string type supported for `path` argument.
    * The rest of the arguments are the same as for `pandas.read_parquet`.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Named options first, then whatever extra keywords the caller supplied.
    reader_options = {
        "path": path,
        "engine": engine,
        "columns": columns,
        "storage_options": storage_options,
        "use_nullable_dtypes": use_nullable_dtypes,
        "dtype_backend": dtype_backend,
        "filesystem": filesystem,
        "filters": filters,
    }
    query_compiler = FactoryDispatcher.read_parquet_glob(**reader_options, **kwargs)
    return DataFrame(query_compiler=query_compiler)
This experimental feature provides parallel writing into multiple parquet files which are defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used. Notes ----- * Only string type supported for `path` argument. * The rest of the arguments are the same as for `pandas.to_parquet`. """ obj = self from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if isinstance(self, DataFrame): obj = self._query_compiler FactoryDispatcher.to_parquet_glob( obj, path=path, engine=engine, compression=compression, index=index, partition_cols=partition_cols, storage_options=storage_options, **kwargs, ) @expanduser_path_arg("path_or_buf") def read_json_glob( path_or_buf, *, orient: str | None = None, typ: Literal["frame", "series"] = "frame", dtype: DtypeArg | None = None, convert_axes=None, convert_dates: bool | list[str] = True, keep_default_dates: bool = True, precise_float: bool = False, date_unit: str | None = None, encoding: str | None = None, encoding_errors: str | None = "strict", lines: bool = False, chunksize: int | None = None, compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, lib.NoDefault] = lib.no_default, engine="ujson", ) -> DataFrame: # noqa: PR01 """ Convert a JSON string to pandas object. This experimental feature provides parallel reading from multiple json files which are defined by glob pattern. The files must contain parts of one dataframe, which can be obtained, for example, by `DataFrame.modin.to_json_glob` function. Returns ------- DataFrame Notes ----- * Only string type supported for `path_or_buf` argument. * The rest of the arguments are the same as for `pandas.read_json`. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if nrows is not None: raise NotImplementedError( "`read_json_glob` only support nrows is None, otherwise use `to_json`." 
) return DataFrame( query_compiler=FactoryDispatcher.read_json_glob( path_or_buf=path_or_buf, orient=orient, typ=typ, dtype=dtype, convert_axes=convert_axes, convert_dates=convert_dates, keep_default_dates=keep_default_dates, precise_float=precise_float, date_unit=date_unit, encoding=encoding, encoding_errors=encoding_errors, lines=lines, chunksize=chunksize, compression=compression, nrows=nrows, storage_options=storage_options, dtype_backend=dtype_backend, engine=engine, ) ) @expanduser_path_arg("path_or_buf") def to_json_glob( self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit="ms", default_handler=None, lines=False, compression="infer", index=None, indent=None, storage_options: StorageOptions = None, mode="w", ) -> None: # noqa: PR01 """ Convert the object to a JSON string. Notes ----- * Only string type supported for `path_or_buf` argument. * The rest of the arguments are the same as for `pandas.to_json`. """ obj = self from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if isinstance(self, DataFrame): obj = self._query_compiler FactoryDispatcher.to_json_glob( obj, path_or_buf=path_or_buf, orient=orient, date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, default_handler=default_handler, lines=lines, compression=compression, index=index, indent=indent, storage_options=storage_options, mode=mode, ) @expanduser_path_arg("path_or_buffer") def read_xml_glob( path_or_buffer, *, xpath="./*", namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding="utf-8", parser="lxml", stylesheet=None, iterparse=None, compression="infer", storage_options: StorageOptions = None, dtype_backend=lib.no_default, ) -> DataFrame: # noqa: PR01 """ Read XML document into a DataFrame object. 
This experimental feature provides parallel reading from multiple XML files which are defined by glob pattern. The files must contain parts of one dataframe, which can be obtained, for example, by `DataFrame.modin.to_xml_glob` function. Returns ------- DataFrame Notes ----- * Only string type supported for `path_or_buffer` argument. * The rest of the arguments are the same as for `pandas.read_xml`. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return DataFrame( query_compiler=FactoryDispatcher.read_xml_glob( path_or_buffer=path_or_buffer, xpath=xpath, namespaces=namespaces, elems_only=elems_only, attrs_only=attrs_only, names=names, dtype=dtype, converters=converters, parse_dates=parse_dates, encoding=encoding, parser=parser, stylesheet=stylesheet, iterparse=iterparse, compression=compression, storage_options=storage_options, dtype_backend=dtype_backend, ) ) @expanduser_path_arg("path_or_buffer") def to_xml_glob( self, path_or_buffer=None, index=True, root_name="data", row_name="row", na_rep=None, attr_cols=None, elem_cols=None, namespaces=None, prefix=None, encoding="utf-8", xml_declaration=True, pretty_print=True, parser="lxml", stylesheet=None, compression="infer", storage_options=None, ) -> None: # noqa: PR01 """ Render a DataFrame to an XML document. Notes ----- * Only string type supported for `path_or_buffer` argument. * The rest of the arguments are the same as for `pandas.to_xml`. 
""" obj = self from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if isinstance(self, DataFrame): obj = self._query_compiler FactoryDispatcher.to_xml_glob( obj, path_or_buffer=path_or_buffer, index=index, root_name=root_name, row_name=row_name, na_rep=na_rep, attr_cols=attr_cols, elem_cols=elem_cols, namespaces=namespaces, prefix=prefix, encoding=encoding, xml_declaration=xml_declaration, pretty_print=pretty_print, parser=parser, stylesheet=stylesheet, compression=compression, storage_options=storage_options, ) ================================================ FILE: modin/experimental/sklearn/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module holds experimental scikit-learn specific functionality for Modin.""" ================================================ FILE: modin/experimental/sklearn/model_selection/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
"""Module holds `train_test_split` function."""


def train_test_split(df, train_size=0.75, **options):
    """
    Split input data to train and test data.

    The first ``int(len(df) * train_size)`` rows (by position) form the train
    part and the remaining rows form the test part; no shuffling is done.

    Parameters
    ----------
    df : modin.pandas.DataFrame / modin.pandas.Series
        Data to split.
    train_size : float, default: 0.75
        Fraction of the rows that goes into the train part.
    **options : dict
        Additional keyword arguments. They are accepted for backward
        compatibility with the former ``**options``-only signature and ignored.

    Returns
    -------
    tuple
        A pair of modin.pandas.DataFrame / modin.pandas.Series.
    """
    split_at = int(len(df) * train_size)
    train = df.iloc[:split_at]
    test = df.iloc[split_at:]
    return train, test
def from_dataframe(
    dataframe,
    show_toolbar=None,
    show_history=None,
    precision=None,
    grid_options=None,
    column_options=None,
    column_definitions=None,
    row_edit_callback=None,
):
    """
    Render a Modin DataFrame as an interactive spreadsheet.

    The spreadsheet is created by ``modin_spreadsheet.show_grid``; `dataframe`
    and every other argument are forwarded to it positionally and unchanged.
    Only Modin DataFrames are accepted: any other object (a Series included)
    is rejected with a ``TypeError`` before ``show_grid`` is called.

    Parameters
    ----------
    dataframe : DataFrame
        The DataFrame that will be displayed by the spreadsheet widget.
    show_toolbar : bool
        Whether to show a toolbar with options for adding/removing rows.
        Adding/removing rows is an experimental feature which only works
        with DataFrames that have an integer index.
    show_history : bool
        Whether to show the cell containing the spreadsheet transformation history.
    precision : integer
        The number of digits of precision to display for floating-point
        values. If unset, the value of `pandas.get_option('display.precision')`
        is used.
    grid_options : dict
        Options to use when creating the SlickGrid control (i.e. the
        interactive grid). See the Notes section below for the available
        options and the defaults used by the widget.
    column_options : dict
        Column options that are to be applied to every column. See the
        Notes section below for the available options and the defaults used
        by the widget.
    column_definitions : dict
        Column options that are to be applied to individual columns.
        The keys of the dict should be the column names, and each value
        should be the column options for a particular column, represented
        as a dict. The available options for each column are the same options
        that can be set for all columns via the ``column_options`` parameter.
    row_edit_callback : callable
        A callable that is called to determine whether a particular row
        should be editable or not. Its signature should be
        ``callable(row)``, where ``row`` is a dictionary which contains a
        particular row's values, keyed by column name. The callback should
        return True if the provided row should be editable, and False
        otherwise.

    Returns
    -------
    SpreadsheetWidget
        Whatever ``show_grid`` returns for the given arguments.

    Raises
    ------
    TypeError
        If `dataframe` is not a Modin DataFrame.

    Notes
    -----
    The following dictionary is used for ``grid_options`` if none are
    provided explicitly::

        {
            # SlickGrid options
            'fullWidthRows': True,
            'syncColumnCellResize': True,
            'forceFitColumns': False,
            'defaultColumnWidth': 150,
            'rowHeight': 28,
            'enableColumnReorder': False,
            'enableTextSelectionOnCells': True,
            'editable': True,
            'autoEdit': False,
            'explicitInitialization': True,

            # Modin-spreadsheet options
            'maxVisibleRows': 15,
            'minVisibleRows': 8,
            'sortable': True,
            'filterable': True,
            'highlightSelectedCell': False,
            'highlightSelectedRow': True
        }

    The first group of options are SlickGrid "grid options" which are
    described in the `SlickGrid documentation
    <https://github.com/mleibman/SlickGrid/wiki/Grid-Options>`__.

    The second group of options was added specifically for
    modin-spreadsheet and therefore is not documented in the SlickGrid
    documentation:

    * **maxVisibleRows**
        The maximum number of rows that modin-spreadsheet will show.
    * **minVisibleRows**
        The minimum number of rows that modin-spreadsheet will show.
    * **sortable**
        Whether the modin-spreadsheet instance will allow the user to sort
        columns by clicking the column headers. When this is set to ``False``,
        nothing will happen when users click the column headers.
    * **filterable**
        Whether the modin-spreadsheet instance will allow the user to filter
        the grid. When this is set to ``False`` the filter icons won't be
        shown for any columns.
    * **highlightSelectedCell**
        If you set this to True, the selected cell will be given
        a light blue border.
    * **highlightSelectedRow**
        If you set this to False, the light blue background that's shown by
        default for selected rows will be hidden.

    The following dictionary is used for ``column_options`` if none are
    provided explicitly::

        {
            # SlickGrid column options
            'defaultSortAsc': True,
            'maxWidth': None,
            'minWidth': 30,
            'resizable': True,
            'sortable': True,
            'toolTip': "",
            'width': None

            # Modin-spreadsheet column options
            'editable': True,
        }

    The first group of options are SlickGrid "column options" which are
    described in the `SlickGrid documentation
    <https://github.com/mleibman/SlickGrid/wiki/Column-Options>`__.

    The ``editable`` option was added specifically for modin-spreadsheet and
    therefore is not documented in the SlickGrid documentation. This option
    specifies whether a column should be editable or not.

    See Also
    --------
    set_defaults : Permanently set global defaults for the parameters
                   of ``show_grid``, with the exception of the ``dataframe``
                   and ``column_definitions`` parameters, since those
                   depend on the particular set of data being shown by an
                   instance, and therefore aren't parameters we would want
                   to set for all SpreadsheetWidget instances.
    set_grid_option : Permanently set global defaults for individual
                      grid options. Does so by changing the defaults
                      that the ``show_grid`` method uses for the
                      ``grid_options`` parameter.
    SpreadsheetWidget : The widget class that is instantiated and returned by
                        this method.
    """
    if isinstance(dataframe, pd.DataFrame):
        # Order matters: the options are handed to `show_grid` positionally.
        widget_options = (
            show_toolbar,
            show_history,
            precision,
            grid_options,
            column_options,
            column_definitions,
            row_edit_callback,
        )
        return show_grid(dataframe, *widget_options)
    raise TypeError("dataframe must be modin.DataFrame, not %s" % type(dataframe))


def to_dataframe(spreadsheet):
    """
    Get the DataFrame behind a spreadsheet widget in its current state.

    The result is whatever ``spreadsheet.get_changed_df()`` returns. According
    to the original documentation this is a copy of the DataFrame that reflects
    the current state of the widget UI, including any sorting or filtering
    changes as well as edits made by double clicking cells.

    Parameters
    ----------
    spreadsheet : SpreadsheetWidget
        The SpreadsheetWidget instance whose DataFrame should be returned.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If `spreadsheet` is not a ``modin_spreadsheet.SpreadsheetWidget``.
    """
    if isinstance(spreadsheet, SpreadsheetWidget):
        return spreadsheet.get_changed_df()
    raise TypeError(
        "spreadsheet must be modin_spreadsheet.SpreadsheetWidget, not %s"
        % type(spreadsheet)
    )
class ModinDataLoader:
    """Batch the rows of a DataFrame into NumPy arrays in the order produced by a sampler."""

    def __init__(
        self,
        df: DataFrame | ModinDataFrame,
        batch_size: int = 1,
        features: Sequence[Hashable] = (),
        sampler: Type[Sampler] | Sampler = SequentialSampler,
    ) -> None:
        """
        Wrap a pandas/Modin DataFrame into an iterable of row batches.

        NOTE: This function should eventually go into modin/utils.py.

        Parameters
        ----------
        df : DataFrame
            Data to iterate over.
        batch_size : int, default: 1
            Number of rows in every batch except, possibly, the last one.
        features : Sequence[Hashable], default: ()
            If specified, only these features will be used.
        sampler : Type[Sampler] | Sampler, default: SequentialSampler
            The sampler to use. By default, iterates over the DataFrame in order.
            A sampler class is instantiated with the (possibly column-filtered)
            DataFrame as its only argument; a sampler instance is used as is.

        Raises
        ------
        ValueError
            If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"`batch_size` must be at least 1, got {batch_size}.")

        if features:
            # ``df[(a, b)]`` looks up the single key ``(a, b)``, so any
            # non-list sequence of labels is converted to a list first.
            # Strings and lists keep their original meaning.
            if not isinstance(features, (str, list)):
                features = list(features)
            df = df[features]

        if isinstance(sampler, type):
            sampler = sampler(df)

        self._df = df
        self._batch_size = batch_size
        self._sampler = sampler

    def __len__(self):
        """
        Get the number of batches.

        Returns
        -------
        int
        """
        # Sampler length is always valid.
        return math.ceil(len(self._sampler) / self._batch_size)

    def __iter__(self):
        """
        Iterate over the batches.

        Yields
        ------
        numpy.ndarray
            Rows selected positionally by consecutive sampler indices.
        """
        idx_buffer = []
        for cnt, idx in enumerate(self._sampler):
            idx_buffer.append(idx)
            if self._end_of_batch(cnt):
                yield self._df.iloc[idx_buffer].to_numpy()
                idx_buffer = []

    def _end_of_batch(self, counter: int):
        """
        Check whether the sample with the given ordinal closes a batch.

        Parameters
        ----------
        counter : int
            Zero-based ordinal of the sample within the current iteration.

        Returns
        -------
        bool
            True if the batch is full or the sampler is exhausted.
        """
        return (
            counter % self._batch_size == self._batch_size - 1
            or counter == len(self._sampler) - 1
        )
class RabitContextManager:
    """
    A manager class that controls lifecycle of `xgb.RabitTracker`.

    All workers that are used for distributed training will connect to
    Rabit Tracker stored in this class.

    Parameters
    ----------
    num_workers : int
        Number of workers of `self.rabit_tracker`.
    host_ip : str
        IP address of host that creates `self` object.
    """

    # TODO: Specify type of host_ip
    def __init__(self, num_workers: int, host_ip):
        self._num_workers = num_workers
        self.env = {"DMLC_NUM_WORKER": self._num_workers}
        self.rabit_tracker = xgb.RabitTracker(
            host_ip=host_ip, n_workers=self._num_workers
        )

    def __enter__(self):
        """
        Entry point of manager.

        Updates Rabit Tracker environment, starts `self.rabit_tracker`.

        Returns
        -------
        dict
            Dict with Rabit Tracker environment.
        """
        self.env.update(self.rabit_tracker.worker_envs())
        self.rabit_tracker.start(self._num_workers)
        return self.env

    # TODO: (type, value, traceback) -> *args
    def __exit__(self, type, value, traceback):
        """
        Exit point of manager.

        Finishes `self.rabit_tracker`.

        Parameters
        ----------
        type : exception type
            Type of exception, captured by manager.
        value : Exception
            Exception value.
        traceback : TracebackType
            Traceback of exception.
        """
        self.rabit_tracker.join()


class RabitContext:
    """
    Context to connect a worker to a rabit tracker.

    Parameters
    ----------
    actor_rank : int
        Rank of actor, connected to this context.
    args : list
        List with environment variables for Rabit Tracker. The list is copied,
        the caller's object is left untouched.
    """

    def __init__(self, actor_rank, args):
        task_id = ("DMLC_TASK_ID=[modin.xgboost]:" + str(actor_rank)).encode()
        # Build a new list instead of appending in place: reusing one `args`
        # list for several contexts must not accumulate task ids in it.
        self.args = [*args, task_id]

    def __enter__(self):
        """
        Entry point of context.

        Connects to Rabit Tracker.
        """
        xgb.rabit.init(self.args)
        LOGGER.info("-------------- rabit started ------------------")

    def __exit__(self, *args):
        """
        Exit point of context.

        Disconnects from Rabit Tracker.

        Parameters
        ----------
        *args : iterable
            Parameters for Exception capturing.
        """
        xgb.rabit.finalize()
        LOGGER.info("-------------- rabit finished ------------------")
"""Module holds public interfaces for work Modin XGBoost.""" import logging from typing import Dict, Optional import xgboost as xgb import modin.pandas as pd from modin.config import Engine from modin.distributed.dataframe.pandas import unwrap_partitions LOGGER = logging.getLogger("[modin.xgboost]") class DMatrix: """ DMatrix holds references to partitions of Modin DataFrame. On init stage unwrapping partitions of Modin DataFrame is started. Parameters ---------- data : modin.pandas.DataFrame Data source of DMatrix. label : modin.pandas.DataFrame or modin.pandas.Series, optional Labels used for training. missing : float, optional Value in the input data which needs to be present as a missing value. If ``None``, defaults to ``np.nan``. silent : boolean, optional Whether to print messages during construction or not. feature_names : list, optional Set names for features. feature_types : list, optional Set types for features. feature_weights : array_like, optional Set feature weights for column sampling. enable_categorical : boolean, optional Experimental support of specializing for categorical features. Notes ----- Currently DMatrix doesn't support `weight`, `base_margin`, `nthread`, `group`, `qid`, `label_lower_bound`, `label_upper_bound` parameters. """ def __init__( self, data, label=None, missing=None, silent=False, feature_names=None, feature_types=None, feature_weights=None, enable_categorical=None, ): assert isinstance( data, pd.DataFrame ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}." if label is not None: assert isinstance( label, (pd.DataFrame, pd.Series) ), f"Type of `data` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}." 
self.label = unwrap_partitions(label, axis=0) else: self.label = None self.data = unwrap_partitions(data, axis=0, get_ip=True) self._n_rows = data.shape[0] self._n_cols = data.shape[1] for i, dtype in enumerate(data.dtypes): if dtype == "object": raise ValueError(f"Column {i} has unsupported data type {dtype}.") self.feature_names = feature_names self.feature_types = feature_types self.missing = missing self.silent = silent self.feature_weights = feature_weights self.enable_categorical = enable_categorical self.metadata = ( data.index, data.columns, data._query_compiler._modin_frame.row_lengths, ) def __iter__(self): """ Return unwrapped `self.data` and `self.label`. Yields ------ list List of `self.data` with pairs of references to IP of row partition and row partition [(IP_ref0, partition_ref0), ..]. list List of `self.label` with references to row partitions [partition_ref0, ..]. """ yield self.data yield self.label def get_dmatrix_params(self): """ Get dict of DMatrix parameters excluding `self.data`/`self.label`. Returns ------- dict """ dmatrix_params = { "feature_names": self.feature_names, "feature_types": self.feature_types, "missing": self.missing, "silent": self.silent, "feature_weights": self.feature_weights, "enable_categorical": self.enable_categorical, } return dmatrix_params @property def feature_names(self): """ Get column labels. Returns ------- Column labels. """ return self._feature_names @feature_names.setter def feature_names(self, feature_names): """ Set column labels. Parameters ---------- feature_names : list or None Labels for columns. In the case of ``None``, existing feature names will be reset. 
""" if feature_names is not None: feature_names = ( list(feature_names) if not isinstance(feature_names, str) else [feature_names] ) if len(feature_names) != len(set(feature_names)): raise ValueError("Items in `feature_names` must be unique.") if len(feature_names) != self.num_col() and self.num_col() != 0: raise ValueError( "`feature_names` must have the same width as `self.data`." ) if not all( isinstance(f, str) and not any(x in f for x in set(("[", "]", "<"))) for f in feature_names ): raise ValueError( "Items of `feature_names` must be string and must not contain [, ] or <." ) else: feature_names = None self._feature_names = feature_names @property def feature_types(self): """ Get column types. Returns ------- Column types. """ return self._feature_types @feature_types.setter def feature_types(self, feature_types): """ Set column types. Parameters ---------- feature_types : list or None Labels for columns. In case None, existing feature names will be reset. """ if feature_types is not None: if not isinstance(feature_types, (list, str)): raise TypeError("feature_types must be string or list of strings") if isinstance(feature_types, str): feature_types = [feature_types] * self.num_col() feature_types = ( list(feature_types) if not isinstance(feature_types, str) else [feature_types] ) else: feature_types = None self._feature_types = feature_types def num_row(self): """ Get number of rows. Returns ------- int """ return self._n_rows def num_col(self): """ Get number of columns. Returns ------- int """ return self._n_cols def get_float_info(self, name): """ Get float property from the DMatrix. Parameters ---------- name : str The field name of the information. Returns ------- A NumPy array of float information of the data. """ return getattr(self, name) def set_info( self, *, label=None, feature_names=None, feature_types=None, feature_weights=None, ) -> None: """ Set meta info for DMatrix. 
class Booster(xgb.Booster):
    """
    A Modin Booster of XGBoost.

    Subclass of ``xgb.Booster`` whose ``predict`` takes a Modin ``DMatrix``
    and hands the work over to the engine-specific implementation.

    Parameters
    ----------
    params : dict, optional
        Parameters for boosters.
    cache : list, default: empty
        List of cache items.
    model_file : string/os.PathLike/xgb.Booster/bytearray, optional
        Path to the model file if it's string or PathLike or xgb.Booster.
    """

    def __init__(self, params=None, cache=(), model_file=None):  # noqa: MD01
        super().__init__(params=params, cache=cache, model_file=model_file)

    def predict(self, data: DMatrix, **kwargs):
        """
        Run distributed prediction with a trained booster.

        The call is forwarded to the Ray implementation, which receives a
        copy of this booster together with `data`.

        Parameters
        ----------
        data : modin.experimental.xgboost.DMatrix
            Input data used for prediction.
        **kwargs : dict
            Other parameters are the same as for ``xgboost.Booster.predict``.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray, or if both the booster and
            `data` carry feature names and those names differ.
        """
        LOGGER.info("Prediction started")

        # Guard clause: only the Ray backend provides an implementation.
        if Engine.get() != "Ray":
            raise ValueError("Current version supports only Ray engine.")
        from .xgboost_ray import _predict

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        model_names = self.feature_names
        # Only look at the input names when the model has names to compare with.
        input_names = None if model_names is None else data.feature_names
        if input_names is not None and model_names != input_names:
            absent_in_data = set(model_names) - set(input_names)
            absent_in_model = set(input_names) - set(model_names)

            pieces = ["feature_names mismatch: {0} {1}"]
            if absent_in_data:
                pieces.append(
                    "\nexpected " + ", ".join(map(str, absent_in_data)) + " in input data"
                )
            if absent_in_model:
                pieces.append(
                    "\ntraining data did not have the following fields: "
                    + ", ".join(map(str, absent_in_model))
                )

            # The template is formatted only after all pieces were appended.
            raise ValueError("".join(pieces).format(model_names, input_names))

        predictions = _predict(self.copy(), data, **kwargs)
        LOGGER.info("Prediction finished")

        return predictions
""" LOGGER.info("Training started") if Engine.get() == "Ray": from .xgboost_ray import _train else: raise ValueError("Current version supports only Ray engine.") assert isinstance( dtrain, DMatrix ), f"Type of `dtrain` is {type(dtrain)}, but expected {DMatrix}." result = _train(dtrain, params, *args, num_actors=num_actors, evals=evals, **kwargs) if isinstance(evals_result, dict): evals_result.update(result["history"]) LOGGER.info("Training finished") return Booster(model_file=result["booster"]) ================================================ FILE: modin/experimental/xgboost/xgboost_ray.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module holds internal entities for Modin XGBoost on Ray engine. Class ModinXGBoostActor provides interfaces to run XGBoost operations on remote workers. Other functions create Ray actors, distribute data between them, etc. 
""" import logging import math import time import warnings from collections import defaultdict from typing import Dict, List import numpy as np import pandas import ray import xgboost as xgb from ray.util import get_node_ip_address from modin.core.execution.ray.common import RayWrapper from modin.distributed.dataframe.pandas import from_partitions from .utils import RabitContext, RabitContextManager LOGGER = logging.getLogger("[modin.xgboost]") @ray.remote(num_cpus=0) class ModinXGBoostActor: """ Ray actor-class runs training on the remote worker. Parameters ---------- rank : int Rank of this actor. nthread : int Number of threads used by XGBoost in this actor. """ def __init__(self, rank, nthread): self._evals = [] self._rank = rank self._nthreads = nthread LOGGER.info( f"Actor <{self._rank}>, nthread = {self._nthreads} was initialized." ) def _get_dmatrix(self, X_y, **dmatrix_kwargs): """ Create xgboost.DMatrix from sequence of pandas.DataFrame objects. First half of `X_y` should contains objects for `X`, second for `y`. Parameters ---------- X_y : list List of pandas.DataFrame objects. **dmatrix_kwargs : dict Keyword parameters for ``xgb.DMatrix``. Returns ------- xgb.DMatrix A XGBoost DMatrix. """ s = time.time() X = X_y[: len(X_y) // 2] y = X_y[len(X_y) // 2 :] assert ( len(X) == len(y) and len(X) > 0 ), "X and y should have the equal length more than 0" X = pandas.concat(X, axis=0) y = pandas.concat(y, axis=0) LOGGER.info(f"Concat time: {time.time() - s} s") return xgb.DMatrix(X, y, nthread=self._nthreads, **dmatrix_kwargs) def set_train_data(self, *X_y, add_as_eval_method=None, **dmatrix_kwargs): """ Set train data for actor. Parameters ---------- *X_y : iterable Sequence of ray.ObjectRef objects. First half of sequence is for `X` data, second for `y`. When it is passed in actor, auto-materialization of ray.ObjectRef -> pandas.DataFrame happens. add_as_eval_method : str, optional Name of eval data. Used in case when train data also used for evaluation. 
**dmatrix_kwargs : dict Keyword parameters for ``xgb.DMatrix``. """ self._dtrain = self._get_dmatrix(X_y, **dmatrix_kwargs) if add_as_eval_method is not None: self._evals.append((self._dtrain, add_as_eval_method)) def add_eval_data(self, *X_y, eval_method, **dmatrix_kwargs): """ Add evaluation data for actor. Parameters ---------- *X_y : iterable Sequence of ray.ObjectRef objects. First half of sequence is for `X` data, second for `y`. When it is passed in actor, auto-materialization of ray.ObjectRef -> pandas.DataFrame happens. eval_method : str Name of eval data. **dmatrix_kwargs : dict Keyword parameters for ``xgb.DMatrix``. """ self._evals.append((self._get_dmatrix(X_y, **dmatrix_kwargs), eval_method)) def train(self, rabit_args, params, *args, **kwargs): """ Run local XGBoost training. Connects to Rabit Tracker environment to share training data between actors and trains XGBoost booster using `self._dtrain`. Parameters ---------- rabit_args : list List with environment variables for Rabit Tracker. params : dict Booster params. *args : iterable Other parameters for `xgboost.train`. **kwargs : dict Other parameters for `xgboost.train`. Returns ------- dict A dictionary with trained booster and dict of evaluation results as {"booster": xgb.Booster, "history": dict}. """ local_params = params.copy() local_dtrain = self._dtrain local_evals = self._evals local_params["nthread"] = self._nthreads evals_result = dict() s = time.time() with RabitContext(self._rank, rabit_args): bst = xgb.train( local_params, local_dtrain, *args, evals=local_evals, evals_result=evals_result, **kwargs, ) LOGGER.info(f"Local training time: {time.time() - s} s") return {"booster": bst, "history": evals_result} def _get_cluster_cpus(): """ Get number of CPUs available on Ray cluster. Returns ------- int Number of CPUs available on cluster. """ return ray.cluster_resources().get("CPU", 1) def _get_min_cpus_per_node(): """ Get min number of node CPUs available on cluster nodes. 
Returns ------- int Min number of CPUs per node. """ # TODO: max_node_cpus -> min_node_cpus max_node_cpus = min( node.get("Resources", {}).get("CPU", 0.0) for node in ray.nodes() ) return max_node_cpus if max_node_cpus > 0.0 else _get_cluster_cpus() def _get_cpus_per_actor(num_actors): """ Get number of CPUs to use by each actor. Parameters ---------- num_actors : int Number of Ray actors. Returns ------- int Number of CPUs per actor. """ cluster_cpus = _get_cluster_cpus() cpus_per_actor = max( 1, min(int(_get_min_cpus_per_node() or 1), int(cluster_cpus // num_actors)) ) return cpus_per_actor def _get_num_actors(num_actors=None): """ Get number of actors to create. Parameters ---------- num_actors : int, optional Desired number of actors. If is None, integer number of actors will be computed by condition 2 CPUs per 1 actor. Returns ------- int Number of actors to create. """ min_cpus_per_node = _get_min_cpus_per_node() if num_actors is None: num_actors_per_node = max(1, int(min_cpus_per_node // 2)) return num_actors_per_node * len(ray.nodes()) elif isinstance(num_actors, int): assert ( num_actors % len(ray.nodes()) == 0 ), "`num_actors` must be a multiple to number of nodes in Ray cluster." return num_actors else: RuntimeError("`num_actors` must be int or None") def create_actors(num_actors): """ Create ModinXGBoostActors. Parameters ---------- num_actors : int Number of actors to create. Returns ------- list List of pairs (ip, actor). 
""" num_cpus_per_actor = _get_cpus_per_actor(num_actors) # starting from ray 2.6 there is a new field: 'node:__internal_head__' # example: # >>> ray.cluster_resources() # {'object_store_memory': 1036438732.0, 'memory': 2072877467.0, 'node:127.0.0.1': 1.0, 'CPU': 8.0, 'node:__internal_head__': 1.0} node_ips = [ key for key in ray.cluster_resources().keys() if key.startswith("node:") and "__internal_head__" not in key ] num_actors_per_node = max(num_actors // len(node_ips), 1) actors_ips = [ip for ip in node_ips for _ in range(num_actors_per_node)] actors = [ ( node_ip.split("node:")[-1], ModinXGBoostActor.options(resources={node_ip: 0.01}).remote( i, nthread=num_cpus_per_actor ), ) for i, node_ip in enumerate(actors_ips) ] return actors def _split_data_across_actors( actors: List, set_func, X_parts, y_parts, ): """ Split row partitions of data between actors. Parameters ---------- actors : list List of used actors. set_func : callable The function for setting data in actor. X_parts : list Row partitions of X data. y_parts : list Row partitions of y data. """ X_parts_by_actors = _assign_row_partitions_to_actors( actors, X_parts, ) y_parts_by_actors = _assign_row_partitions_to_actors( actors, y_parts, data_for_aligning=X_parts_by_actors, ) for rank, (_, actor) in enumerate(actors): set_func(actor, *(X_parts_by_actors[rank][0] + y_parts_by_actors[rank][0])) def _assign_row_partitions_to_actors( actors: List, row_partitions, data_for_aligning=None, ): """ Assign row_partitions to actors. `row_partitions` will be assigned to actors according to their IPs. If distribution isn't even, partitions will be moved from actor with excess partitions to actor with lack of them. Parameters ---------- actors : list List of used actors. row_partitions : list Row partitions of data to assign. data_for_aligning : dict, optional Data according to the order of which should be distributed `row_partitions`. Used to align y with X. 
Returns ------- dict Dictionary of assigned to actors partitions as {actor_rank: (partitions, order)}. """ num_actors = len(actors) if data_for_aligning is None: parts_ips_ref, parts_ref = zip(*row_partitions) # Group actors which are one the same ip actor_ips = defaultdict(list) for rank, (ip, _) in enumerate(actors): actor_ips[ip].append(rank) # Get distribution of parts between nodes ({ip:[(part, position),..],..}) init_parts_distribution = defaultdict(list) for idx, (ip, part_ref) in enumerate( zip(RayWrapper.materialize(list(parts_ips_ref)), parts_ref) ): init_parts_distribution[ip].append((part_ref, idx)) num_parts = len(parts_ref) min_parts_per_actor = math.floor(num_parts / num_actors) max_parts_per_actor = math.ceil(num_parts / num_actors) num_actors_with_max_parts = num_parts % num_actors row_partitions_by_actors = defaultdict(list) # Fill actors without movement parts between ips for actor_ip, ranks in actor_ips.items(): # Loop across actors which are placed on actor_ip for rank in ranks: num_parts_on_ip = len(init_parts_distribution[actor_ip]) # Check that have something to distribute on this ip if num_parts_on_ip == 0: break # Check that node with `actor_ip` has enough parts for minimal # filling actor with `rank` if num_parts_on_ip >= min_parts_per_actor: # Check that node has enough parts for max filling # actor with `rank` if ( num_parts_on_ip >= max_parts_per_actor and num_actors_with_max_parts > 0 ): pop_slice = slice(0, max_parts_per_actor) num_actors_with_max_parts -= 1 else: pop_slice = slice(0, min_parts_per_actor) row_partitions_by_actors[rank].extend( init_parts_distribution[actor_ip][pop_slice] ) # Delete parts which we already assign del init_parts_distribution[actor_ip][pop_slice] else: row_partitions_by_actors[rank].extend( init_parts_distribution[actor_ip] ) init_parts_distribution[actor_ip] = [] # Remove empty IPs for ip in list(init_parts_distribution): if len(init_parts_distribution[ip]) == 0: init_parts_distribution.pop(ip) # IP's 
def _train(
    dtrain,
    params: Dict,
    *args,
    num_actors=None,
    evals=(),
    **kwargs,
):
    """
    Run distributed training of XGBoost model on Ray engine.

    During work it evenly distributes `dtrain` between workers according
    to IP addresses partitions (in case of not even distribution of `dtrain`
    by nodes, part of partitions will be re-distributed between nodes),
    runs xgb.train on each worker for subset of `dtrain` and reduces training results
    of each worker using Rabit Context.

    Parameters
    ----------
    dtrain : modin.experimental.DMatrix
        Data to be trained against.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
        Any sequence of pairs is accepted and it is never modified.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with trained booster and dict of evaluation results
        as {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain
    dmatrix_kwargs = dtrain.get_dmatrix_params()

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(num_actors)

    # There is no point in having more actors than row partitions.
    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    if evals:
        # Every actor must get a part of every eval set, so the smallest
        # eval set caps the number of actors.
        min_num_parts = num_actors
        for (eval_X, _), eval_method in evals:
            if len(eval_X) < min_num_parts:
                min_num_parts = len(eval_X)
                method_name = eval_method

        if num_actors != min_num_parts:
            num_actors = min_num_parts
            warnings.warn(
                f"`num_actors` is set to {num_actors}, because `evals` data with name `{method_name}` has only {num_actors} partition(s)."
            )

    actors = create_actors(num_actors)

    # Separate the train data from the other eval sets without touching the
    # caller's sequence: the previous in-place `evals.remove(...)` mutated a
    # list argument and failed with AttributeError for a tuple.
    add_as_eval_method = None
    eval_sets = []
    for eval_data, method in evals:
        if eval_data is dtrain:
            add_as_eval_method = method
        else:
            eval_sets.append((eval_data, method))

    for (eval_X, eval_y), eval_method in eval_sets:
        # Split data across workers
        _split_data_across_actors(
            actors,
            lambda actor, *X_y: actor.add_eval_data.remote(
                *X_y, eval_method=eval_method, **dmatrix_kwargs
            ),
            eval_X,
            eval_y,
        )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method, **dmatrix_kwargs
        ),
        X_row_parts,
        y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = RayWrapper.materialize(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
During execution it runs ``xgb.predict`` on each worker for subset of `data` and creates Modin DataFrame with prediction results. Parameters ---------- booster : xgboost.Booster A trained booster. data : modin.experimental.xgboost.DMatrix Input data used for prediction. **kwargs : dict Other parameters are the same as for ``xgboost.Booster.predict``. Returns ------- modin.pandas.DataFrame Modin DataFrame with prediction results. """ s = time.time() dmatrix_kwargs = data.get_dmatrix_params() # Get metadata from DMatrix input_index, input_columns, row_lengths = data.metadata # Infer columns of result def _get_num_columns(booster, n_features, **kwargs): rng = np.random.RandomState(777) test_data = rng.randn(1, n_features) test_predictions = booster.predict( xgb.DMatrix(test_data), validate_features=False, **kwargs ) num_columns = ( test_predictions.shape[1] if len(test_predictions.shape) > 1 else 1 ) return num_columns result_num_columns = _get_num_columns(booster, len(input_columns), **kwargs) new_columns = list(range(result_num_columns)) # Put common data in object store booster = RayWrapper.put(booster) new_columns_ref = RayWrapper.put(new_columns) prediction_refs = [ _map_predict.remote(booster, part, new_columns_ref, dmatrix_kwargs, **kwargs) for _, part in data.data ] predictions = from_partitions( prediction_refs, 0, index=input_index, columns=new_columns, row_lengths=row_lengths, column_widths=[len(new_columns)], ) LOGGER.info(f"Prediction time: {time.time() - s} s") return predictions ================================================ FILE: modin/logging/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. from .class_logger import ClassLogger # noqa: F401 from .config import DEFAULT_LOGGER_NAME, get_logger # noqa: F401 from .logger_decorator import disable_logging, enable_logging # noqa: F401 from .metrics import add_metric_handler, clear_metric_handler, emit_metric __all__ = [ "ClassLogger", "get_logger", "enable_logging", "disable_logging", "emit_metric", "add_metric_handler", "clear_metric_handler", "DEFAULT_LOGGER_NAME", ] ================================================ FILE: modin/logging/class_logger.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Module contains ``ClassLogger`` class. 
class ClassLogger:
    """
    Ensure all subclasses of the class being inherited are logged, too.

    Each subclass is passed through ``enable_logging`` at creation time and
    remembers the layer and log level it was decorated with, so that its own
    subclasses fall back to those values.

    Notes
    -----
    This mixin must go first in class bases declaration to have the desired effect.
    """

    _modin_logging_layer = "PANDAS-API"
    _log_level = LogLevel.INFO

    @classmethod
    def __init_subclass__(
        cls,
        modin_layer: Optional[str] = None,
        class_name: Optional[str] = None,
        log_level: Optional[LogLevel] = None,
        **kwargs: Dict,
    ) -> None:
        """
        Apply logging decorator to all children of ``ClassLogger``.

        Parameters
        ----------
        modin_layer : str, optional
            Specified by the logger (e.g. PANDAS-API). Falls back to the
            value stored on the class when not given.
        class_name : str, optional
            The name of the class the decorator is being applied to.
            Composed from the decorated class name if not specified.
        log_level : LogLevel, optional
            The log level (LogLevel.INFO, LogLevel.DEBUG, LogLevel.WARNING, etc.).
            Falls back to the value stored on the class when not given.
        **kwargs : dict
            Forwarded to the next ``__init_subclass__`` in the MRO.
        """
        # Falsy arguments fall back to what the class already stores.
        if not modin_layer:
            modin_layer = cls._modin_logging_layer
        if not log_level:
            log_level = cls._log_level

        super().__init_subclass__(**kwargs)

        add_logging = enable_logging(modin_layer, class_name, log_level)
        add_logging(cls)

        # Remember the effective settings for further subclasses.
        cls._modin_logging_layer, cls._log_level = modin_layer, log_level
class ModinFormatter(logging.Formatter):  # noqa: PR01
    """Implement custom formatter to log at microsecond granularity."""

    def formatTime(
        self, record: logging.LogRecord, datefmt: Optional[str] = None
    ) -> str:
        """
        Return the creation time of the specified LogRecord as formatted text.

        The timestamp is rendered through ``datetime.strftime``, so `datefmt`
        may contain ``%f`` to obtain microseconds.

        Parameters
        ----------
        record : LogRecord
            The specified LogRecord object.
        datefmt : str, default: None
            Used with ``datetime.strftime()`` to format time record. When it
            is not given, the time is rendered as ``YYYY-MM-DD HH:MM:SS,mmm``.

        Returns
        -------
        str
            Formatted creation time of the record.
        """
        ct = dt.datetime.fromtimestamp(record.created)
        if datefmt:
            return ct.strftime(datefmt)

        # Default layout: seconds, a comma, then zero-padded milliseconds.
        # `record.msecs` is a float, so it has to be truncated to int first;
        # formatting the float directly produced e.g. ",7.0" instead of ",007".
        t = ct.strftime("%Y-%m-%d %H:%M:%S")
        return f"{t},{int(record.msecs):03d}"
def configure_logging() -> None:
    """
    Configure Modin logging by setting up directory structure and formatting.

    Creates the trace, memory and error loggers under one freshly generated
    job id, writes environment information to the trace and memory logs,
    starts the daemon thread that runs ``memory_thread`` and finally marks
    logging as configured.
    """
    global __LOGGER_CONFIGURED__

    timestamp = dt.datetime.now().strftime("%Y.%m.%d_%H-%M-%S")
    job_id = f"{timestamp}_{uuid.uuid4().hex}"

    trace_logger = _create_logger(
        DEFAULT_LOGGER_NAME,
        job_id,
        "trace",
        LogLevel.INFO,
    )

    trace_logger.info(f"OS Version: {platform.platform()}")
    trace_logger.info(f"Python Version: {platform.python_version()}")
    physical_cores = str(psutil.cpu_count(logical=False))
    total_cores = str(psutil.cpu_count(logical=True))
    for trace_line in (
        f"Modin Version: {modin.__version__}",
        f"Pandas Version: {pandas.__version__}",
        f"Physical Cores: {physical_cores}",
        f"Total Cores: {total_cores}",
    ):
        trace_logger.info(trace_line)

    interval = LogMemoryInterval.get()
    memory_logger = _create_logger(
        "modin_memory.logger", job_id, "memory", LogLevel.DEBUG
    )

    memory_stats = psutil.virtual_memory()
    memory_logger.info(f"Memory Total: {bytes_int_to_str(memory_stats.total)}")
    memory_logger.info(f"Memory Available: {bytes_int_to_str(memory_stats.available)}")
    memory_logger.info(f"Memory Used: {bytes_int_to_str(memory_stats.used)}")

    # Daemon thread, so it never keeps the interpreter alive on exit.
    threading.Thread(
        target=memory_thread, args=[memory_logger, interval], daemon=True
    ).start()

    _create_logger("modin.logger.errors", job_id, "error", LogLevel.INFO)

    __LOGGER_CONFIGURED__ = True
def get_logger(namespace: str = "modin.logger.default") -> logging.Logger:
    """
    Return the Modin logger for `namespace`, configuring logging on first use.

    Logging is configured only if that has not happened yet and the log mode
    is not ``"disable"``.

    Parameters
    ----------
    namespace : str, default: "modin.logger.default"
        Which namespace to use for logging.

    Returns
    -------
    logging.Logger
        The Modin logger.
    """
    if not __LOGGER_CONFIGURED__:
        # The log mode is consulted only while logging is still unconfigured.
        if LogMode.get() != "disable":
            configure_logging()
    return logging.getLogger(namespace)
def enable_logging(
    modin_layer: str | Fn = "PANDAS-API",
    name: Optional[str] = None,
    log_level: LogLevel = LogLevel.INFO,
) -> Callable[[Fn], Fn] | Fn:
    """
    Log Decorator used on specific Modin functions or classes.

    Can be applied either bare (``@enable_logging``) or with arguments
    (``@enable_logging("LAYER", name=..., log_level=...)``).

    Parameters
    ----------
    modin_layer : str or object to decorate, default: "PANDAS-API"
        Specified by the logger (e.g. PANDAS-API).
        If it's an object to decorate, call logger_decorator() on it with default arguments.
    name : str, optional
        The name of the object the decorator is being applied to.
        Composed from the decorated object name if not specified.
    log_level : LogLevel, default: LogLevel.INFO
        The log level (LogLevel.INFO, LogLevel.DEBUG, LogLevel.WARNING, etc.).

    Returns
    -------
    func
        A decorator function.
    """
    if not isinstance(modin_layer, str):
        # Bare usage (``@enable_logging`` without parentheses): the first
        # positional argument is the object to decorate, so build a decorator
        # with default settings and apply it right away.
        return enable_logging()(modin_layer)

    def decorator(obj: Fn) -> Fn:
        """Decorate function or class to add logs to Modin API function(s)."""
        if isinstance(obj, type):
            wrappable_kinds = (FunctionType, MethodType, classmethod, staticmethod)
            # Aliased attributes (several names bound to one object) must
            # share a single wrapper, hence the cache keyed by the original.
            wrappers: Dict[Any, Any] = {}
            for attr_name, attr_value in vars(obj).items():
                if not isinstance(attr_value, wrappable_kinds):
                    continue
                if hasattr(attr_value, _MODIN_LOGGER_NOWRAP):
                    continue
                if attr_value not in wrappers:
                    wrappers[attr_value] = enable_logging(
                        modin_layer,
                        f"{name or obj.__name__}.{attr_name}",
                        log_level,
                    )(attr_value)
                setattr(obj, attr_name, wrappers[attr_value])
            return obj
        if isinstance(obj, classmethod):
            return classmethod(decorator(obj.__func__))  # type: ignore [return-value, arg-type]
        if isinstance(obj, staticmethod):
            return staticmethod(decorator(obj.__func__))  # type: ignore [return-value, arg-type]

        assert isinstance(modin_layer, str), "modin_layer is somehow not a string!"

        # Every string below is computed once, at decoration time.
        api_call_name = f"{name or obj.__name__}"
        log_line = f"{modin_layer.upper()}::{api_call_name}"
        metric_name = f"{modin_layer.lower()}.{api_call_name.lower()}"
        start_line = f"START::{log_line}"
        stop_line = f"STOP::{log_line}"

        @wraps(obj)
        def run_and_log(*args: Tuple, **kwargs: Dict) -> Any:
            """
            Compute function with logging if Modin logging is enabled.

            Parameters
            ----------
            *args : tuple
                The function arguments.
            **kwargs : dict
                The function keyword arguments.

            Returns
            -------
            Any
            """
            started_at = perf_counter()
            if LogMode.get() != "disable":
                logger = get_logger()
                logger.log(log_level, start_line)
                try:
                    result = obj(*args, **kwargs)
                    emit_metric(metric_name, perf_counter() - started_at)
                except BaseException as exc:
                    # Only log the exception if a deeper layer of the modin
                    # stack has not already logged it.
                    if not hasattr(exc, "_modin_logged"):
                        # stack_info=True makes the record include the calls
                        # to higher layers of modin as well.
                        get_logger("modin.logger.errors").exception(
                            stop_line, stack_info=True
                        )
                        exc._modin_logged = True  # type: ignore[attr-defined]
                    raise
                finally:
                    logger.log(log_level, stop_line)
                return result

            # Logging is disabled: still time the call and emit the metric.
            result = obj(*args, **kwargs)
            emit_metric(metric_name, perf_counter() - started_at)
            return result

        # make sure we won't decorate multiple times
        return disable_logging(run_and_log)

    return decorator
def clear_metric_handler(handler: Callable[[str, Union[int, float]], None]) -> None:
    """
    Remove a metric handler from Modin.

    Removing a handler that is not registered is a no-op.

    Parameters
    ----------
    handler : Callable, required
        The handler to drop from the module-level handler list.
    """
    try:
        _metric_handlers.remove(handler)
    except ValueError:
        # ``list.remove`` signals "not present" this way; nothing to undo.
        pass
def where(condition, x=None, y=None):
    """
    Pick `x` or `y` depending on `condition`.

    Parameters
    ----------
    condition : bool or object exposing a ``where`` method
        ``True`` selects `x`, ``False`` selects `y`; any other object must
        provide ``where(x=..., y=...)``, to which the call is delegated.
    x : object, optional
        Value used where the condition holds.
    y : object, optional
        Value used where the condition does not hold.

    Returns
    -------
    object
        `x`, `y`, or whatever ``condition.where`` returns.

    Raises
    ------
    NotImplementedError
        If `condition` is neither a Python bool nor an object with ``where``.
    """
    if isinstance(condition, bool):
        # ``bool`` cannot be subclassed, so this matches exactly True/False.
        return x if condition else y
    if not hasattr(condition, "where"):
        raise NotImplementedError(
            f"np.where for condition of type {type(condition)} is not yet supported in Modin."
        )
    return condition.where(x=x, y=y)
def check_can_broadcast_to_output(arr_in: "array", arr_out: "array"):
    """
    Verify that a computed result can be written into a user-supplied `out` array.

    Parameters
    ----------
    arr_in : array
        The result that is about to be stored.
    arr_out : array
        The destination passed by the caller as ``out``.

    Raises
    ------
    TypeError
        If `arr_out` is not a ``modin.numpy.array``.
    NotImplementedError
        If `arr_in` is 1xN and `arr_out` is MxN with M != 1: valid NumPy
        broadcasting, but it would require duplicating the row M times,
        which is not supported (see GH#5831).
    ValueError
        If the shapes cannot be broadcast at all.
    """
    if not isinstance(arr_out, array):
        raise TypeError("return arrays must be of modin.numpy.array type.")
    # Broadcasting is ok if both arrays have matching ndim + shape, OR
    # arr_in is 1xN or a 1D N-element array and arr_out is MxN.
    # Note that 1xN arr_in cannot be broadcasted into a 1D N-element arr_out.
    #
    # This is slightly different from the rules for checking if two inputs
    # of a binary operation can be broadcasted together.

    # Case 1: arrays have matching ndim + shape
    # Case 2a: arr_in is 1D N-element, arr_out is 1D N-element (covered here)
    same_shape = arr_in._ndim == arr_out._ndim and arr_in.shape == arr_out.shape
    # Case 2b: both arrays are 2D, arr_in is 1xN and arr_out is MxN
    row_into_matrix = (
        arr_in._ndim == 2
        and arr_out._ndim == 2
        and arr_in.shape[0] == 1
        and arr_in.shape[1] == arr_out.shape[1]
    )
    # Case 2c: arr_in is 1D N-element, arr_out is 1xN
    vector_into_single_row = (
        arr_in._ndim == 1
        and arr_out._ndim == 2
        and arr_in.shape[0] == arr_out.shape[1]
        and arr_out.shape[0] == 1
    )

    # Case 2b would require duplicating the 1xN result M times to match the shape of out,
    # which we currently do not support. See GH#5831.
    # The row count that matters here is the one of `arr_out`: testing
    # `arr_in.shape[0] != 1` would contradict `row_into_matrix` and leave this
    # branch unreachable.
    if row_into_matrix and arr_out.shape[0] != 1:
        raise NotImplementedError(
            f"Modin does not currently support broadcasting shape {arr_in.shape} to output operand with shape {arr_out.shape}"
        )
    if not (same_shape or row_into_matrix or vector_into_single_row):
        raise ValueError(
            f"non-broadcastable output operand with shape {arr_out.shape} doesn't match the broadcast shape {arr_in.shape}"
        )
def __init__(
    self,
    object=None,
    dtype=None,
    *,
    copy=True,
    order="K",
    subok=False,
    ndmin=0,
    like=numpy._NoValue,
    _query_compiler=None,
    _ndim=None,
):
    """
    Build an array from another array, a Modin DataFrame/Series, or array-like data.

    Parameters
    ----------
    object : array, modin.pandas.DataFrame, modin.pandas.Series or array-like, optional
        Source data. Modin objects contribute a copy of their query compiler;
        a flat list-like goes through ``pd.Series``; anything else goes
        through ``numpy.asarray`` and ``pd.DataFrame``.
    dtype : data-type, optional
        If given, every column that does not already have this dtype is cast to it.
    copy : bool, default: True
        When False and `object` is an array, DataFrame or Series, the new
        array is registered as a sibling of `object` via ``_add_sibling``.
    order, subok, ndmin, like
        Accepted for signature compatibility with ``numpy.array``.
        NOTE(review): they are only collected into ``target_kwargs`` below,
        which is never passed on in this block - confirm whether they were
        meant to reach the NumPy conversion.
    _query_compiler : query compiler, optional
        Internal: pre-built query compiler to wrap directly.
    _ndim : int, optional
        Internal: dimensionality that goes with `_query_compiler`.
    """
    # Arrays sharing this array's query compiler; filled by ``_add_sibling``.
    self._siblings = []
    ErrorMessage.single_warning(
        "Using Modin's new NumPy API. To convert from a Modin object to a NumPy array, either turn off the ModinNumpy flag, or use `modin.pandas.io.to_numpy`."
    )
    # Modin inputs: reuse (a copy of) their query compiler and derive ndim.
    if isinstance(object, array):
        _query_compiler = object._query_compiler.copy()
        if not copy:
            object._add_sibling(self)
        _ndim = object._ndim
    elif isinstance(object, (pd.DataFrame, pd.Series)):
        _query_compiler = object._query_compiler.copy()
        if not copy:
            object._add_sibling(self)
        _ndim = 1 if isinstance(object, pd.Series) else 2
    if _query_compiler is not None:
        self._query_compiler = _query_compiler
        self._ndim = _ndim
        # A single dtype describes the whole array: the common type of all columns.
        new_dtype = pandas.core.dtypes.cast.find_common_type(
            list(self._query_compiler.dtypes.values)
        )
    elif is_list_like(object) and not is_list_like(object[0]):
        # Flat list-like -> 1D array backed by a Series.
        # NOTE(review): `object[0]` assumes a non-empty, indexable input.
        series = pd.Series(object)
        self._query_compiler = series._query_compiler
        self._ndim = 1
        new_dtype = self._query_compiler.dtypes.values[0]
    else:
        # Keep only the keyword arguments whose value differs from the default.
        # NOTE(review): `target_kwargs` is not used after this loop.
        target_kwargs = {
            "dtype": None,
            "copy": True,
            "order": "K",
            "subok": False,
            "ndmin": 0,
            "like": numpy._NoValue,
        }
        for key, value in target_kwargs.copy().items():
            if value == locals()[key]:
                target_kwargs.pop(key)
            else:
                target_kwargs[key] = locals()[key]
        arr = numpy.asarray(object)
        assert arr.ndim in (
            1,
            2,
        ), "modin.numpy currently only supports 1D and 2D objects."
        self._ndim = len(arr.shape)
        # NOTE(review): with assertions enabled the assert above already
        # rejects ndim > 2, so this branch is only reachable under ``-O``.
        if self._ndim > 2:
            ErrorMessage.not_implemented(
                "NumPy arrays with dimensions higher than 2 are not yet supported."
            )
        self._query_compiler = pd.DataFrame(arr)._query_compiler
        new_dtype = arr.dtype
    # These two lines are necessary so that our query compiler does not keep track of indices
    # and try to map like indices to like indices. (e.g. if we multiply two arrays that used
    # to be dataframes, and the dataframes had the same column names but ordered differently
    # we want to do a simple broadcast where we only consider position, as numpy would, rather
    # than pair columns with the same name and multiply them.)
    self._query_compiler = self._query_compiler.reset_index(drop=True)
    self._query_compiler.columns = range(len(self._query_compiler.columns))
    # An explicit `dtype` argument wins over the inferred one.
    new_dtype = new_dtype if dtype is None else dtype
    if isinstance(new_dtype, pandas.Float64Dtype):
        # Replace the pandas extension dtype with the plain NumPy float64.
        new_dtype = numpy.float64
    # Cast only the columns that do not already have the target dtype.
    cols_with_wrong_dtype = self._query_compiler.dtypes != new_dtype
    if cols_with_wrong_dtype.any():
        self._query_compiler = self._query_compiler.astype(
            {
                col_name: new_dtype
                for col_name in self._query_compiler.columns[cols_with_wrong_dtype]
            }
        )
    # Created lazily by ``__getitem__`` / ``__setitem__``.
    self.indexer = None
def _validate_axis(self, axis):
    """
    Check that the provided axis argument is valid on this array.

    Only negative axes are normalized and range-checked here; ``None`` and
    non-negative values are returned unchanged, so callers remain responsible
    for rejecting non-negative axes that are too large.

    Parameters
    ----------
    axis : int, optional
        The axis argument passed to the function.

    Returns
    -------
    int, optional
        Axis to apply the function over (None, 0, or 1).

    Raises
    ------
    numpy.exceptions.AxisError
        If a negative axis is out of range for this array
        (``numpy.AxisError`` on NumPy releases without ``numpy.exceptions``).
    """
    if axis is not None and axis < 0:
        # NumPy 2.0 removed the top-level ``numpy.AxisError`` alias; the class
        # lives in ``numpy.exceptions`` since NumPy 1.25. Fall back to the
        # top-level name so older NumPy releases keep working.
        axis_error = getattr(numpy, "exceptions", numpy).AxisError
        new_axis = axis + self._ndim
        if self._ndim == 1 and new_axis != 0:
            raise axis_error(axis, 1)
        elif self._ndim == 2 and new_axis not in [0, 1]:
            raise axis_error(axis, 2)
        return new_axis
    return axis
) args = [] for input in inputs: if isinstance(input, array): input = input._to_numpy() if isinstance(input, pd.DataFrame): input = input._query_compiler.to_numpy() if isinstance(input, pd.Series): input = input._query_compiler.to_numpy().flatten() args += [input] output = self._to_numpy().__array_ufunc__( ufunc, method, *args, **kwargs ) if is_scalar(output): return output return array(output) new_ufunc = None out_ndim = -1 if method == "__call__": if len(inputs) == 1: new_ufunc = Map.register(ufunc) out_ndim = len(inputs[0].shape) else: new_ufunc = Binary.register(ufunc) out_ndim = max( [len(inp.shape) for inp in inputs if hasattr(inp, "shape")] ) elif method == "reduce": if len(inputs) == 1: new_ufunc = Reduce.register(ufunc, axis=kwargs.get("axis", None)) if kwargs.get("axis", None) is None: out_ndim = 0 else: out_ndim = len(inputs[0].shape) - 1 elif method == "accumulate": if len(inputs) == 1: new_ufunc = Reduce.register(ufunc, axis=None) out_ndim = 0 if new_ufunc is None: ErrorMessage.single_warning( f"{ufunc} is not yet supported in Modin. Defaulting to NumPy." ) args = [] for input in inputs: if isinstance(input, array): input = input._to_numpy() if isinstance(input, pd.DataFrame): input = input._query_compiler.to_numpy() if isinstance(input, pd.Series): input = input._query_compiler.to_numpy().flatten() args += [input] output = self._to_numpy().__array_ufunc__(ufunc, method, *args, **kwargs) if is_scalar(output): return output return array(output) args = [] for input in inputs: input = try_convert_from_interoperable_type(input) if not (isinstance(input, array) or is_scalar(input)): input = array(input) args += [ input._query_compiler if hasattr(input, "_query_compiler") else input ] out_kwarg = kwargs.get("out", None) if out_kwarg is not None: # If `out` is a modin.numpy.array, `kwargs.get("out")` returns a 1-tuple # whose only element is that array, so we need to unwrap it from the tuple. 
def where(self, x=None, y=None):
    """
    Choose values from `x` or `y` using this boolean array as the condition.

    Cases that need broadcasting, or where both `x` and `y` are scalars,
    default to NumPy; every path returns a Modin ``array``.

    Parameters
    ----------
    x : array or scalar, optional
        Values taken where the condition is True.
    y : array or scalar, optional
        Values taken where the condition is False.

    Returns
    -------
    array
        The selected values.

    Raises
    ------
    NotImplementedError
        If this array does not have a boolean dtype.
    ValueError
        If `x` or `y` is neither an array nor a scalar, or if the shapes
        cannot be broadcast together.
    """
    if not is_bool_dtype(self.dtype):
        raise NotImplementedError(
            "Modin currently only supports where on condition arrays with boolean dtype."
        )
    if x is None and y is None:
        ErrorMessage.single_warning(
            "np.where method with only condition specified is not yet supported in Modin. Defaulting to NumPy."
        )
        condition = self._to_numpy()
        return array(numpy.where(condition))
    x, y = try_convert_from_interoperable_type(
        x
    ), try_convert_from_interoperable_type(y)
    if not (
        (isinstance(x, array) or is_scalar(x))
        and (isinstance(y, array) or is_scalar(y))
    ):
        raise ValueError(
            "np.where requires x and y to either be np.arrays or scalars."
        )
    if is_scalar(x) and is_scalar(y):
        ErrorMessage.single_warning(
            "np.where not supported when both x and y are scalars. Defaulting to NumPy."
        )
        return array(numpy.where(self._to_numpy(), x, y))
    if is_scalar(x) and not is_scalar(y):
        if self._ndim < y._ndim:
            if not self.shape[0] == y.shape[1]:
                raise ValueError(
                    f"operands could not be broadcast together with shapes {self.shape} {y.shape}"
                )
            ErrorMessage.single_warning(
                "np.where method where condition must be broadcast is not yet available in Modin. Defaulting to NumPy."
            )
            return array(numpy.where(self._to_numpy(), x, y._to_numpy()))
        elif self._ndim == y._ndim:
            if not self.shape == y.shape:
                raise ValueError(
                    f"operands could not be broadcast together with shapes {self.shape} {y.shape}"
                )
            # The query compiler's `where` keeps the caller's values where the
            # mask holds, so invert the condition to keep `y` where it is False.
            return array(
                _query_compiler=y._query_compiler.where((~self)._query_compiler, x),
                _ndim=y._ndim,
            )
        else:
            ErrorMessage.single_warning(
                "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy."
            )
            # Wrap the NumPy result like every other fallback does, so callers
            # always get a Modin array back.
            return array(numpy.where(self._to_numpy(), x, y._to_numpy()))
    if not is_scalar(x) and is_scalar(y):
        if self._ndim < x._ndim:
            if not self.shape[0] == x.shape[1]:
                raise ValueError(
                    f"operands could not be broadcast together with shapes {self.shape} {x.shape}"
                )
            ErrorMessage.single_warning(
                "np.where method where condition must be broadcast is not yet available in Modin. Defaulting to NumPy."
            )
            return array(numpy.where(self._to_numpy(), x._to_numpy(), y))
        elif self._ndim == x._ndim:
            if not self.shape == x.shape:
                raise ValueError(
                    f"operands could not be broadcast together with shapes {self.shape} {x.shape}"
                )
            return array(
                _query_compiler=x._query_compiler.where(self._query_compiler, y),
                _ndim=x._ndim,
            )
        else:
            ErrorMessage.single_warning(
                "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy."
            )
            return array(numpy.where(self._to_numpy(), x._to_numpy(), y))
    # Both `x` and `y` are arrays from here on.
    if not (x.shape == y.shape and y.shape == self.shape):
        ErrorMessage.single_warning(
            "np.where method with broadcast is not yet available in Modin. Defaulting to NumPy."
        )
        return array(numpy.where(self._to_numpy(), x._to_numpy(), y._to_numpy()))
    return array(
        _query_compiler=x._query_compiler.where(
            self._query_compiler, y._query_compiler
        ),
        _ndim=self._ndim,
    )
out is not None: return fix_dtypes_and_determine_return( array(numpy.array([[result]]))._query_compiler, 2, dtype, out, truthy_where, ) else: return array([[initial]]) return result if truthy_where else initial if apply_axis > 1: raise numpy.AxisError(axis, 2) target = where.where(self, initial) if isinstance(where, array) else self result = target._query_compiler.max(axis=apply_axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: if initial is not None: result = max(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] return result if truthy_where else initial if not keepdims and apply_axis != 1: result = result.transpose() if initial is not None and out is not None: out._update_inplace((numpy.ones_like(out) * initial)._query_compiler) intermediate = fix_dtypes_and_determine_return( result, new_ndim, dtype, out, truthy_where ) if initial is not None: intermediate._update_inplace( (intermediate > initial).where(intermediate, initial)._query_compiler ) if truthy_where or out is not None: return intermediate else: return numpy.ones_like(intermediate) * initial def min( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): check_kwargs(keepdims=keepdims, where=where) truthy_where = bool(where) apply_axis = self._validate_axis(axis) if initial is None and where is not True: raise ValueError( "reduction operation 'minimum' does not have an identity, so to use a where mask one has to specify 'initial'" ) if self._ndim == 1: if apply_axis == 1: raise numpy.AxisError(1, 1) target = where.where(self, initial) if isinstance(where, array) else self result = target._query_compiler.min(axis=0) if keepdims: if initial is not None and result.gt(initial).any(): result = pd.Series([initial])._query_compiler if initial is not None and out is not None: out._update_inplace( (numpy.ones_like(out) * initial)._query_compiler ) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a 
reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) if truthy_where or out is not None: return fix_dtypes_and_determine_return( result, 1, dtype, out, truthy_where ) else: return array([initial]) if initial is not None: result = min(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] return result if truthy_where else initial if apply_axis is None: target = where.where(self, initial) if isinstance(where, array) else self result = target._query_compiler.min(axis=0).min(axis=1).to_numpy()[0, 0] if initial is not None: result = min(result, initial) if keepdims: if out is not None and out.shape != (1, 1): raise ValueError( f"operand was set up as a reduction along axis 1, but the length of the axis is {out.shape[0]} (it has to be 1)" ) if initial is not None and out is not None: out._update_inplace( (numpy.ones_like(out) * initial)._query_compiler ) if truthy_where or out is not None: return fix_dtypes_and_determine_return( array(numpy.array([[result]]))._query_compiler, 2, dtype, out, truthy_where, ) else: return array([[initial]]) return result if truthy_where else initial if apply_axis > 1: raise numpy.AxisError(axis, 2) target = where.where(self, initial) if isinstance(where, array) else self result = target._query_compiler.min(axis=apply_axis) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: if initial is not None: result = min(result.to_numpy()[0, 0], initial) else: result = result.to_numpy()[0, 0] return result if truthy_where else initial if not keepdims and apply_axis != 1: result = result.transpose() if initial is not None and out is not None: out._update_inplace((numpy.ones_like(out) * initial)._query_compiler) intermediate = fix_dtypes_and_determine_return( result, new_ndim, dtype, out, truthy_where ) if initial is not None: intermediate._update_inplace( (intermediate < initial).where(intermediate, initial)._query_compiler ) if truthy_where or out is not None: return 
def _preprocess_binary_op(self, other, cast_input_types=True, dtype=None, out=None):
    """
    Processes arguments and performs dtype conversions necessary to perform binary
    operations. If the arguments to the binary operation are a 1D object and a 2D object,
    then it will swap the order of the caller and callee return values in order to
    facilitate native broadcasting by modin.

    This function may modify `self._query_compiler` and `other._query_compiler` by replacing
    it with the result of `astype`. Nothing is modified when `other` is rejected
    with a TypeError.

    Parameters
    ----------
    other : array or scalar
        The RHS of the binary operation.
    cast_input_types : bool, default: True
        If specified, the columns of the caller/callee query compilers will be assigned
        dtypes in the following priority, depending on what values were specified:
        (1) the `dtype` argument,
        (2) the dtype of the `out` array,
        (3) the common parent dtype of `self` and `other`.
        If this flag is not specified, then the resulting dtype is left to be determined
        by the result of the modin operation.
    dtype : numpy type, optional
        The desired dtype of the output array.
    out : array, optional
        Existing array object to which to assign the computation's result.

    Returns
    -------
    tuple
        Returns a 4-tuple with the following elements:
        - 0: QueryCompiler object that is the LHS of the binary operation, with types converted
             as needed.
        - 1: QueryCompiler object OR scalar that is the RHS of the binary operation, with types
             converted as needed.
        - 2: The ndim of the result.
        - 3: kwargs to pass to the query compiler.

    Raises
    ------
    TypeError
        If `other` is neither a scalar nor a modin.numpy array after conversion.
    ValueError
        If the operand shapes cannot be broadcast together.
    NotImplementedError
        If two 1D operands differ in length and one of them has length 1.
    """
    other = try_convert_from_interoperable_type(other)
    other_is_array = isinstance(other, array)
    # Reject unsupported operands before anything is cast: otherwise the
    # `other._query_compiler` access below would fail with an AttributeError
    # instead of the intended TypeError.
    if not (other_is_array or is_scalar(other)):
        raise TypeError(
            f"Unsupported operand type(s): '{type(self)}' and '{type(other)}'"
        )

    if cast_input_types:
        operand_dtype = (
            self.dtype
            if not other_is_array
            else pandas.core.dtypes.cast.find_common_type([self.dtype, other.dtype])
        )
        out_dtype = (
            dtype
            if dtype is not None
            else (out.dtype if out is not None else operand_dtype)
        )
        self._query_compiler = self._query_compiler.astype(
            {col_name: out_dtype for col_name in self._query_compiler.columns}
        )
    if is_scalar(other):
        # Return early, since no need to check broadcasting behavior if RHS is a scalar
        return (self._query_compiler, other, self._ndim, {})
    if cast_input_types:
        other._query_compiler = other._query_compiler.astype(
            {col_name: out_dtype for col_name in other._query_compiler.columns}
        )

    cannot_broadcast = f"operands could not be broadcast together with shapes {self.shape} {other.shape}"

    if self._ndim != other._ndim:
        # In this case, we have a 1D object doing a binary op with a 2D object
        caller, callee = (self, other) if self._ndim == 2 else (other, self)
        if callee.shape[0] != caller.shape[1]:
            raise ValueError(cannot_broadcast)
        return (
            caller._query_compiler,
            callee._query_compiler,
            caller._ndim,
            {"broadcast": True, "axis": 1},
        )

    if self.shape == other.shape:
        return (
            self._query_compiler,
            other._query_compiler,
            self._ndim,
            {"broadcast": False},
        )

    if self._ndim == 1:
        # 1D shapes have a single entry, so the `shape[1]` lookups used for
        # the 2D case below would raise an IndexError here.
        if 1 in (self.shape[0], other.shape[0]):
            raise NotImplementedError(
                f"Modin does not currently support broadcasting 1D arrays with shapes {self.shape} {other.shape}"
            )
        raise ValueError(cannot_broadcast)

    # In this case, we either have two mismatched objects trying to do an operation
    # or a nested 1D object that must be broadcasted trying to do an operation.
    broadcast = True
    if self.shape[0] == other.shape[0]:
        matched_dimension = 0
    elif self.shape[1] == other.shape[1]:
        matched_dimension = 1
        broadcast = False
    else:
        raise ValueError(cannot_broadcast)
    # The dimension that does not match must have length 1 on one side.
    if (
        self.shape[matched_dimension ^ 1] == 1
        or other.shape[matched_dimension ^ 1] == 1
    ):
        return (
            self._query_compiler,
            other._query_compiler,
            self._ndim,
            {"broadcast": broadcast, "axis": matched_dimension},
        )
    raise ValueError(cannot_broadcast)
def _greater_equal(
    self,
    x2,
    /,
    out=None,
    *,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise comparison ``self >= x2``.

    Parameters
    ----------
    x2 : array or scalar
        The RHS of the comparison.
    out : array, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.
    where, casting, order, subok
        Validated through `check_kwargs`; `where` is also forwarded to
        `fix_dtypes_and_determine_return`.
    dtype : numpy type, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.

    Returns
    -------
    array
        For a scalar `x2`, a new array wrapping the query compiler's `ge` result;
        otherwise whatever `fix_dtypes_and_determine_return` produces.
    """
    check_kwargs(where=where, casting=casting, order=order, subok=subok)
    if is_scalar(x2):
        scalar_cmp = self._query_compiler.ge(x2)
        return array(_query_compiler=scalar_cmp, _ndim=self._ndim)
    lhs_qc, rhs_qc, result_ndim, op_kwargs = self._preprocess_binary_op(
        x2, cast_input_types=False, dtype=dtype, out=out
    )
    # When the preprocessing swapped the operands (1D_object >= 2D_object), the
    # 2D object is now on the left, so the mirrored comparison
    # 2D_object <= 1D_object has to be issued instead.
    operands_swapped = lhs_qc != self._query_compiler
    compare = lhs_qc.le if operands_swapped else lhs_qc.ge
    compared = compare(rhs_qc, **op_kwargs)
    return fix_dtypes_and_determine_return(compared, result_ndim, dtype, out, where)
def _less_equal(
    self,
    x2,
    /,
    out=None,
    *,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise comparison ``self <= x2``.

    Parameters
    ----------
    x2 : array or scalar
        The RHS of the comparison.
    out : array, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.
    where, casting, order, subok
        Validated through `check_kwargs`; `where` is also forwarded to
        `fix_dtypes_and_determine_return`.
    dtype : numpy type, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.

    Returns
    -------
    array
        For a scalar `x2`, a new array wrapping the query compiler's `le` result;
        otherwise whatever `fix_dtypes_and_determine_return` produces.
    """
    check_kwargs(where=where, casting=casting, order=order, subok=subok)
    if is_scalar(x2):
        # Scalars need no broadcasting checks, so compare directly.
        return array(_query_compiler=self._query_compiler.le(x2), _ndim=self._ndim)
    # Inputs are not cast here (cast_input_types=False).
    caller, callee, new_ndim, kwargs = self._preprocess_binary_op(
        x2, cast_input_types=False, dtype=dtype, out=out
    )
    if caller != self._query_compiler:
        # In this case, we are doing an operation that looks like this 1D_object <= 2D_object.
        # For Modin to broadcast directly, we have to swap it so that the operation is actually
        # 2D_object >= 1D_object.
        result = caller.ge(callee, **kwargs)
    else:
        result = caller.le(callee, **kwargs)
    return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where)
def _unary_math_operator(
    self,
    opName,
    *args,
    out=None,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Apply the query-compiler method named `opName` to every element.

    Parameters
    ----------
    opName : str
        Name of the query compiler method to call (looked up with `getattr`).
    *args
        Positional arguments forwarded to that method.
    out : array, optional
        If given, its query compiler is replaced by the result and `out` is returned.
    where, casting, order, subok
        Validated through `check_kwargs`.
    dtype : numpy type, optional
        Dtype the input is cast to before the operation and the result is cast to
        afterwards. Falls back to `out.dtype`, then `self.dtype`, for the input cast.

    Returns
    -------
    array
        `out` when it was supplied, otherwise a new array with the same ndim as `self`.
    """
    if dtype is not None:
        working_dtype = dtype
    elif out is not None:
        working_dtype = out.dtype
    else:
        working_dtype = self.dtype
    check_kwargs(order=order, casting=casting, subok=subok, where=where)

    source_qc = self._query_compiler
    casted_qc = source_qc.astype(dict.fromkeys(source_qc.columns, working_dtype))
    operation = getattr(casted_qc, opName)
    computed = operation(*args)
    if dtype is not None:
        computed = computed.astype(dict.fromkeys(computed.columns, dtype))

    if out is None:
        return array(_query_compiler=computed, _ndim=self._ndim)
    out = try_convert_from_interoperable_type(out)
    check_can_broadcast_to_output(self, out)
    out._query_compiler = computed
    return out
def hstack(self, others, dtype=None, casting="same_kind"):
    """
    Concatenate `self` with `others`, `self` coming first.

    1D arrays are concatenated through the query compiler along axis 0, 2D arrays
    along axis 1 (after checking that all arrays agree on dimension 0).

    Parameters
    ----------
    others : iterable of array
        Arrays to stack after `self`. Must have the same ndim as `self`.
    dtype : numpy type, optional
        Dtype of the result. Defaults to the common dtype of `self` and `others`.
    casting : str, default: "same_kind"
        Validated through `check_kwargs`.

    Returns
    -------
    array
        New array with the same ndim as `self`.

    Raises
    ------
    ValueError
        If an array in `others` has a different ndim, or (for 2D) a different
        size along dimension 0, than `self`.
    """
    check_kwargs(casting=casting)
    # Materialize once: `others` is traversed several times below, which would
    # silently exhaust a one-shot iterable.
    others = list(others)
    new_dtype = (
        dtype
        if dtype is not None
        else pandas.core.dtypes.cast.find_common_type(
            [self.dtype] + [a.dtype for a in others]
        )
    )
    # `self` is the array at index 0 of the stacked sequence, so `others` are
    # numbered from 1 in the error messages.
    for index, other in enumerate(others, start=1):
        if other._ndim != self._ndim:
            raise ValueError(
                f"all the input arrays must have same number of dimensions, but the array at index 0 has {self._ndim} dimension(s) and the array at index {index} has {other._ndim} dimension(s)"
            )
    if self._ndim == 1:
        new_qc = self._query_compiler.concat(0, [o._query_compiler for o in others])
    else:
        for index, other in enumerate(others, start=1):
            if other.shape[0] != self.shape[0]:
                raise ValueError(
                    f"all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size {self.shape[0]} and the array at index {index} has size {other.shape[0]}"
                )
        new_qc = self._query_compiler.concat(1, [o._query_compiler for o in others])
    return array(_query_compiler=new_qc, _ndim=self._ndim, dtype=new_dtype)
def _compute_masked_variance(self, mask, output_dtype, axis, ddof):
    """
    Compute the variance along `axis` over the elements selected by `mask`.

    Parameters
    ----------
    mask : array
        Boolean selection; elements where it is false are excluded.
    output_dtype : numpy type
        Dtype passed to `self.mean` when computing the masked mean.
    axis : int
        Axis to reduce over.
    ddof : int
        Delta degrees of freedom subtracted from the element count.

    Returns
    -------
    QueryCompiler
        The reduced variance, with NaN wherever an included element was NaN.

    Raises
    ------
    NotImplementedError
        For 2D arrays reduced along axis 0.
    """
    if axis == 0 and self._ndim != 1:
        # Our broadcasting is wrong, so we can't do the final subtraction at the end.
        raise NotImplementedError(
            "Masked variance on 2D arrays along axis = 0 is currently unsupported."
        )
    axis_mean = self.mean(axis, output_dtype, keepdims=True, where=mask)
    # Excluded elements become NaN so that the NaN-skipping reductions ignore them.
    nan_masked = mask.where(self, numpy.nan)
    if self._ndim == 1:
        axis_mean = axis_mean._to_numpy()[0]
        target = nan_masked._query_compiler.sub(axis_mean).pow(2).sum(axis=axis)
    else:
        target = (nan_masked - axis_mean)._query_compiler.pow(2).sum(axis=axis)
    # Count only the elements that contributed to the sum above. Counting on the
    # zero-filled array would treat every masked-out element as a sample and
    # shrink the variance.
    num_elems = nan_masked._query_compiler.notna().sum(axis=axis, skipna=False)
    num_elems = num_elems.sub(ddof)
    target = target.truediv(num_elems)
    # With excluded elements set to 0, a non-skipping sum is NaN exactly where an
    # included element is NaN; those positions must be NaN in the result as well.
    na_propagation_mask = mask.where(self, 0)._query_compiler.sum(
        axis=axis, skipna=False
    )
    target = target.where(na_propagation_mask.notna(), numpy.nan)
    return target
`NaN`, we know that the # result of `mean` must be `NaN`. This is a fastpath to see if any unmasked elements # are `NaN`. contains_na_check = ( where.where(self, 0) if isinstance(where, array) else self ) if ( contains_na_check._query_compiler.isna() .any(axis=1) .any(axis=0) .to_numpy()[0, 0] ): return numpy.nan result = where.where(self, numpy.nan) if isinstance(where, array) else self # Since our current QueryCompiler does not have a variance that reduces 2D objects to # a single value, we need to calculate the variance ourselves. First though, we need # to figure out how many objects that we are taking the variance over (since any # entries in our array that are `numpy.nan` must be ignored when taking the variance, # and so cannot be included in the final division (of the sum over num total elements)) num_na_elements = ( result._query_compiler.isna().sum(axis=1).sum(axis=0).to_numpy()[0, 0] ) num_total_elements = prod(self.shape) - num_na_elements mean = ( numpy.array( [result._query_compiler.sum(axis=1).sum(axis=0).to_numpy()[0, 0]], dtype=out_dtype, ) / num_total_elements )[0] result = ( numpy.array( [ result._query_compiler.sub(mean) .pow(2) .sum(axis=1) .sum(axis=0) .to_numpy()[0, 0] ], dtype=out_dtype, ) / (num_total_elements - ddof) )[0] if keepdims: if out is not None and out.shape != (1, 1): raise ValueError( f"operand was set up as a reduction along axis 1, but the length of the axis is {out.shape[0]} (it has to be 1)" ) if out is not None: out._query_compiler = ( numpy.ones_like(out) * numpy.nan )._query_compiler if truthy_where or out is not None: return fix_dtypes_and_determine_return( array(numpy.array([[result]])) .astype(out_dtype) ._query_compiler, 2, dtype, out, truthy_where, ) else: return array([[numpy.nan]], dtype=out_dtype) return result if truthy_where else numpy.nan if apply_axis > 1: raise numpy.AxisError(axis, 2) if isinstance(where, array): result = self._compute_masked_variance(where, out_dtype, apply_axis, ddof) else: result = 
def _compute_masked_mean(self, mask, output_dtype, axis):
    """
    Compute the mean along `axis` over the elements selected by `mask`.

    Parameters
    ----------
    mask : array
        Boolean selection; elements where it is false are excluded.
    output_dtype : numpy type
        Dtype the masked data is cast to before taking the mean.
    axis : int
        Axis to reduce over.

    Returns
    -------
    QueryCompiler
        The reduced mean, with NaN wherever an included element was NaN.
    """
    # pandas skips NaN by default while NumPy propagates it. Masking is done by
    # turning excluded elements into NaN and letting the pandas-style mean skip them.
    nan_filled_qc = mask.where(self, numpy.nan)._query_compiler
    cast_spec = {col_name: output_dtype for col_name in nan_filled_qc.columns}
    masked_mean = nan_filled_qc.astype(cast_spec).mean(axis=axis)

    # NaNs that were NOT masked out still have to reach the output. Summing without
    # skipping NaN (excluded elements set to 0) yields NaN exactly where NumPy would
    # propagate one, and those positions are overwritten with NaN in the result.
    zero_filled_qc = mask.where(self, 0)._query_compiler
    nan_detector = zero_filled_qc.sum(axis=axis, skipna=False)
    return masked_mean.where(nan_detector.notna(), numpy.nan)
return result.to_numpy()[0, 0] if where else numpy.nan if apply_axis is None: # If any of the (non-masked) elements of our array are `NaN`, we know that the # result of `mean` must be `NaN`. This is a fastpath to see if any unmasked elements # are `NaN`. contains_na_check = ( where.where(self, 0) if isinstance(where, array) else self ) if ( contains_na_check._query_compiler.isna() .any(axis=1) .any(axis=0) .to_numpy()[0, 0] ): return numpy.nan result = where.where(self, numpy.nan) if isinstance(where, array) else self # Since our current QueryCompiler does not have a mean that reduces 2D objects to # a single value, we need to calculate the mean ourselves. First though, we need # to figure out how many objects that we are taking the mean over (since any # entries in our array that are `numpy.nan` must be ignored when taking the mean, # and so cannot be included in the final division (of the sum over num total elements)) num_na_elements = ( result._query_compiler.isna().sum(axis=1).sum(axis=0).to_numpy()[0, 0] ) num_total_elements = prod(self.shape) - num_na_elements result = ( numpy.array( [result._query_compiler.sum(axis=1).sum(axis=0).to_numpy()[0, 0]], dtype=out_dtype, ) / num_total_elements )[0] if keepdims: if out is not None and out.shape != (1, 1): raise ValueError( f"operand was set up as a reduction along axis 1, but the length of the axis is {out.shape[0]} (it has to be 1)" ) if out is not None: out._update_inplace( (numpy.ones_like(out) * numpy.nan)._query_compiler ) if truthy_where or out is not None: return fix_dtypes_and_determine_return( array(numpy.array([[result]])) .astype(out_dtype) ._query_compiler, 2, dtype, out, truthy_where, ) else: return array([[numpy.nan]], dtype=out_dtype) return result if truthy_where else numpy.nan if apply_axis > 1: raise numpy.AxisError(axis, 2) if isinstance(where, array): result = self._compute_masked_mean(where, out_dtype, apply_axis) else: result = self._query_compiler.astype( {col_name: out_dtype for col_name in 
def __add__(
    self,
    x2,
    out=None,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise sum ``self + x2``.

    Parameters
    ----------
    x2 : array or scalar
        The RHS of the addition.
    out : array, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.
    where, casting, order, subok
        Validated through `check_kwargs`; `where` is also forwarded to
        `fix_dtypes_and_determine_return`.
    dtype : numpy type, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`.

    Returns
    -------
    array
        Whatever `fix_dtypes_and_determine_return` produces for the summed data.
    """
    check_kwargs(order=order, subok=subok, casting=casting, where=where)
    prepared = self._preprocess_binary_op(x2, dtype=dtype, out=out)
    lhs_qc, rhs, result_ndim, op_kwargs = prepared
    # Addition is commutative, so a possible operand swap needs no special handling.
    summed = lhs_qc.add(rhs, **op_kwargs)
    return fix_dtypes_and_determine_return(summed, result_ndim, dtype, out, where)
def floor_divide(
    self,
    x2,
    out=None,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise floor division ``self // x2``.

    Parameters
    ----------
    x2 : array or scalar
        The divisor.
    out : array, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`;
        its dtype is used as the output dtype when `dtype` is not given.
    where, casting, order, subok
        Validated through `check_kwargs`; `where` is also forwarded to
        `fix_dtypes_and_determine_return`.
    dtype : numpy type, optional
        Desired output dtype; takes priority over `out.dtype` and the operand dtype.

    Returns
    -------
    array
        Whatever `fix_dtypes_and_determine_return` produces for the quotient.

    Raises
    ------
    NotImplementedError
        If `_preprocess_binary_op` swapped the operands (1D `self`, 2D `x2`).
    """
    # The output dtype only decides whether the integer division-by-zero fix-up
    # below is applied; it follows the priority dtype > out.dtype > operand dtype.
    operand_dtype = (
        self.dtype
        if not isinstance(x2, array)
        else pandas.core.dtypes.cast.find_common_type([self.dtype, x2.dtype])
    )
    out_dtype = (
        dtype
        if dtype is not None
        else (out.dtype if out is not None else operand_dtype)
    )
    check_kwargs(order=order, subok=subok, casting=casting, where=where)
    if is_scalar(x2):
        result = self._query_compiler.floordiv(x2)
        if x2 == 0 and numpy.issubdtype(out_dtype, numpy.integer):
            # NumPy's floor_divide by 0 works differently from pandas', so we need to fix
            # the output.
            # Infinities are replaced by 0, and positions where `self` equals 0
            # are set to 0 as well.
            result = (
                result.replace(numpy.inf, 0)
                .replace(-numpy.inf, 0)
                .where(self._query_compiler.ne(0), 0)
            )
        return fix_dtypes_and_determine_return(
            result, self._ndim, dtype, out, where
        )
    caller, callee, new_ndim, kwargs = self._preprocess_binary_op(
        x2, dtype=dtype, out=out
    )
    if caller != self._query_compiler:
        # Modin does not correctly support broadcasting when the caller of the function is
        # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using
        # commutativity, and `rfloordiv` also works incorrectly. GH#5529
        raise NotImplementedError(
            "Using floor_divide with broadcast is not currently available in Modin."
        )
    result = caller.floordiv(callee, **kwargs)
    # NOTE(review): `callee.eq(0).any()` is a query compiler used in a boolean
    # context; whether it evaluates to "any zero present" depends on the query
    # compiler's truthiness -- confirm.
    if callee.eq(0).any() and numpy.issubdtype(out_dtype, numpy.integer):
        # NumPy's floor_divide by 0 works differently from pandas', so we need to fix
        # the output.
        # Infinities are replaced by 0, and positions where the divisor equals 0
        # are set to 0 as well.
        result = (
            result.replace(numpy.inf, 0)
            .replace(-numpy.inf, 0)
            .where(callee.ne(0), 0)
        )
    return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where)
def prod(
    self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True
):
    """
    Return the product of the array's elements, optionally along one axis.

    Parameters
    ----------
    axis : int, optional
        Axis to reduce over, normalized through `self._validate_axis`. When the
        normalized axis is None on a 2D array, all elements are multiplied together.
    dtype : numpy type, optional
        Dtype the data is cast to before reducing. Defaults to `out.dtype` when
        `out` is given, otherwise to `self.dtype`.
    out : array, optional
        Existing array that receives the result. It is first filled with `initial`
        and then handed to `fix_dtypes_and_determine_return`.
    keepdims : bool, optional
        If truthy, the result keeps the same ndim as `self`.
    initial : scalar, optional
        Value the product is multiplied by. Defaults to 1.
    where : bool or array, default: True
        Elements to include. With an array mask, excluded elements are replaced
        by 1 before reducing. With a falsy value and no `out`, the result is
        `initial` (or an array filled with it).

    Returns
    -------
    array or scalar
        A scalar when every dimension is reduced and `keepdims` is falsy,
        otherwise an array.

    Raises
    ------
    numpy.AxisError
        If the axis is out of bounds for the array's ndim.
    ValueError
        If `keepdims` is truthy, all elements are reduced (1D input or no axis)
        and `out` does not have length-1 axes.
    """
    out_dtype = (
        dtype
        if dtype is not None
        else (out.dtype if out is not None else self.dtype)
    )
    initial = 1 if initial is None else initial
    check_kwargs(keepdims=keepdims, where=where)
    apply_axis = self._validate_axis(axis)
    truthy_where = bool(where)
    if self._ndim == 1:
        if apply_axis == 1:
            raise numpy.AxisError(1, 1)
        target = where.where(self, 1) if isinstance(where, array) else self
        result = target._query_compiler.astype(
            {col_name: out_dtype for col_name in target._query_compiler.columns}
        ).prod(axis=0, skipna=False)
        result = result.mul(initial)
        if keepdims:
            # Validate `out` before writing to it, so that a rejected call leaves
            # the caller's array untouched.
            if out is not None and out.shape != (1,):
                raise ValueError(
                    f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            if out is not None:
                out._update_inplace(
                    (numpy.ones_like(out) * initial)
                    .astype(out_dtype)
                    ._query_compiler
                )
            if truthy_where or out is not None:
                return fix_dtypes_and_determine_return(
                    result, 1, dtype, out, truthy_where
                )
            return array([initial], dtype=out_dtype)
        return result.to_numpy()[0, 0] if truthy_where else initial
    if apply_axis is None:
        result = self
        if isinstance(where, array):
            result = where.where(self, 1)
        # Reduce over both axes in turn to obtain a single value.
        result = (
            result.astype(out_dtype)
            ._query_compiler.prod(axis=1, skipna=False)
            .prod(axis=0, skipna=False)
            .to_numpy()[0, 0]
        )
        result *= initial
        if keepdims:
            if out is not None and out.shape != (1, 1):
                raise ValueError(
                    f"operand was set up as a reduction along axis 1, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            if out is not None:
                out._update_inplace(
                    (numpy.ones_like(out) * initial)
                    .astype(out_dtype)
                    ._query_compiler
                )
            if truthy_where or out is not None:
                return fix_dtypes_and_determine_return(
                    array(numpy.array([[result]]))
                    .astype(out_dtype)
                    ._query_compiler,
                    2,
                    dtype,
                    out,
                    truthy_where,
                )
            return array([[initial]], dtype=out_dtype)
        return result if truthy_where else initial
    if apply_axis > 1:
        raise numpy.AxisError(axis, 2)
    target = where.where(self, 1) if isinstance(where, array) else self
    result = target._query_compiler.astype(
        {col_name: out_dtype for col_name in target._query_compiler.columns}
    ).prod(axis=apply_axis, skipna=False)
    result = result.mul(initial)
    new_ndim = self._ndim - 1 if not keepdims else self._ndim
    if new_ndim == 0:
        return result.to_numpy()[0, 0] if truthy_where else initial
    if not keepdims and apply_axis != 1:
        result = result.transpose()
    # `initial` can no longer be None here, so only `out` needs checking.
    if out is not None:
        out._update_inplace(
            (numpy.ones_like(out) * initial).astype(out_dtype)._query_compiler
        )
    if truthy_where or out is not None:
        return fix_dtypes_and_determine_return(
            result, new_ndim, dtype, out, truthy_where
        )
    return (
        numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim)) * initial
    )
def remainder(
    self,
    x2,
    out=None,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise remainder ``self % x2``.

    Parameters
    ----------
    x2 : array or scalar
        The divisor.
    out : array, optional
        Forwarded to `_preprocess_binary_op` and `fix_dtypes_and_determine_return`;
        its dtype is used as the output dtype when `dtype` is not given.
    where, casting, order, subok
        Validated through `check_kwargs`; `where` is also forwarded to
        `fix_dtypes_and_determine_return`.
    dtype : numpy type, optional
        Desired output dtype; takes priority over `out.dtype` and the operand dtype.

    Returns
    -------
    array
        Whatever `fix_dtypes_and_determine_return` produces for the remainder.

    Raises
    ------
    NotImplementedError
        If `_preprocess_binary_op` swapped the operands (1D `self`, 2D `x2`).
    """
    # Output dtype priority: dtype > out.dtype > operand dtype.
    operand_dtype = (
        self.dtype
        if not isinstance(x2, array)
        else pandas.core.dtypes.cast.find_common_type([self.dtype, x2.dtype])
    )
    out_dtype = (
        dtype
        if dtype is not None
        else (out.dtype if out is not None else operand_dtype)
    )
    check_kwargs(order=order, subok=subok, casting=casting, where=where)
    if is_scalar(x2):
        # The scalar path casts `self` itself, since `_preprocess_binary_op`
        # is not involved here.
        result = self._query_compiler.astype(
            {col_name: out_dtype for col_name in self._query_compiler.columns}
        ).mod(x2)
        if x2 == 0 and numpy.issubdtype(out_dtype, numpy.integer):
            # NumPy's remainder by 0 works differently from pandas', so we need to fix
            # the output.
            result = result.replace(numpy.nan, 0)
        return fix_dtypes_and_determine_return(
            result, self._ndim, dtype, out, where
        )
    caller, callee, new_ndim, kwargs = self._preprocess_binary_op(
        x2, dtype=dtype, out=out
    )
    if caller != self._query_compiler:
        # Modin does not correctly support broadcasting when the caller of the function is
        # a Series (1D), and the operand is a Dataframe (2D). We cannot workaround this using
        # commutativity, and `rmod` also works incorrectly. GH#5529
        raise NotImplementedError(
            "Using remainder with broadcast is not currently available in Modin."
        )
    result = caller.mod(callee, **kwargs)
    # NOTE(review): `callee.eq(0).any()` is a query compiler used in a boolean
    # context; whether it evaluates to "any zero present" depends on the query
    # compiler's truthiness -- confirm.
    if callee.eq(0).any() and numpy.issubdtype(out_dtype, numpy.integer):
        # NumPy's remainder by 0 works differently from pandas', so we need to fix
        # the output.
        result = result.replace(numpy.nan, 0)
    return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where)
result = caller.rsub(callee, **kwargs) else: result = caller.sub(callee, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) __sub__ = subtract def __rsub__( self, x2, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True, ): check_kwargs(order=order, subok=subok, casting=casting, where=where) caller, callee, new_ndim, kwargs = self._preprocess_binary_op( x2, dtype=dtype, out=out ) if caller != self._query_compiler: # In this case, we are doing an operation that looks like this 1D_object - 2D_object. # For Modin to broadcast directly, we have to swap it so that the operation is actually # 2D_object.sub(1D_object). result = caller.sub(callee, **kwargs) else: result = caller.rsub(callee, **kwargs) return fix_dtypes_and_determine_return(result, new_ndim, dtype, out, where) def sum( self, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=True ): out_dtype = ( dtype if dtype is not None else (out.dtype if out is not None else self.dtype) ) initial = 0 if initial is None else initial check_kwargs(keepdims=keepdims, where=where) apply_axis = self._validate_axis(axis) truthy_where = bool(where) if self._ndim == 1: if apply_axis == 1: raise numpy.AxisError(1, 1) target = where.where(self, 0) if isinstance(where, array) else self if target.dtype != out_dtype: target = target.astype(out_dtype) result = target._query_compiler.sum(axis=0, skipna=False) if initial != 0: result = result.add(initial) if keepdims: if out is not None: out._update_inplace( ( numpy.ones_like(out, dtype=out_dtype) * initial )._query_compiler ) if out is not None and out.shape != (1,): raise ValueError( f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)" ) if truthy_where or out is not None: return fix_dtypes_and_determine_return( result, 1, dtype, out, truthy_where ) else: return array([initial], dtype=out_dtype) return result.to_numpy()[0, 0] if truthy_where else 
def all(self, axis=None, out=None, keepdims=None, *, where=True):
    """
    Test whether all (selected) elements evaluate to True.

    Parameters
    ----------
    axis : int, optional
        Axis to reduce along; ``None`` reduces over the whole array.
    out : array, optional
        Destination for the result; its shape is validated when `keepdims` is set.
    keepdims : bool, optional
        Keep the reduced axes as length-one dimensions.
    where : bool or array, default: True
        Elements to include; excluded elements are treated as True.

    Returns
    -------
    bool or array
        A scalar for full reductions without `keepdims`, an array otherwise.

    Raises
    ------
    AxisError
        If `axis` is out of bounds for this array.
    ValueError
        If `out` does not have the shape required by a `keepdims` reduction.
    """
    check_kwargs(keepdims=keepdims, where=where)
    truthy_where = bool(where)
    apply_axis = self._validate_axis(axis)
    # Masked-out positions become True so they cannot turn the result False.
    target = where.where(self, True) if isinstance(where, array) else self
    if self._ndim == 1:
        if apply_axis == 1:
            # NumPy 2.x only exposes AxisError under ``numpy.exceptions``.
            raise getattr(numpy, "exceptions", numpy).AxisError(1, 1)
        result = target._query_compiler.all(axis=0)
        if keepdims:
            if out is not None and out.shape != (1,):
                raise ValueError(
                    f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            if truthy_where or out is not None:
                return fix_dtypes_and_determine_return(
                    result, 1, bool, out, truthy_where
                )
            return array([True], dtype=bool)
        return result.to_numpy()[0, 0] if truthy_where else True
    if apply_axis is None:
        result = target._query_compiler.all(axis=1).all(axis=0)
        if keepdims:
            if out is not None and out.shape != (1, 1):
                raise ValueError(
                    f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            if truthy_where or out is not None:
                # Materialise the reduced value first: wrapping the query
                # compiler object itself in ``numpy.array(..., dtype=bool)``
                # would not yield the reduction result.
                reduced = result.to_numpy()[0, 0]
                return fix_dtypes_and_determine_return(
                    array(numpy.array([[reduced]], dtype=bool))._query_compiler,
                    2,
                    bool,
                    out,
                    truthy_where,
                )
            return array([[True]], dtype=bool)
        return result.to_numpy()[0, 0] if truthy_where else True
    if apply_axis > 1:
        raise getattr(numpy, "exceptions", numpy).AxisError(axis, 2)
    result = target._query_compiler.all(axis=apply_axis)
    new_ndim = self._ndim - 1 if not keepdims else self._ndim
    if new_ndim == 0:
        result = result.to_numpy()[0, 0]
        return result if truthy_where else True
    if not keepdims and apply_axis != 1:
        result = result.transpose()
    if truthy_where or out is not None:
        return fix_dtypes_and_determine_return(
            result, new_ndim, bool, out, truthy_where
        )
    return numpy.ones_like(array(_query_compiler=result, _ndim=new_ndim))
def _arg_extremum(self, axis, out, keepdims, idx_method, reduce_method):
    """
    Shared implementation of ``argmax`` and ``argmin``.

    Parameters
    ----------
    axis : int, optional
        Axis to reduce along; ``None`` searches the whole array.
    out : array, optional
        Destination for the result.
    keepdims : bool, optional
        Keep the reduced axes as length-one dimensions.
    idx_method : {"idxmax", "idxmin"}
        Query compiler method that locates the extremum.
    reduce_method : {"max", "min"}
        Query compiler method that computes the per-row extremum.

    Returns
    -------
    int or array
        Position(s) of the extremum.

    Raises
    ------
    AxisError
        If `axis` is out of bounds for this array.
    ValueError
        If `out` does not have the shape required by a `keepdims` reduction.
    """
    check_kwargs(keepdims=keepdims)
    apply_axis = self._validate_axis(axis)
    qc = self._query_compiler
    if self._ndim == 1:
        if apply_axis == 1:
            # NumPy 2.x only exposes AxisError under ``numpy.exceptions``.
            raise getattr(numpy, "exceptions", numpy).AxisError(1, 1)
        if qc.isna().any(axis=1).any(axis=0).to_numpy()[0, 0]:
            # numpy treats nan as the extremum when present, so idxmax on the
            # NaN mask is used to locate the first nan.
            result = qc.isna().any(axis=1).idxmax()
        else:
            result = getattr(qc, idx_method)(axis=0)
        if keepdims:
            if out is not None and out.shape != (1,):
                raise ValueError(
                    f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            return fix_dtypes_and_determine_return(result, 1, numpy.int64, out, True)
        return result.to_numpy()[0, 0]
    if apply_axis is None:
        if qc.isna().any(axis=1).any(axis=0).to_numpy()[0, 0]:
            na_row_map = qc.isna().any(axis=1)
            na_row = qc.getitem_array(na_row_map)
            col_idx = na_row.to_numpy().argmax()
            best_row = na_row_map.idxmax().to_numpy().flatten()
        else:
            inner_idxs = getattr(qc, idx_method)(axis=1)
            row_extrema = getattr(qc, reduce_method)(axis=1)
            best_row = getattr(row_extrema, idx_method)(axis=0).to_numpy().flatten()
            col_idx = inner_idxs.take_2d_positional(best_row, [0]).to_numpy()[0, 0]
        # Convert the (row, column) pair into a flat row-major position.
        result = (self.shape[1] * best_row[0]) + col_idx
        if keepdims:
            if out is not None and out.shape != (1, 1):
                raise ValueError(
                    f"operand was set up as a reduction along axis 0, but the length of the axis is {out.shape[0]} (it has to be 1)"
                )
            # The position must be kept as an integer; building this array
            # with dtype=bool collapsed every non-zero index to 1.
            return fix_dtypes_and_determine_return(
                array(numpy.array([[result]], dtype=numpy.int64))._query_compiler,
                2,
                numpy.int64,
                out,
                True,
            )
        return result
    if apply_axis > 1:
        raise getattr(numpy, "exceptions", numpy).AxisError(axis, 2)
    result = getattr(qc, idx_method)(axis=apply_axis)
    na_mask = qc.isna().any(axis=apply_axis)
    if na_mask.any(axis=apply_axis ^ 1).to_numpy()[0, 0]:
        # Where a lane contains nan, report the position of its first nan.
        na_idxs = qc.isna().idxmax(axis=apply_axis)
        result = na_idxs.where(na_mask, result)
    new_ndim = self._ndim - 1 if not keepdims else self._ndim
    if new_ndim == 0:
        return result.to_numpy()[0, 0]
    if not keepdims and apply_axis != 1:
        result = result.transpose()
    return fix_dtypes_and_determine_return(result, new_ndim, numpy.int64, out, True)

def argmax(self, axis=None, out=None, keepdims=None):
    """
    Return the position(s) of the maximum value along `axis`.

    Parameters
    ----------
    axis : int, optional
        Axis to reduce along; ``None`` searches the whole array.
    out : array, optional
        Destination for the result.
    keepdims : bool, optional
        Keep the reduced axes as length-one dimensions.

    Returns
    -------
    int or array
        Position(s) of the maximum; the first nan wins when one is present.
    """
    return self._arg_extremum(axis, out, keepdims, "idxmax", "max")

def argmin(self, axis=None, out=None, keepdims=None):
    """
    Return the position(s) of the minimum value along `axis`.

    Parameters
    ----------
    axis : int, optional
        Axis to reduce along; ``None`` searches the whole array.
    out : array, optional
        Destination for the result.
    keepdims : bool, optional
        Keep the reduced axes as length-one dimensions.

    Returns
    -------
    int or array
        Position(s) of the minimum; the first nan wins when one is present.
    """
    return self._arg_extremum(axis, out, keepdims, "idxmin", "min")
def _logical_not(
    self,
    /,
    out=None,
    *,
    where=True,
    casting="same_kind",
    order="K",
    dtype=None,
    subok=True,
):
    """
    Compute the element-wise logical NOT of this array.

    Parameters
    ----------
    out : array, optional
        Destination forwarded to ``fix_dtypes_and_determine_return``.
    where : bool or array, default: True
        Mask forwarded to ``fix_dtypes_and_determine_return``.
    casting, order, subok
        Only validated through ``check_kwargs``.
    dtype : dtype, optional
        Requested result dtype.

    Returns
    -------
    object
        Whatever ``fix_dtypes_and_determine_return`` produces for the result.
    """
    check_kwargs(where=where, casting=casting, order=order, subok=subok)
    return fix_dtypes_and_determine_return(
        self._query_compiler._logical_not(), self._ndim, dtype, out, where
    )
def flatten(self, order="C"):
    """
    Collapse this array into a 1-D array, reading it row by row.

    Parameters
    ----------
    order : str, default: "C"
        Only validated through ``check_kwargs``; rows are always laid out
        one after another.

    Returns
    -------
    array
        A new 1-D array holding every element of this array.
    """
    check_kwargs(order=order)
    source = self._query_compiler
    labels = source.index

    def row_as_frame(label):
        # Single-row query compiler with a fresh positional index.
        return source.getitem_row_array([label]).reset_index(drop=True)

    trailing_rows = [row_as_frame(label) for label in labels[1:]]
    # Place all rows side by side to form one long row, relabel its columns
    # positionally, then flip it into a single column.
    flat = row_as_frame(labels[0]).concat(1, trailing_rows, ignore_index=True)
    flat.columns = range(len(flat.columns))
    return array(_query_compiler=flat.transpose(), _ndim=1)
@property
def dtype(self):
    """
    Return the dtype of this array.

    Returns
    -------
    dtype
        For a 1-D array, the dtype of its single column; otherwise the common
        dtype of all columns.
    """
    column_dtypes = self._query_compiler.dtypes
    if self._ndim == 1:
        # Select by position: ``column_dtypes[0]`` is a label lookup and breaks
        # (or relies on a deprecated fallback) when the column is not labelled 0.
        return column_dtypes.iloc[0]
    return pandas.core.dtypes.cast.find_common_type(list(column_dtypes.values))
) check_kwargs(order=order, subok=subok) result = self._query_compiler.astype( {col_name: dtype for col_name in self._query_compiler.columns} ) if not copy and subok and numpy.issubdtype(self.dtype, dtype): return self return array(_query_compiler=result, _ndim=self._ndim) def _build_repr_array(self): def _generate_indices_for_axis( axis_size, num_elements=numpy.get_printoptions()["edgeitems"] ): if axis_size > num_elements * 2: return list(range(num_elements + 1)) + list( range(axis_size - num_elements, axis_size) ) return list(range(axis_size)) # We want to rely on NumPy for creating a string representation of this array; however # we also don't want to materialize all of the data to the head node. Instead, we will # materialize enough data that NumPy can build the summarized representation of the array # (while changing with the NumPy print options so it will format this smaller array as # abridged) and return this smaller array. In the worst case, this array will have # (2*numpy.get_printoptions()["edgeitems"] + 1)^2 items, so 49 items max for the default # value of 3. if self._ndim == 1 or self.shape[1] == 0: idxs = _generate_indices_for_axis(len(self)) arr = self._query_compiler.getitem_row_array(idxs).to_numpy() if self._ndim == 1: arr = arr.flatten() elif self.shape[0] == 1: idxs = _generate_indices_for_axis(self.shape[1]) arr = self._query_compiler.getitem_column_array(idxs).to_numpy() else: row_idxs = _generate_indices_for_axis(len(self)) col_idxs = _generate_indices_for_axis(self.shape[1]) arr = self._query_compiler.take_2d_positional(row_idxs, col_idxs).to_numpy() return arr def __repr__(self): # If we are dealing with a small array, we can just collate all the data on the # head node and let numpy handle the logic to get a string representation. 
if self.size <= numpy.get_printoptions()["threshold"]: return repr(self._to_numpy()) arr = self._build_repr_array() prev_threshold = numpy.get_printoptions()["threshold"] numpy.set_printoptions(threshold=arr.size - 1) try: repr_str = repr(arr) finally: numpy.set_printoptions(threshold=prev_threshold) return repr_str def _to_numpy(self): arr = self._query_compiler.to_numpy() if self._ndim == 1: arr = arr.flatten() return arr ================================================ FILE: modin/numpy/array_creation.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses array creation methods for Modin's NumPy API.""" import numpy from modin.error_message import ErrorMessage from .arr import array def _create_array(dtype, shape, order, subok, numpy_method): if order not in ["K", "C"]: ErrorMessage.single_warning( "Array order besides 'C' is not currently supported in Modin. Defaulting to 'C' order." ) if not subok: ErrorMessage.single_warning( "Subclassing types is not currently supported in Modin. Defaulting to the same base dtype." 
def zeros_like(a, dtype=None, order="K", subok=True, shape=None):
    """
    Return an array of zeros matching the shape and dtype of `a`.

    Parameters
    ----------
    a : array or object
        Template. Anything that is not a Modin array is reported through
        ``ErrorMessage.bad_type_for_numpy_op`` and handed to ``numpy.zeros_like``.
    dtype : dtype, optional
        Overrides the dtype of `a`.
    order : str, default: "K"
        Passed through to the creation helper / NumPy.
    subok : bool, default: True
        Passed through to the creation helper / NumPy.
    shape : int or tuple of int, optional
        Overrides the shape of `a`.

    Returns
    -------
    array or numpy.ndarray
        A Modin array for Modin input, NumPy's result otherwise.
    """
    if isinstance(a, array):
        target_dtype = a.dtype if dtype is None else dtype
        target_shape = a.shape if shape is None else shape
        return _create_array(target_dtype, target_shape, order, subok, "zeros")
    ErrorMessage.bad_type_for_numpy_op("zeros_like", type(a))
    return numpy.zeros_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
def transpose(a, axes=None):
    """
    Reverse the axes of `a`.

    Parameters
    ----------
    a : array or object
        Input. It is first passed through ``try_convert_from_interoperable_type``;
        anything that is still not a Modin array is handed to ``numpy.transpose``.
    axes : tuple or list of int, optional
        Axis permutation. Only ``None`` is supported for Modin arrays.

    Returns
    -------
    array or numpy.ndarray
        The transposed input.

    Raises
    ------
    NotImplementedError
        If `axes` is given for a Modin array.
    """
    a = try_convert_from_interoperable_type(a)
    if not isinstance(a, array):
        ErrorMessage.bad_type_for_numpy_op("transpose", type(a))
        return numpy.transpose(a, axes=axes)
    if axes is not None:
        # The parameter is called `axes`; the message used to refer to a
        # non-existent `axis` argument.
        raise NotImplementedError(
            "Modin does not support arrays higher than 2-dimensions. Please use `transpose` with `axes=None` on a 2-dimensional or lower object."
        )
    return a.transpose()
import numpy from numpy import e, euler_gamma, inf, nan, newaxis, pi from packaging import version if version.parse(numpy.__version__) < version.parse("2.0.0b1"): from numpy import NAN, NINF, NZERO, PINF, PZERO, Inf, Infinity, NaN, infty __all__ = [ "e", "euler_gamma", "inf", "nan", "newaxis", "pi", ] if version.parse(numpy.__version__) < version.parse("2.0.0b1"): __all__ += [ "Inf", "Infinity", "NAN", "NINF", "NZERO", "NaN", "PINF", "PZERO", "infty", ] ================================================ FILE: modin/numpy/indexing.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # noqa: MD02 """ Details about how Indexing Helper Class works. _LocationIndexerBase provide methods framework for __getitem__ and __setitem__ that work with Modin NumPy Array's internal index. Base class's __{get,set}item__ takes in partitions & idx_in_partition data and perform lookup/item write. _iLocIndexer is responsible for indexer specific logic and lookup computation. Loc will take care of enlarge DataFrame. Both indexer will take care of translating pandas's lookup to Modin DataFrame's internal lookup. 
def broadcast_item(
    obj,
    row_lookup,
    col_lookup,
    item,
    need_columns_reindex=True,
):
    """
    Use NumPy to broadcast or reshape item with reindexing.

    Parameters
    ----------
    obj : object
        Object whose ``_query_compiler`` provides the ``index`` and ``columns``
        axes being assigned into.
    row_lookup : slice or sized collection
        The global row positions to locate inside of `obj`.
    col_lookup : slice or sized collection
        The global column positions to locate inside of `obj`.
    item : array or array-like
        Value that should be broadcast to the shape selected by the lookups.
        A Modin array passed here is left unmodified.
    need_columns_reindex : bool, default: True
        Whether a 2-D Modin array `item` should also be reindexed along the
        columns axis.

    Returns
    -------
    np.ndarray
        `item` after it was reshaped or broadcast to the target shape.

    Raises
    ------
    ValueError
        If `item` cannot be broadcast from its own shape to the target shape.
    """
    new_row_len = (
        len(obj._query_compiler.index[row_lookup])
        if isinstance(row_lookup, slice)
        else len(row_lookup)
    )
    new_col_len = (
        len(obj._query_compiler.columns[col_lookup])
        if isinstance(col_lookup, slice)
        else len(col_lookup)
    )
    to_shape = new_row_len, new_col_len

    if isinstance(item, array):
        # Reindex a local query compiler rather than writing back to
        # ``item._query_compiler``: assigning `item` somewhere must not change
        # the caller's array. Reindexing against new labels can add NaN values.
        item_qc = item._query_compiler
        index_values = obj._query_compiler.index[row_lookup]
        if not index_values.equals(item_qc.index):
            item_qc = item_qc.reindex(axis=0, labels=index_values, copy=None)
        if need_columns_reindex and item._ndim == 2:
            column_values = obj._query_compiler.columns[col_lookup]
            if not column_values.equals(item_qc.columns):
                item_qc = item_qc.reindex(axis=1, labels=column_values, copy=None)
        if item_qc is not item._query_compiler:
            item = array(_query_compiler=item_qc, _ndim=item._ndim)
    try:
        item = np.array(item) if not isinstance(item, array) else item._to_numpy()
        # Same element count: a plain reshape is enough; otherwise rely on
        # NumPy broadcasting rules.
        if np.prod(to_shape) == np.prod(item.shape):
            return item.reshape(to_shape)
        return np.broadcast_to(item, to_shape)
    except ValueError as err:
        from_shape = np.array(item).shape
        raise ValueError(
            f"could not broadcast input array from shape {from_shape} into shape "
            + f"{to_shape}"
        ) from err
def is_integer_slice(x):
    """
    Check that argument is a slice whose bounds are all integers or None.

    Parameters
    ----------
    x : object
        Object to check.

    Returns
    -------
    bool
        True if `x` is a slice and each of its ``start``, ``stop`` and ``step``
        is either None or an integer, False otherwise.
    """
    if not is_slice(x):
        return False
    return all(
        bound is None or is_integer(bound) for bound in (x.start, x.stop, x.step)
    )
""" row_scalar = is_scalar(row_loc) or is_tuple(row_loc) col_scalar = is_scalar(col_loc) or is_tuple(col_loc) if row_scalar and col_scalar: ndim = 0 elif row_scalar ^ col_scalar: ndim = 1 else: ndim = 2 return ndim class ArrayIndexer(object): """ An indexer for modin_arr.__{get|set}item__ functionality. Parameters ---------- array : modin.numpy.array Array to operate on. """ def __init__(self, array): self.arr = array def _get_numpy_object_from_qc_view( self, qc_view, row_scalar: bool, col_scalar: bool, ndim: int, ): """ Convert the query compiler view to the appropriate NumPy object. Parameters ---------- qc_view : BaseQueryCompiler Query compiler to convert. row_scalar : bool Whether indexer for rows is scalar. col_scalar : bool Whether indexer for columns is scalar. ndim : {0, 1, 2} Number of dimensions in dataset to be retrieved. Returns ------- modin.numpy.array The array object with the data from the query compiler view. Notes ----- Usage of `slice(None)` as a lookup is a hack to pass information about full-axis grab without computing actual indices that triggers lazy computations. Ideally, this API should get rid of using slices as indexers and either use a common ``Indexer`` object or range and ``np.ndarray`` only. """ if ndim == 2: return array(_query_compiler=qc_view, _ndim=self.arr._ndim) if self.arr._ndim == 1 and not row_scalar: return array(_query_compiler=qc_view, _ndim=1) if self.arr._ndim == 1: _ndim = 0 elif ndim == 0: _ndim = 0 else: # We are in the case where ndim == 1 # The axis we squeeze on depends on whether we are looking for an exact # value or a subset of rows and columns. Knowing if we have a full MultiIndex # lookup or scalar lookup can help us figure out whether we need to squeeze # on the row or column index. 
if row_scalar and col_scalar: _ndim = 0 elif not any([row_scalar, col_scalar]): _ndim = 2 else: _ndim = 1 if row_scalar: qc_view = qc_view.transpose() if _ndim == 0: return qc_view.to_numpy()[0, 0] res_arr = array(_query_compiler=qc_view, _ndim=_ndim) return res_arr def _parse_row_and_column_locators(self, tup): """ Unpack the user input for getitem and setitem and compute ndim. loc[a] -> ([a], :), 1D loc[[a,b]] -> ([a,b], :), loc[a,b] -> ([a], [b]), 0D Parameters ---------- tup : tuple User input to unpack. Returns ------- row_loc : scalar or list Row locator(s) as a scalar or List. col_list : scalar or list Column locator(s) as a scalar or List. ndim : {0, 1, 2} Number of dimensions of located dataset. """ row_loc, col_loc = slice(None), slice(None) if is_tuple(tup): row_loc = tup[0] if len(tup) == 2: col_loc = tup[1] if len(tup) > 2: raise IndexingError("Too many indexers") else: row_loc = tup row_loc = row_loc(self.arr) if callable(row_loc) else row_loc col_loc = col_loc(self.arr) if callable(col_loc) else col_loc row_loc = row_loc._to_numpy() if isinstance(row_loc, array) else row_loc col_loc = col_loc._to_numpy() if isinstance(col_loc, array) else col_loc return row_loc, col_loc, _compute_ndim(row_loc, col_loc) def __getitem__(self, key): """ Retrieve dataset according to `key`. Parameters ---------- key : callable or tuple The global row numbers to retrieve data from. Returns ------- DataFrame or Series Located dataset. 
See Also -------- pandas.DataFrame.iloc """ row_loc, col_loc, ndim = self._parse_row_and_column_locators(key) row_scalar = is_scalar(row_loc) col_scalar = is_scalar(col_loc) self._check_dtypes(row_loc) self._check_dtypes(col_loc) row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) if isinstance(row_lookup, slice): ErrorMessage.catch_bugs_and_request_email( failure_condition=row_lookup != slice(None), extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}", ) row_lookup = None if isinstance(col_lookup, slice): ErrorMessage.catch_bugs_and_request_email( failure_condition=col_lookup != slice(None), extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}", ) col_lookup = None qc_view = self.arr._query_compiler.take_2d_positional(row_lookup, col_lookup) result = self._get_numpy_object_from_qc_view( qc_view, row_scalar=row_scalar, col_scalar=col_scalar, ndim=ndim, ) return result def _determine_setitem_axis(self, row_lookup, col_lookup, row_scalar, col_scalar): """ Determine an axis along which we should do an assignment. Parameters ---------- row_lookup : slice or list Indexer for rows. col_lookup : slice or list Indexer for columns. row_scalar : bool Whether indexer for rows is scalar or not. col_scalar : bool Whether indexer for columns is scalar or not. Returns ------- int or None None if this will be a both axis assignment, number of axis to assign in other cases. 
Notes ----- axis = 0: column assignment df[col] = item axis = 1: row assignment df.loc[row] = item axis = None: assignment along both axes """ if self.arr.shape == (1, 1): return None if not (row_scalar ^ col_scalar) else 1 if row_scalar else 0 def get_axis(axis): return ( self.arr._query_compiler.index if axis == 0 else self.arr._query_compiler.columns ) row_lookup_len, col_lookup_len = [ ( len(lookup) if not isinstance(lookup, slice) else compute_sliced_len(lookup, len(get_axis(i))) ) for i, lookup in enumerate([row_lookup, col_lookup]) ] if col_lookup_len == 1 and row_lookup_len == 1: axis = None elif ( row_lookup_len == len(self.arr._query_compiler.index) and col_lookup_len == 1 and self.arr._ndim == 2 ): axis = 0 elif ( col_lookup_len == len(self.arr._query_compiler.columns) and row_lookup_len == 1 ): axis = 1 else: axis = None return axis def _setitem_positional(self, row_lookup, col_lookup, item, axis=None): """ Assign `item` value to located dataset. Parameters ---------- row_lookup : slice or scalar The global row index to write item to. col_lookup : slice or scalar The global col index to write item to. item : DataFrame, Series or scalar The new item needs to be set. It can be any shape that's broadcast-able to the product of the lookup tables. axis : {None, 0, 1}, default: None If not None, it means that whole axis is used to assign a value. 0 means assign to whole column, 1 means assign to whole row. If None, it means that partial assignment is done on both axes. """ # Convert slices to indices for the purposes of application. 
    def _compute_lookup(self, row_loc, col_loc):
        """
        Compute positional row and column lookups from integer/boolean locators.

        Parameters
        ----------
        row_loc : slice, list, array or tuple
            Row locator.
        col_loc : slice, list, array or tuple
            Columns locator.

        Returns
        -------
        row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
            Positions to take along the row axis.
        col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
            Positions to take along the column axis.

        Notes
        -----
        Usage of `slice(None)` as a resulting lookup is a hack to pass information about
        full-axis grab without computing actual indices that triggers lazy computations.
        Ideally, this API should get rid of using slices as indexers and either use a
        common ``Indexer`` object or range and ``np.ndarray`` only.
        """
        lookups = []
        for axis, axis_loc in enumerate((row_loc, col_loc)):
            # A scalar locator is handled as a one-element array.
            if is_scalar(axis_loc):
                axis_loc = np.array([axis_loc])
            if isinstance(axis_loc, slice):
                # A full slice is passed through untouched; any other slice is
                # resolved against the axis length into a RangeIndex.
                axis_lookup = (
                    axis_loc
                    if axis_loc == slice(None)
                    else pandas.RangeIndex(
                        *axis_loc.indices(len(self.arr._query_compiler.get_axis(axis)))
                    )
                )
            elif is_range_like(axis_loc):
                axis_lookup = pandas.RangeIndex(
                    axis_loc.start, axis_loc.stop, axis_loc.step
                )
            elif is_boolean_array(axis_loc):
                axis_lookup = boolean_mask_to_numeric(axis_loc)
            else:
                if isinstance(axis_loc, pandas.Index):
                    axis_loc = axis_loc.values
                elif is_list_like(axis_loc) and not isinstance(axis_loc, np.ndarray):
                    # `Index.__getitem__` works much faster with numpy arrays than with python lists,
                    # so although we spend some time here on converting to numpy, the
                    # `Index.__getitem__` speedup more than makes up for that cost.
                    axis_loc = np.array(axis_loc, dtype=np.int64)
                # Relatively fast check allows us to not trigger `get_axis()` computation
                # if there are no negative indices, since then the positions do not depend
                # on the axis length.
                if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any():
                    axis_lookup = axis_loc
                else:
                    # Let RangeIndex resolve the locator against the axis length.
                    axis_lookup = pandas.RangeIndex(
                        len(self.arr._query_compiler.get_axis(axis))
                    )[axis_loc]

            # Non-range pandas indexes are unwrapped to their underlying values.
            if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
                axis_lookup = axis_lookup.values
            lookups.append(axis_lookup)
        return lookups
""" is_int = is_integer(locator) is_int_slice = is_integer_slice(locator) is_int_arr = is_integer_array(locator) is_bool_arr = is_boolean_array(locator) if not any([is_int, is_int_slice, is_int_arr, is_bool_arr]): raise ValueError(_ILOC_INT_ONLY_ERROR) ================================================ FILE: modin/numpy/linalg.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy from modin.error_message import ErrorMessage from .arr import array from .utils import try_convert_from_interoperable_type def norm(x, ord=None, axis=None, keepdims=False): x = try_convert_from_interoperable_type(x) if not isinstance(x, array): ErrorMessage.bad_type_for_numpy_op("linalg.norm", type(x)) return numpy.linalg.norm(x, ord=ord, axis=axis, keepdims=keepdims) return x._norm(ord=ord, axis=axis, keepdims=keepdims) ================================================ FILE: modin/numpy/logic.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
def isscalar(e):
    """
    Tell whether `e` is a scalar.

    Parameters
    ----------
    e : object
        Object to check.

    Returns
    -------
    bool
        Always False for a Modin array; otherwise whatever ``numpy.isscalar``
        reports for `e`.
    """
    return False if isinstance(e, array) else numpy.isscalar(e)
def _dispatch_math(operator_name, arr_method_name=None):
    """
    Build a module-level function that dispatches a NumPy math operation.

    Parameters
    ----------
    operator_name : str
        Name of the function in the NumPy API.
    arr_method_name : str, optional
        Name of the method on the modin.numpy.array object; assumed to be
        `operator_name` when not given.

    Returns
    -------
    callable
        Function carrying the docstring of ``numpy.<operator_name>``.
    """

    def call(x, *args, **kwargs):
        x = try_convert_from_interoperable_type(x)
        if isinstance(x, array):
            bound_method = getattr(x, arr_method_name or operator_name)
            return bound_method(*args, **kwargs)
        # Not a Modin array: report the type and use NumPy's implementation.
        ErrorMessage.bad_type_for_numpy_op(operator_name, type(x))
        numpy_impl = getattr(numpy, operator_name)
        return numpy_impl(x, *args, **kwargs)

    return _inherit_docstrings(getattr(numpy, operator_name))(call)
def var(x1, axis=None, dtype=None, out=None, keepdims=None, *, where=True):
    """
    Compute the variance, dispatching on the type of `x1`.

    Parameters
    ----------
    x1 : object
        Input data. Interoperable Modin objects are first converted with
        ``try_convert_from_interoperable_type``.
    axis, dtype, out, keepdims, where : optional
        Forwarded unchanged, as keywords, to the underlying implementation.

    Returns
    -------
    object
        Result of ``x1.var`` when `x1` is a Modin array, otherwise the result
        of ``numpy.var``.
    """
    x1 = try_convert_from_interoperable_type(x1)
    options = {
        "axis": axis,
        "out": out,
        "keepdims": keepdims,
        "where": where,
        "dtype": dtype,
    }
    if isinstance(x1, array):
        return x1.var(**options)
    # Not a Modin array: report the type and use NumPy's implementation.
    ErrorMessage.bad_type_for_numpy_op("var", type(x1))
    return numpy.var(x1, **options)
def argmax(a, axis=None, out=None, *, keepdims=None):
    """
    Return indices of the maximum values, dispatching on the type of `a`.

    Parameters
    ----------
    a : object
        Input data. Interoperable Modin objects are first converted with
        ``try_convert_from_interoperable_type``.
    axis, out, keepdims : optional
        Forwarded unchanged, as keywords, to the underlying implementation.

    Returns
    -------
    object
        Result of ``a.argmax`` when `a` is a Modin array, otherwise the result
        of ``numpy.argmax``.
    """
    a = try_convert_from_interoperable_type(a)
    options = {"axis": axis, "out": out, "keepdims": keepdims}
    if isinstance(a, array):
        return a.argmax(**options)
    # Not a Modin array: report the type and use NumPy's implementation.
    ErrorMessage.bad_type_for_numpy_op("argmax", type(a))
    return numpy.argmax(a, **options)
def tanh(
    x, out=None, where=True, casting="same_kind", order="K", dtype=None, subok=True
):
    """
    Compute the hyperbolic tangent, dispatching on the type of `x`.

    Parameters
    ----------
    x : object
        Input data. Interoperable Modin objects are first converted with
        ``try_convert_from_interoperable_type``.
    out, where, casting, order, dtype, subok : optional
        Forwarded unchanged, as keywords, to the underlying implementation.

    Returns
    -------
    object
        Result of ``x.tanh`` when `x` is a Modin array, otherwise the result
        of ``numpy.tanh``.
    """
    x = try_convert_from_interoperable_type(x)
    options = {
        "out": out,
        "where": where,
        "casting": casting,
        "order": order,
        "dtype": dtype,
        "subok": subok,
    }
    if isinstance(x, array):
        return x.tanh(**options)
    # Not a Modin array: report the type and use NumPy's implementation.
    ErrorMessage.bad_type_for_numpy_op("tanh", type(x))
    return numpy.tanh(x, **options)
"""Collection of array utility functions for internal use.""" import modin.numpy as np import modin.pandas as pd _INTEROPERABLE_TYPES = (pd.DataFrame, pd.Series) def try_convert_from_interoperable_type(obj, copy=False): if isinstance(obj, _INTEROPERABLE_TYPES): obj = np.array(obj, copy=copy) return obj ================================================ FILE: modin/pandas/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import warnings import pandas from packaging import version __min_pandas_version__ = "2.2" __max_pandas_version__ = "2.4" pandas_version = version.parse(pandas.__version__) if pandas_version < version.parse( __min_pandas_version__ ) or pandas_version >= version.parse(__max_pandas_version__): warnings.warn( f"The pandas version installed ({pandas.__version__}) is outside the supported range in Modin" + f" ({__min_pandas_version__} to {__max_pandas_version__}). This may cause undesired side effects!" 
) # to not pollute namespace del version, pandas_version, __min_pandas_version__, __max_pandas_version__ with warnings.catch_warnings(): warnings.simplefilter("ignore") import inspect from modin.core.storage_formats.pandas.query_compiler_caster import ( wrap_free_function_in_argument_caster, ) # To allow the extensions system to override these methods, we must wrap all objects re-exported # from pandas in a backend dispatcher. _reexport_list = ( "eval", "factorize", "test", "date_range", "period_range", "Index", "MultiIndex", "CategoricalIndex", "bdate_range", "DatetimeIndex", "Timedelta", "Timestamp", "set_eng_float_format", "options", "describe_option", "set_option", "get_option", "reset_option", "option_context", "NaT", "PeriodIndex", "Categorical", "Interval", "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", "SparseDtype", "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", "StringDtype", "BooleanDtype", "CategoricalDtype", "DatetimeTZDtype", "IntervalDtype", "PeriodDtype", "RangeIndex", "TimedeltaIndex", "IntervalIndex", "IndexSlice", "Grouper", "array", "Period", "DateOffset", "timedelta_range", "infer_freq", "interval_range", "ExcelWriter", "NamedAgg", "NA", "api", "ArrowDtype", "Flags", "Float32Dtype", "Float64Dtype", "from_dummies", "testing", ) for name in _reexport_list: item = getattr(pandas, name) if inspect.isfunction(item): # Note that this is applied to only functions, not classes. 
def _initialize_engine(engine_string: str):
    """
    Initialize the execution engine named by `engine_string` if not done yet.

    Parameters
    ----------
    engine_string : str
        Name of the engine. "Ray", "Dask" and "Unidist" trigger their
        respective initializer the first time they are requested; names listed
        in ``Engine.NOINIT_ENGINES`` need no initialization.

    Raises
    ------
    ImportError
        If `engine_string` is none of the above.

    Notes
    -----
    Sets the ``OMP_NUM_THREADS`` environment variable to "1" on every call and
    records the engine in the module-level ``_engine_initialized`` mapping.
    """
    # Only ``Engine`` is used by this function.
    from modin.config import Engine

    # Set this so that Pandas doesn't try to multithread by itself
    os.environ["OMP_NUM_THREADS"] = "1"

    if engine_string == "Ray":
        if not _engine_initialized.get("Ray", False):
            from modin.core.execution.ray.common import initialize_ray

            initialize_ray()
    elif engine_string == "Dask":
        if not _engine_initialized.get("Dask", False):
            from modin.core.execution.dask.common import initialize_dask

            initialize_dask()
    elif engine_string == "Unidist":
        if not _engine_initialized.get("Unidist", False):
            from modin.core.execution.unidist.common import initialize_unidist

            initialize_unidist()
    elif engine_string not in Engine.NOINIT_ENGINES:
        raise ImportError(f"Unrecognized execution engine: {engine_string}.")

    # Remember the engine so that later calls skip its initializer.
    _engine_initialized[engine_string] = True
import __version__ from .dataframe import DataFrame from .general import ( concat, crosstab, cut, get_dummies, isna, isnull, lreshape, melt, merge, merge_asof, merge_ordered, notna, notnull, pivot, pivot_table, qcut, to_datetime, to_numeric, to_timedelta, unique, value_counts, wide_to_long, ) from .io import ( ExcelFile, HDFStore, json_normalize, read_clipboard, read_csv, read_excel, read_feather, read_fwf, read_gbq, read_hdf, read_html, read_json, read_orc, read_parquet, read_pickle, read_sas, read_spss, read_sql, read_sql_query, read_sql_table, read_stata, read_table, read_xml, to_pickle, ) from .plotting import Plotting as plotting from .series import Series __getattr__ = __getattr___impl __all__ = [ # noqa: F405 "DataFrame", "Series", "read_csv", "read_parquet", "read_json", "read_html", "read_clipboard", "read_excel", "read_hdf", "read_feather", "read_stata", "read_sas", "read_pickle", "read_sql", "read_gbq", "read_table", "read_spss", "read_orc", "json_normalize", "concat", "eval", "cut", "factorize", "test", "qcut", "to_datetime", "get_dummies", "isna", "isnull", "merge", "pivot_table", "date_range", "Index", "MultiIndex", "Series", "bdate_range", "period_range", "DatetimeIndex", "to_timedelta", "set_eng_float_format", "options", "describe_option", "set_option", "get_option", "reset_option", "option_context", "CategoricalIndex", "Timedelta", "Timestamp", "NaT", "PeriodIndex", "Categorical", "__version__", "melt", "crosstab", "plotting", "Interval", "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", "SparseDtype", "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", "CategoricalDtype", "DatetimeTZDtype", "IntervalDtype", "PeriodDtype", "BooleanDtype", "StringDtype", "NA", "RangeIndex", "TimedeltaIndex", "IntervalIndex", "IndexSlice", "Grouper", "array", "Period", "show_versions", "DateOffset", "timedelta_range", "infer_freq", "interval_range", "ExcelWriter", "read_fwf", "read_sql_table", "read_sql_query", "ExcelFile", "to_pickle", "HDFStore", 
"lreshape", "wide_to_long", "merge_asof", "merge_ordered", "notnull", "notna", "pivot", "to_numeric", "unique", "value_counts", "NamedAgg", "api", "read_xml", "ArrowDtype", "Flags", "Float32Dtype", "Float64Dtype", "from_dummies", "errors", ] # Remove these attributes from this module's namespace. del pandas, Parameter, __getattr___impl ================================================ FILE: modin/pandas/accessor.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Implement various accessor classes for DataFrame and Series API. SparseFrameAccessor implements API of pandas.DataFrame.sparse accessor. SparseAccessor implements API of pandas.Series.sparse accessor. 
class BaseSparseAccessor(ClassLogger):
    """
    Base class for the sparse accessor classes of DataFrame and Series.

    The constructor stores the object and immediately validates it through
    ``_validate``, which child classes are expected to implement.

    Parameters
    ----------
    data : DataFrame or Series
        Object to operate on.
    """

    # Object the accessor was created for.
    _parent: Union[DataFrame, Series]
    # Message used by child classes when validation fails.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data: Union[DataFrame, Series] = None):
        self._parent = data
        # Dispatches to the child class implementation.
        self._validate(data)

    @classmethod
    def _validate(cls, data: Union[DataFrame, Series]):
        """
        Verify that `data` dtypes are compatible with `pandas.core.dtypes.dtypes.SparseDtype`.

        Parameters
        ----------
        data : DataFrame or Series
            Object to check.

        Raises
        ------
        NotImplementedError
            Function is implemented in child classes.
        """
        raise NotImplementedError

    def _default_to_pandas(self, op, *args, **kwargs):
        """
        Convert dataset to pandas type and call a pandas sparse.`op` on it.

        Parameters
        ----------
        op : callable
            Function that receives the ``.sparse`` accessor of the converted
            object as its first argument.
        *args : list
            Additional positional arguments to be passed in `op`.
        **kwargs : dict
            Additional keywords arguments to be passed in `op`.

        Returns
        -------
        object
            Result of operation.
        """
        # The parent performs the conversion; `op` is applied to the `.sparse`
        # accessor of whatever object the parent hands to the lambda.
        return self._parent._default_to_pandas(
            lambda parent: op(parent.sparse, *args, **kwargs)
        )
@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor)
class SparseAccessor(BaseSparseAccessor):
    @classmethod
    def _validate(cls, data: Series):
        """
        Verify that `data` dtype is compatible with `pandas.core.dtypes.dtypes.SparseDtype`.

        Parameters
        ----------
        data : Series
            Object to check.

        Raises
        ------
        AttributeError
            If check fails.
        """
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(cls._validation_msg)

    # The properties below hand the pandas accessor to the parent's
    # `_default_to_pandas` and read the attribute from its result.
    @property
    def density(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).density

    @property
    def fill_value(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).fill_value

    @property
    def npoints(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).npoints

    @property
    def sp_values(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).sp_values

    @classmethod
    def from_coo(cls, A, dense_index=False):
        # This is a classmethod, so there is no instance (and no `_parent`) to
        # default to pandas through: `_default_to_pandas` is an instance method
        # and calling it on `cls` would bind the pandas function as `self`.
        # Build the result with pandas and wrap it, mirroring
        # `SparseFrameAccessor.from_spmatrix`.
        ErrorMessage.default_to_pandas("`from_coo`")
        return pd.Series(pandas.Series.sparse.from_coo(A, dense_index=dense_index))

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        return self._default_to_pandas(
            pandas.Series.sparse.to_coo,
            row_levels=row_levels,
            column_levels=column_levels,
            sort_labels=sort_labels,
        )

    def to_dense(self):
        return self._default_to_pandas(pandas.Series.sparse.to_dense)
def to_pickle_glob(
    self,
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
) -> None:
    """
    Pickle (serialize) object to file.

    This experimental feature provides parallel writing into multiple pickle files which are
    defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used.

    Parameters
    ----------
    filepath_or_buffer : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.
        Compression mode may be any of the following possible
        values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If compression
        mode is 'infer' and `filepath_or_buffer` is path-like, then detect
        compression mode from the following extensions:
        '.gz', '.bz2', '.zip' or '.xz' (otherwise no compression).
        If dict given and mode is 'zip' or inferred as 'zip', other entries
        passed as additional compression options.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see `pickle docs
        <https://docs.python.org/3/library/pickle.html>`_ paragraph 12.1.2 for details).
        The possible values are 0, 1, 2, 3, 4, 5. A negative value for the
        protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed by
        fsspec, e.g., starting "s3://", "gcs://". An error will be raised if providing
        this argument with a non-fsspec URL. See the fsspec and backend storage
        implementation docs for the set of allowed keys and values.
    """
    # Imported locally; inside this function the name refers to the
    # module-level writer, not to this method.
    from modin.experimental.pandas.io import to_pickle_glob

    to_pickle_glob(
        self._data,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
def to_xml_glob(
    self,
    path_or_buffer=None,
    index=True,
    root_name="data",
    row_name="row",
    na_rep=None,
    attr_cols=None,
    elem_cols=None,
    namespaces=None,
    prefix=None,
    encoding="utf-8",
    xml_declaration=True,
    pretty_print=True,
    parser="lxml",
    stylesheet=None,
    compression="infer",
    storage_options: StorageOptions = None,
) -> None:  # noqa: PR01
    """
    Render a DataFrame to an XML document.

    Raises
    ------
    NotImplementedError
        If `path_or_buffer` is None.

    Notes
    -----
    * Only string type supported for `path_or_buffer` argument.
    * The rest of the arguments are the same as for `pandas.to_xml`.
    """
    # Imported locally; inside this function the name refers to the
    # module-level writer, not to this method.
    from modin.experimental.pandas.io import to_xml_glob

    # Returning the document as a string (no target path) is not offered here.
    if path_or_buffer is None:
        raise NotImplementedError(
            "`to_xml_glob` doesn't support path_or_buffer=None, use `to_xml` in that case."
        )

    # Every argument is forwarded unchanged, by keyword.
    to_xml_glob(
        self._data,
        path_or_buffer=path_or_buffer,
        index=index,
        root_name=root_name,
        row_name=row_name,
        na_rep=na_rep,
        attr_cols=attr_cols,
        elem_cols=elem_cols,
        namespaces=namespaces,
        prefix=prefix,
        encoding=encoding,
        xml_declaration=xml_declaration,
        pretty_print=pretty_print,
        parser=parser,
        stylesheet=stylesheet,
        compression=compression,
        storage_options=storage_options,
    )
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # Re-export all other pandas.api submodules from pandas.api import indexers, interchange, types, typing from modin.pandas.api import extensions __all__ = ["extensions", "interchange", "indexers", "types", "typing"] ================================================ FILE: modin/pandas/api/extensions/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
from .extensions import ( register_base_accessor, register_dataframe_accessor, register_dataframe_groupby_accessor, register_pd_accessor, register_series_accessor, register_series_groupby_accessor, ) __all__ = [ "register_base_accessor", "register_dataframe_accessor", "register_series_accessor", "register_pd_accessor", "register_dataframe_groupby_accessor", "register_series_groupby_accessor", ] ================================================ FILE: modin/pandas/api/extensions/extensions.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import inspect from collections import defaultdict from functools import cached_property from types import MethodType, ModuleType from typing import Any, Dict, Optional, Union import modin.pandas as pd from modin.config import Backend from modin.core.storage_formats.pandas.query_compiler_caster import ( _GENERAL_EXTENSIONS, _NON_EXTENDABLE_ATTRIBUTES, EXTENSION_DICT_TYPE, wrap_function_in_argument_caster, ) _attrs_to_delete_on_test = defaultdict(set) # Track a dict of module-level classes that are re-exported from pandas that may need to dynamically # change when overridden by the extensions system, such as pd.Index. # See register_pd_accessor for details. 
_reexport_classes: Dict[str, Any] = {} def _set_attribute_on_obj( name: str, extensions: EXTENSION_DICT_TYPE, backend: Optional[str], obj: Union[type, ModuleType], set_reexport: bool = False, ): """ Create a new or override existing attribute on obj. Parameters ---------- name : str The name of the attribute to assign to `obj`. extensions : EXTENSION_DICT_TYPE The dictionary mapping extension name to `new_attr` (assigned below). backend : Optional[str] The backend to which the accessor applies. If `None`, this accessor will become the default for all backends. obj : DataFrame, Series, or modin.pandas The object we are assigning the new attribute to. set_reexport : bool, default False If True, register the original property in `_reexport_classes`. Returns ------- decorator Returns the decorator function. """ if name in _NON_EXTENDABLE_ATTRIBUTES: raise ValueError(f"Cannot register an extension with the reserved name {name}.") def decorator(new_attr: Any): """ Decorate a function or class to be assigned to the given name. Parameters ---------- new_attr : Any The new attribute to assign to name. Returns ------- new_attr Unmodified new_attr is return from the decorator. """ # Module-level functions are resolved by `wrap_free_function_in_argument_caster`, which dynamically # identifies the appropriate backend to use. We cannot apply this wrapper to classes in order # to preserve the vailidity of `isinstance` checks, and instead must force __getattr__ to directly # return the correct class. # Because the module-level __getattr__ function is not called if the object is found in the namespace, # any overrides from the extensions system must `delattr` the attribute to force any future lookups # to hit this code path. 
def register_dataframe_accessor(name: str, *, backend: Optional[str] = None):
    """
    Register a dataframe attribute with the name provided.

    This is a decorator that assigns a new attribute to DataFrame. It can be used
    with the following syntax:

    ```
    @register_dataframe_accessor("new_method")
    def my_new_dataframe_method(*args, **kwargs):
        # logic goes here
        return
    ```

    The new attribute can then be accessed with the name provided:

    ```
    df.new_method(*my_args, **my_kwargs)
    ```

    Parameters
    ----------
    name : str
        The name of the attribute to assign to DataFrame.
    backend : Optional[str]
        The backend to which the accessor applies. If ``None``, this accessor
        will become the default for all backends.

    Returns
    -------
    decorator
        Returns the decorator function.
    """
    # Keyword arguments keep this call in line with the other `register_*` helpers.
    return _set_attribute_on_obj(
        name,
        pd.dataframe.DataFrame._extensions,
        backend=backend,
        obj=pd.dataframe.DataFrame,
    )
def register_pd_accessor(name: str, *, backend: Optional[str] = None):
    """
    Register a pd namespace attribute with the name provided.

    This is a decorator that assigns a new attribute to modin.pandas. It can be used
    with the following syntax:

    ```
    @register_pd_accessor("new_function")
    def my_new_pd_function(*args, **kwargs):
        # logic goes here
        return
    ```

    The new attribute can then be accessed with the name provided:

    ```
    import modin.pandas as pd

    pd.new_function(*my_args, **my_kwargs)
    ```

    Parameters
    ----------
    name : str
        The name of the attribute to assign to modin.pandas.
    backend : Optional[str]
        The backend to which the accessor applies. If ``None``, this accessor
        will become the default for all backends.

    Returns
    -------
    decorator
        Returns the decorator function.
    """
    # Request re-export handling only when `name` is already visible on the
    # `modin.pandas` module and no extension is stored for it under `backend`.
    # NOTE(review): `backend` is used as the lookup key as-is here, whereas
    # `_set_attribute_on_obj` stores extensions under `Backend.normalize(backend)`;
    # confirm that a differently-spelled backend name cannot make this check miss.
    set_reexport = name not in _GENERAL_EXTENSIONS[backend] and name in dir(pd)
    return _set_attribute_on_obj(
        name=name,
        extensions=_GENERAL_EXTENSIONS,
        backend=backend,
        obj=pd,
        set_reexport=set_reexport,
    )
def register_dataframe_groupby_accessor(name: str, *, backend: Optional[str] = None):
    """
    Build a decorator that installs an attribute on DataFrameGroupBy under `name`.

    Typical usage:

    ```
    @register_dataframe_groupby_accessor("new_method")
    def my_new_dataframe_groupby_method(*args, **kwargs):
        # logic goes here
        return
    ```

    after which the attribute is reachable through any dataframe groupby object:

    ```
    df.groupby("col").new_method(*my_args, **my_kwargs)
    ```

    Parameters
    ----------
    name : str
        The name of the attribute to assign to DataFrameGroupBy.
    backend : Optional[str]
        The backend to which the accessor applies. If ``None``, this accessor
        will become the default for all backends.

    Returns
    -------
    decorator
        Returns the decorator function.
    """
    groupby_cls = pd.groupby.DataFrameGroupBy
    return _set_attribute_on_obj(
        name, groupby_cls._extensions, backend=backend, obj=groupby_cls
    )
It can be used with the following syntax: ``` @register_series_groupby_accessor("new_method") def my_new_series_groupby_method(*args, **kwargs): # logic goes here return ``` The new attribute can then be accessed with the name provided: ``` df.groupby("col0")["col1"].new_method(*my_args, **my_kwargs) ``` Parameters ---------- name : str The name of the attribute to assign to SeriesGroupBy. backend : Optional[str] The backend to which the accessor applies. If ``None``, this accessor will become the default for all backends. Returns ------- decorator Returns the decorator function. """ return _set_attribute_on_obj( name, pd.groupby.SeriesGroupBy._extensions, backend=backend, obj=pd.groupby.SeriesGroupBy, ) ================================================ FILE: modin/pandas/arrays/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""The module is needed to allow the following import `import modin.pandas.arrays`.""" from pandas.arrays import * # noqa: F403, F401 from pandas.arrays import __all__ # noqa: F401 ================================================ FILE: modin/pandas/base.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Implement DataFrame/Series public API as pandas does.""" from __future__ import annotations import abc import pickle as pkl import re import sys import warnings from functools import cached_property from typing import ( TYPE_CHECKING, Any, Callable, Hashable, Literal, Optional, Sequence, Union, ) import numpy as np import pandas import pandas.core.generic import pandas.core.resample import pandas.core.window.rolling from pandas._libs import lib from pandas._libs.tslibs import to_offset from pandas._typing import ( Axis, CompressionOptions, DtypeBackend, IndexKeyFunc, IndexLabel, Level, RandomState, Scalar, StorageOptions, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, npt, ) from pandas.compat import numpy as numpy_compat from pandas.core.common import count_not_none, pipe from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, is_dtype_equal, is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, is_object_dtype, ) from pandas.core.indexes.api import ensure_index from pandas.core.methods.describe import _refine_percentiles from pandas.util._decorators import doc from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, validate_percentile, ) from modin import pandas as pd from modin.config import Backend, ShowBackendSwitchProgress from modin.core.storage_formats.pandas.query_compiler_caster import ( EXTENSION_NO_LOOKUP, QueryCompilerCaster, ) from modin.error_message import ErrorMessage from modin.logging import ClassLogger, disable_logging from modin.pandas.accessor import CachedAccessor, ModinAPI from modin.pandas.api.extensions.extensions import EXTENSION_DICT_TYPE from modin.pandas.utils import GET_BACKEND_DOC, SET_BACKEND_DOC, is_scalar from modin.utils import ( _inherit_docstrings, expanduser_path_arg, sentinel, try_cast_to_pandas, ) from .utils import _doc_binary_op, is_full_grab_slice if TYPE_CHECKING: from typing_extensions import Self from modin.core.storage_formats import BaseQueryCompiler from 
def _get_repr_axis_label_indexer(labels, num_for_repr):
    """
    Pick the positions along one axis that are needed to build a repr.

    Parameters
    ----------
    labels : pandas.Index
        The axis labels.
    num_for_repr : int
        The number of elements to display.

    Returns
    -------
    slice or list
        ``slice(None)`` when the whole axis fits, otherwise a list of leading and
        trailing positions.
    """
    axis_len = len(labels)
    if axis_len <= num_for_repr:
        return slice(None)

    # The axis is longer than what can be shown. Select num_for_repr + 1 positions
    # so that the pandas repr of the selection still truncates with an ellipsis in
    # the middle and looks the same as the repr of the full axis would.
    # The front always gets (num_for_repr // 2) + 1 positions; the back gets
    # num_for_repr // 2, plus one more when num_for_repr is odd.
    half = num_for_repr // 2
    front_repr_num = half + 1
    back_repr_num = half + num_for_repr % 2

    positions = range(axis_len)
    leading = list(positions[:front_repr_num])
    # A plain [-0:] slice would return everything, so the empty tail is special-cased.
    trailing = list(positions[-back_repr_num:]) if back_repr_num != 0 else []
    return leading + trailing
def _add_sibling(self, sibling: BasePandasDataset) -> None:
    """
    Add a DataFrame or Series object to the list of siblings.

    Siblings are objects that share the same query compiler. This function is called
    when a shallow copy is made.

    Parameters
    ----------
    sibling : BasePandasDataset
        Dataset to add to siblings list.
    """
    # The new sibling gets a fresh list: everything `self` already tracks, plus `self`.
    sibling._siblings = self._siblings + [self]
    # `self` extends its own list with the new sibling.
    self._siblings += [sibling]
    # `self._siblings` now ends with `sibling`, so besides the previously known
    # siblings this loop also appends `sibling` to its own list.
    # NOTE(review): confirm that this self-reference is intended.
    for sib in self._siblings:
        sib._siblings += [sibling]
def _update_inplace(self, new_query_compiler: BaseQueryCompiler) -> None:
    """
    Replace the query compiler of this object and of every sibling.

    Parameters
    ----------
    new_query_compiler : BaseQueryCompiler
        The new QueryCompiler to use to manage the data.
    """
    # Remember the compiler being replaced while pointing `self` at the new one.
    stale_compiler, self._query_compiler = self._query_compiler, new_query_compiler
    for relative in self._siblings:
        relative._query_compiler = new_query_compiler
    # Free the replaced compiler only after every holder has been re-pointed.
    stale_compiler.free()
Note that pandas # is_scalar can be misleading as it is False for almost all objects, # even when those objects should be treated as scalars. See e.g. # https://github.com/modin-project/modin/issues/5236. Therefore, we # detect scalars by checking that `other` is neither a list-like nor # another BasePandasDataset. return other axis = self._get_axis_number(axis) if axis is not None else 1 result = other if axis == 0: if len(other) != len(self._query_compiler.index): raise ValueError( f"Unable to coerce to Series, length must be {len(self._query_compiler.index)}: " + f"given {len(other)}" ) else: if len(other) != len(self._query_compiler.columns): raise ValueError( f"Unable to coerce to Series, length must be {len(self._query_compiler.columns)}: " + f"given {len(other)}" ) if hasattr(other, "dtype"): other_dtypes = [other.dtype] * len(other) elif is_dict_like(other): other_dtypes = [ other[label] if pandas.isna(other[label]) else type(other[label]) for label in self._get_axis(axis) # The binary operation is applied for intersection of axis labels # and dictionary keys. So filtering out extra keys. if label in other ] else: other_dtypes = [x if pandas.isna(x) else type(x) for x in other] if compare_index: if not self.index.equals(other.index): raise TypeError("Cannot perform operation with non-equal index") # Do dtype checking. if dtype_check: self_dtypes = self._get_dtypes() if is_dict_like(other): # The binary operation is applied for the intersection of axis labels # and dictionary keys. So filtering `self_dtypes` to match the `other` # dictionary. self_dtypes = [ dtype for label, dtype in zip(self._get_axis(axis), self._get_dtypes()) if label in other ] # TODO(https://github.com/modin-project/modin/issues/5239): # this spuriously rejects other that is a list including some # custom type that can be added to self's elements. 
for self_dtype, other_dtype in zip(self_dtypes, other_dtypes): if not ( (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) or (is_numeric_dtype(self_dtype) and pandas.isna(other_dtype)) or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype)) or ( lib.is_np_dtype(self_dtype, "mM") and lib.is_np_dtype(self_dtype, "mM") ) or is_dtype_equal(self_dtype, other_dtype) ): raise TypeError("Cannot do operation with improper dtypes") return result def _validate_function(self, func, on_invalid=None) -> None: """ Check the validity of the function which is intended to be applied to the frame. Parameters ---------- func : object on_invalid : callable(str, cls), optional Function to call in case invalid `func` is met, `on_invalid` takes an error message and an exception type as arguments. If not specified raise an appropriate exception. **Note:** This parameter is a hack to concord with pandas error types. """ def error_raiser(msg, exception=Exception): raise exception(msg) if on_invalid is None: on_invalid = error_raiser if isinstance(func, dict): [self._validate_function(fn, on_invalid) for fn in func.values()] return # We also could validate this, but it may be quite expensive for lazy-frames # if not all(idx in self._get_axis(axis) for idx in func.keys()): # error_raiser("Invalid dict keys", KeyError) if not is_list_like(func): func = [func] for fn in func: if isinstance(fn, str): if not (hasattr(self, fn) or hasattr(np, fn)): on_invalid( f"'{fn}' is not a valid function for '{type(self).__name__}' object", AttributeError, ) elif not callable(fn): on_invalid( f"One of the passed functions has an invalid type: {type(fn)}: {fn}, " + "only callable or string is acceptable.", TypeError, ) def _binary_op(self, op, other, **kwargs) -> Self: """ Do binary operation between two datasets. Parameters ---------- op : str Name of binary operation. other : modin.pandas.BasePandasDataset Second operand of binary operation. 
def _default_to_pandas(self, op, *args, reason: str = None, **kwargs):
    """
    Convert dataset to pandas type and call a pandas function on it.

    A default-to-pandas warning is requested from the query compiler first,
    then `self` and all arguments are converted to pandas objects, `op` is
    executed on the pandas object, and the outcome is wrapped back where
    possible.

    Parameters
    ----------
    op : str or callable
        Either the name of an attribute of ``self._pandas_class`` or a callable
        that takes the pandas object as its first argument.
    *args : list
        Additional positional arguments to be passed to `op`.
    reason : str, optional
        Forwarded as-is to the query compiler's ``_maybe_warn_on_default``
        together with the generated message.
    **kwargs : dict
        Additional keywords arguments to be passed to `op`.

    Returns
    -------
    object
        Result of operation. A pandas DataFrame/Series result is wrapped into
        ``DataFrame``/``Series``; a ``None`` result is treated as an in-place
        operation and is routed through ``_create_or_update_from_compiler``
        with ``inplace=True``; a two-element list/tuple whose first item is a
        pandas DataFrame is returned as a tuple with wrapped DataFrame items;
        anything else is returned unchanged.
    """
    empty_self_str = "" if not self.empty else " for empty DataFrame"
    self._query_compiler._maybe_warn_on_default(
        message="`{}.{}`{}".format(
            type(self).__name__,
            op if isinstance(op, str) else op.__name__,
            empty_self_str,
        ),
        reason=reason,
    )

    # Make sure neither positional nor keyword arguments carry non-pandas
    # objects into the pandas call.
    args = try_cast_to_pandas(args)
    kwargs = try_cast_to_pandas(kwargs)
    pandas_obj = self._to_pandas()
    with warnings.catch_warnings():
        # FutureWarnings raised while `op` runs are silenced.
        warnings.filterwarnings("ignore", category=FutureWarning)
        if callable(op):
            result = op(pandas_obj, *args, **kwargs)
        elif isinstance(op, str):
            # The inner `getattr` is ensuring that we are treating this object (whether
            # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr`
            # will get the operation (`op`) from the pandas version of the class and run
            # it on the object after we have converted it to pandas.
            attr = getattr(self._pandas_class, op)
            if isinstance(attr, property):
                # Properties take no arguments: read the value off the instance.
                result = getattr(pandas_obj, op)
            else:
                result = attr(pandas_obj, *args, **kwargs)
        else:
            # `op` is neither callable nor a string: report it as a bug.
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=True,
                extra_log="{} is an unsupported operation".format(op),
            )
    if isinstance(result, pandas.DataFrame):
        from .dataframe import DataFrame

        return DataFrame(result)
    elif isinstance(result, pandas.Series):
        from .series import Series

        return Series(result)
    # inplace: `op` returned nothing, so `pandas_obj` itself is taken as the
    # result. It is re-wrapped with the same-named class from `pd`
    # (presumably the Modin namespace; confirm against the file's imports)
    # and its query compiler replaces the one held by `self`.
    elif result is None:
        return self._create_or_update_from_compiler(
            getattr(pd, type(pandas_obj).__name__)(pandas_obj)._query_compiler,
            inplace=True,
        )
    else:
        try:
            if (
                isinstance(result, (list, tuple))
                and len(result) == 2
                and isinstance(result[0], pandas.DataFrame)
            ):
                # Some operations split the DataFrame into two (e.g. align). We need to wrap
                # both of the returned results
                if isinstance(result[1], pandas.DataFrame):
                    second = self.__constructor__(result[1])
                else:
                    second = result[1]
                return self.__constructor__(result[0]), second
            else:
                return result
        except TypeError:
            # Inspecting or wrapping the result failed: hand it back untouched.
            return result
Parameters ---------- axis : int, str or pandas._libs.lib.NoDefault Axis name ('index' or 'columns') or number to be converted to axis index. Returns ------- int 0 or 1 - axis index in the array of axes stored in the dataframe. """ if axis is lib.no_default: axis = None return cls._pandas_class._get_axis_number(axis) if axis is not None else 0 @cached_property def __constructor__(self) -> type[Self]: """ Construct DataFrame or Series object depending on self type. Returns ------- modin.pandas.BasePandasDataset Constructed object. """ return type(self) def abs(self) -> Self: # noqa: RT01, D200 """ Return a `BasePandasDataset` with absolute numeric value of each element. """ self._validate_dtypes(numeric_only=True) return self.__constructor__(query_compiler=self._query_compiler.abs()) def _set_index(self, new_index) -> None: """ Set the index for this DataFrame. Parameters ---------- new_index : pandas.Index The new index to set this. """ self._query_compiler.index = new_index def _get_index(self) -> pandas.Index: """ Get the index for this DataFrame. Returns ------- pandas.Index The union of all indexes across the partitions. """ return self._query_compiler.index index: pandas.Index = property(_get_index, _set_index) def _get_axis(self, axis) -> pandas.Index: """ Return index labels of the specified axis. Parameters ---------- axis : {0, 1} Axis to return labels on. 0 is for index, when 1 is for columns. Returns ------- pandas.Index """ return self.index if axis == 0 else self.columns def add( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Return addition of `BasePandasDataset` and `other`, element-wise (binary operator `add`). """ return self._binary_op( "add", other, axis=axis, level=level, fill_value=fill_value ) def aggregate( self, func=None, axis=0, *args, **kwargs ) -> DataFrame | Series | Scalar: # noqa: PR01, RT01, D200 """ Aggregate using one or more operations over the specified axis. 
""" axis = self._get_axis_number(axis) result = None if axis == 0: result = self._aggregate(func, _axis=axis, *args, **kwargs) # TODO: handle case when axis == 1 if result is None: kwargs.pop("is_transform", None) return self.apply(func, axis=axis, args=args, **kwargs) return result agg: DataFrame | Series | Scalar = aggregate def _aggregate(self, func, *args, **kwargs): """ Aggregate using one or more operations over index axis. Parameters ---------- func : function, str, list or dict Function to use for aggregating the data. *args : list Positional arguments to pass to func. **kwargs : dict Keyword arguments to pass to func. Returns ------- scalar or BasePandasDataset See Also -------- aggregate : Aggregate along any axis. """ _axis = kwargs.pop("_axis", 0) kwargs.pop("_level", None) if isinstance(func, str): kwargs.pop("is_transform", None) return self._string_function(func, *args, **kwargs) # Dictionaries have complex behavior because they can be renamed here. elif func is None or isinstance(func, dict): return self._default_to_pandas("agg", func, *args, **kwargs) kwargs.pop("is_transform", None) return self.apply(func, axis=_axis, args=args, **kwargs) def _string_function(self, func, *args, **kwargs): """ Execute a function identified by its string name. Parameters ---------- func : str Function name to call on `self`. *args : list Positional arguments to pass to func. **kwargs : dict Keyword arguments to pass to func. Returns ------- object Function result. """ assert isinstance(func, str) f = getattr(self, func, None) if f is not None: if callable(f): return f(*args, **kwargs) assert len(args) == 0 assert ( len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 ) return f f = getattr(np, func, None) if f is not None: return self._default_to_pandas("agg", func, *args, **kwargs) raise ValueError("{} is an unknown string function".format(func)) def _get_dtypes(self) -> list: """ Get dtypes as list. 
def align(
    self,
    other,
    join="outer",
    axis=None,
    level=None,
    copy=None,
    fill_value=None,
    method=lib.no_default,
    limit=lib.no_default,
    fill_axis=lib.no_default,
    broadcast_axis=lib.no_default,
) -> tuple[Self, Self]:  # noqa: PR01, RT01, D200
    """
    Align two objects on their axes with the specified join method.
    """
    # Explicitly passing any of the fill-related keywords is deprecated.
    if any(arg is not lib.no_default for arg in (method, limit, fill_axis)):
        warnings.warn(
            "The 'method', 'limit', and 'fill_axis' keywords in "
            f"{type(self).__name__}.align are deprecated and will be removed "
            "in a future version. Call fillna directly on the returned objects "
            "instead.",
            FutureWarning,
        )
    # Replace the "not passed" sentinel with the effective defaults.
    fill_axis = 0 if fill_axis is lib.no_default else fill_axis
    method = None if method is lib.no_default else method
    limit = None if limit is lib.no_default else limit

    if broadcast_axis is lib.no_default:
        broadcast_axis = None
    else:
        notice = (
            f"The 'broadcast_axis' keyword in {type(self).__name__}.align is "
            "deprecated and will be removed in a future version."
        )
        if broadcast_axis is not None:
            # Suggest the replacement matching the dimensionality mix.
            if self.ndim == 1 and other.ndim == 2:
                notice += (
                    " Use left = DataFrame({col: left for col in right.columns}, "
                    "index=right.index) before calling `left.align(right)` instead."
                )
            elif self.ndim == 2 and other.ndim == 1:
                notice += (
                    " Use right = DataFrame({col: right for col in left.columns}, "
                    "index=left.index) before calling `left.align(right)` instead"
                )
        warnings.warn(notice, FutureWarning)

    aligned_left, aligned_right = self._query_compiler.align(
        other._query_compiler,
        join=join,
        axis=axis,
        level=level,
        copy=copy,
        fill_value=fill_value,
        method=method,
        limit=limit,
        fill_axis=fill_axis,
        broadcast_axis=broadcast_axis,
    )
    build = self.__constructor__
    return build(query_compiler=aligned_left), build(query_compiler=aligned_right)
def any(
    self, *, axis=0, bool_only=False, skipna=True, **kwargs
) -> Self:  # noqa: PR01, RT01, D200
    """
    Return whether any element is True, potentially over an axis.

    Parameters
    ----------
    axis : int, str or None, default: 0
        Axis name or number to reduce over. With ``None`` the data is reduced
        over axis 0 and, if the outcome is still a dataset, reduced once more.
    bool_only : bool, default: False
        Restrict the computation to ``np.bool_`` columns. For axis 0 the
        columns are pre-selected here; the option is rejected for objects
        exposing a ``dtype`` attribute and for ``axis=None``.
    skipna : bool, default: True
        Validated to be a real bool and forwarded to the query compiler.
    **kwargs : dict
        Forwarded unchanged to the query compiler.

    Returns
    -------
    Series or scalar
        The outcome of `_reduce_dimension` applied to the query compiler result.

    Raises
    ------
    NotImplementedError
        If `bool_only` is set for axis 0 on an object that has a ``dtype`` attribute.
    ValueError
        If `bool_only` is set together with ``axis=None``.
    """
    validate_bool_kwarg(skipna, "skipna", none_allowed=False)
    if axis is not None:
        axis = self._get_axis_number(axis)
        if bool_only and axis == 0:
            if hasattr(self, "dtype"):
                # Report the method that was actually called (`any`).
                raise NotImplementedError(
                    f"{type(self).__name__}.any does not implement numeric_only."
                )
            # Keep boolean columns only, then reduce without `bool_only`.
            data_for_compute = self[self.columns[self.dtypes == np.bool_]]
            return data_for_compute.any(
                axis=axis, bool_only=False, skipna=skipna, **kwargs
            )
        return self._reduce_dimension(
            self._query_compiler.any(
                axis=axis, bool_only=bool_only, skipna=skipna, **kwargs
            )
        )
    else:
        if bool_only:
            raise ValueError("Axis must be 0 or 1 (got {})".format(axis))
        # Reduce to a scalar if axis is None.
        result = self._reduce_dimension(
            self._query_compiler.any(
                axis=0,
                bool_only=bool_only,
                skipna=skipna,
                **kwargs,
            )
        )
        if isinstance(result, BasePandasDataset):
            # The first pass left a dataset behind: collapse it as well.
            return result.any(
                axis=axis, bool_only=bool_only, skipna=skipna, **kwargs
            )
        return result
def asfreq(
    self, freq, method=None, how=None, normalize=False, fill_value=None
) -> Self:  # noqa: PR01, RT01, D200
    """
    Convert time series to specified frequency.
    """
    # All arguments are handed to the query compiler untouched; this layer
    # only wraps the resulting compiler into a new object of the same type.
    converted = self._query_compiler.asfreq(
        freq=freq,
        method=method,
        how=how,
        normalize=normalize,
        fill_value=fill_value,
    )
    return self.__constructor__(query_compiler=converted)
def astype(
    self, dtype, copy=None, errors="raise"
) -> Self:  # noqa: PR01, RT01, D200
    """
    Cast a Modin object to a specified dtype `dtype`.
    """
    copy = True if copy is None else copy

    # `dtype` may be a scalar, a dict or a series. A series is turned into a
    # plain dict so that the query compiler only sees scalars and dicts.
    if isinstance(dtype, (pd.Series, pandas.Series)):
        if not dtype.index.is_unique:
            raise ValueError("cannot reindex on an axis with duplicate labels")
        dtype = dict(dtype.items())

    # From here on a dict means "per-column dtypes"; its keys must be column
    # names.
    if isinstance(dtype, dict):
        # Column labels are not materialized in lazy mode; in that case the
        # query compiler reports keys that are not columns.
        if (
            not self._query_compiler.lazy_column_labels
            and not set(dtype).issubset(self._query_compiler.columns)
            and errors == "raise"
        ):
            raise KeyError(
                "Only a column name can be used for the key in "
                + "a dtype mappings argument."
            )

    if not copy:
        # The cast can only be skipped when the current dtypes are known and
        # already equal to the requested ones.
        if not self._query_compiler.frame_has_materialized_dtypes:
            copy = True
        else:
            current_dtypes = self._query_compiler.dtypes
            if isinstance(dtype, dict):
                copy = any(dtype[col] != current_dtypes[col] for col in dtype)
            else:
                copy = not (current_dtypes == dtype).all()

    if not copy:
        return self
    casted = self._query_compiler.astype(dtype, errors=errors)
    return self._create_or_update_from_compiler(casted)
def bfill(
    self,
    *,
    axis=None,
    inplace=False,
    limit=None,
    limit_area=None,
    downcast=lib.no_default,
) -> Self:  # noqa: PR01, RT01, D200
    """
    Synonym for `DataFrame.fillna` with ``method='bfill'``.
    """
    if limit_area is None:
        downcast = self._deprecate_downcast(downcast, "bfill")
        with warnings.catch_warnings():
            # `fillna(method=...)` is deprecated on its own; that warning is
            # not relevant for somebody who called `bfill`.
            warnings.filterwarnings(
                "ignore", ".*fillna with 'method' is deprecated", category=FutureWarning
            )
            return self.fillna(
                method="bfill",
                axis=axis,
                limit=limit,
                downcast=downcast,
                inplace=inplace,
            )
    # `limit_area` is only available through the pandas fallback; `downcast`
    # is forwarded as received.
    return self._default_to_pandas(
        "bfill",
        reason="'limit_area' parameter isn't supported",
        axis=axis,
        inplace=inplace,
        limit=limit,
        limit_area=limit_area,
        downcast=downcast,
    )
def clip(
    self, lower=None, upper=None, *, axis=None, inplace=False, **kwargs
) -> Self:  # noqa: PR01, RT01, D200
    """
    Trim values at input threshold(s).
    """

    def _nan_to_none(bound):
        """Treat a bound that contains any NaN as if it was not given."""
        if bound is None:
            return None
        return None if np.any(np.isnan(bound)) else bound

    # Input validation.
    if axis is not None:
        axis = self._get_axis_number(axis)
    self._validate_dtypes(numeric_only=True)
    inplace = validate_bool_kwarg(inplace, "inplace")
    axis = numpy_compat.function.validate_clip_with_axis(axis, (), kwargs)

    lower = _nan_to_none(lower)
    upper = _nan_to_none(upper)

    # As soon as one bound is list-like both bounds go through
    # `_validate_other`.
    if is_list_like(lower) or is_list_like(upper):
        lower = self._validate_other(lower, axis)
        upper = self._validate_other(upper, axis)

    # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility
    # purpose and does not affect the result, we shouldn't pass them to the query compiler.
    clipped = self._query_compiler.clip(lower=lower, upper=upper, axis=axis, **kwargs)
    return self._create_or_update_from_compiler(clipped, inplace)
def cummax(
    self, axis=None, skipna=True, *args, **kwargs
) -> Self:  # noqa: PR01, RT01, D200
    """
    Return cumulative maximum over a `BasePandasDataset` axis.
    """
    axis = self._get_axis_number(axis)
    # The numeric-only dtype check is performed for the column axis only.
    if axis == 1:
        self._validate_dtypes(numeric_only=True)
    # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility
    # purpose and does not affect the result, we shouldn't pass them to the query compiler.
    accumulated = self._query_compiler.cummax(
        fold_axis=axis, axis=axis, skipna=skipna, **kwargs
    )
    return self.__constructor__(query_compiler=accumulated)
def cumsum(
    self, axis=None, skipna=True, *args, **kwargs
) -> Self:  # noqa: PR01, RT01, D200
    """
    Return cumulative sum over a `BasePandasDataset` axis.
    """
    axis = self._get_axis_number(axis)
    # The numeric-only dtype check runs regardless of the axis.
    self._validate_dtypes(numeric_only=True)
    # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility
    # purpose and does not affect the result, we shouldn't pass them to the query compiler.
    accumulated = self._query_compiler.cumsum(
        fold_axis=axis, axis=axis, skipna=skipna, **kwargs
    )
    return self.__constructor__(query_compiler=accumulated)
raise ValueError("No objects to concatenate") return self.__constructor__( query_compiler=data._query_compiler.describe(percentiles=percentiles) ) def diff(self, periods=1, axis=0) -> Self: # noqa: PR01, RT01, D200 """ First discrete difference of element. """ # Attempting to match pandas error behavior here if not isinstance(periods, int): raise ValueError(f"periods must be an int. got {type(periods)} instead") # Attempting to match pandas error behavior here for dtype in self._get_dtypes(): if not (is_numeric_dtype(dtype) or lib.is_np_dtype(dtype, "mM")): raise TypeError(f"unsupported operand type for -: got {dtype}") axis = self._get_axis_number(axis) return self.__constructor__( query_compiler=self._query_compiler.diff(axis=axis, periods=periods) ) def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors="raise", ) -> Self: # noqa: PR01, RT01, D200 """ Drop specified labels from `BasePandasDataset`. """ # TODO implement level if level is not None: return self._default_to_pandas( "drop", labels=labels, axis=axis, index=index, columns=columns, level=level, inplace=inplace, errors=errors, ) inplace = validate_bool_kwarg(inplace, "inplace") if labels is not None: if index is not None or columns is not None: raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") axis_name = pandas.DataFrame._get_axis_name(axis) axes = {axis_name: labels} elif index is not None or columns is not None: axes = {"index": index} if self.ndim == 2: axes["columns"] = columns else: raise ValueError( "Need to specify at least one of 'labels', 'index' or 'columns'" ) for axis in ["index", "columns"]: if axis not in axes: axes[axis] = None elif axes[axis] is not None: if not is_list_like(axes[axis]): axes[axis] = [axes[axis]] # In case of lazy execution we should bypass these error checking components # because they can force the materialization of the row or column labels. 
def dropna(
    self,
    *,
    axis: Axis = 0,
    how: str | lib.NoDefault = lib.no_default,
    thresh: int | lib.NoDefault = lib.no_default,
    subset: IndexLabel = None,
    inplace: bool = False,
    ignore_index: bool = False,
) -> Self:  # noqa: PR01, RT01, D200
    """
    Remove missing values.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    if is_list_like(axis):
        raise TypeError("supplying multiple axes to axis is no longer supported.")
    axis = self._get_axis_number(axis)

    if how is None:
        if thresh is None:
            raise TypeError("must specify how or thresh")
    elif how not in ("any", "all", lib.no_default):
        raise ValueError("invalid how option: %s" % how)

    if subset is not None:
        # `subset` names labels on the axis opposite to the one being
        # dropped from; every one of them has to exist there.
        opposite_labels = self.index if axis == 1 else self.columns
        not_found = opposite_labels.get_indexer_for(subset) == -1
        if not_found.any():
            raise KeyError(list(np.compress(not_found, subset)))

    new_query_compiler = self._query_compiler.dropna(
        axis=axis, how=how, thresh=thresh, subset=subset
    )
    if ignore_index:
        # Replace the surviving row labels with 0..n-1.
        new_query_compiler.index = pandas.RangeIndex(
            stop=len(new_query_compiler.index)
        )
    return self._create_or_update_from_compiler(new_query_compiler, inplace)
def drop_duplicates(
    self, keep="first", inplace=False, **kwargs
) -> Self:  # noqa: PR01, RT01, D200
    """
    Return `BasePandasDataset` with duplicate rows removed.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    ignore_index = kwargs.get("ignore_index", False)
    subset = kwargs.get("subset", None)

    if subset is not None:
        # Normalize `subset` to a list of labels.
        if not is_list_like(subset):
            subset = [subset]
        elif not isinstance(subset, list):
            subset = list(subset)
        # Every requested label must be an existing column.
        missing = pandas.Index(subset).difference(self.columns)
        if len(missing) > 0:
            raise KeyError(missing)

    deduplicated = self.__constructor__(
        query_compiler=self._query_compiler.unique(
            keep=keep, ignore_index=ignore_index, subset=subset
        )
    )
    if not inplace:
        return deduplicated
    # In-place mode updates `self` and returns nothing.
    self._update_inplace(deduplicated._query_compiler)
    return None
""" return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) def explode( self, column, ignore_index: bool = False ) -> Self: # noqa: PR01, RT01, D200 """ Transform each element of a list-like to a row. """ exploded = self.__constructor__( query_compiler=self._query_compiler.explode(column) ) if ignore_index: exploded = exploded.reset_index(drop=True) return exploded def ewm( self, com: float | None = None, span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, alpha: float | None = None, min_periods: int | None = 0, adjust: bool = True, ignore_na: bool = False, axis: Axis = lib.no_default, times: str | np.ndarray | BasePandasDataset | None = None, method: str = "single", ) -> pandas.core.window.ewm.ExponentialMovingWindow: # noqa: PR01, RT01, D200 """ Provide exponentially weighted (EW) calculations. """ return self._default_to_pandas( "ewm", com=com, span=span, halflife=halflife, alpha=alpha, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, axis=axis, times=times, method=method, ) def expanding( self, min_periods=1, axis=lib.no_default, method="single" ) -> Expanding: # noqa: PR01, RT01, D200 """ Provide expanding window calculations. """ from .window import Expanding if axis is not lib.no_default: axis = self._get_axis_number(axis) name = "expanding" if axis == 1: warnings.warn( f"Support for axis=1 in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + f"Use obj.T.{name}(...) instead", FutureWarning, ) else: warnings.warn( f"The 'axis' keyword in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. 
" + "Call the method without the axis keyword instead.", FutureWarning, ) else: axis = 0 return Expanding( self, min_periods=min_periods, axis=axis, method=method, ) def ffill( self, *, axis=None, inplace=False, limit=None, limit_area=None, downcast=lib.no_default, ) -> Self | None: # noqa: PR01, RT01, D200 """ Synonym for `DataFrame.fillna` with ``method='ffill'``. """ if limit_area is not None: return self._default_to_pandas( "ffill", reason="'limit_area' parameter isn't supported", axis=axis, inplace=inplace, limit=limit, limit_area=limit_area, downcast=downcast, ) downcast = self._deprecate_downcast(downcast, "ffill") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", ".*fillna with 'method' is deprecated", category=FutureWarning ) return self.fillna( method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace, ) def pad( self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default ) -> Self | None: # noqa: PR01, RT01, D200 """ Synonym for `DataFrame.ffill`. """ warnings.warn( "DataFrame.pad/Series.pad is deprecated. Use DataFrame.ffill/Series.ffill instead", FutureWarning, ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=FutureWarning) return self.ffill( axis=axis, inplace=inplace, limit=limit, downcast=downcast ) def fillna( self, squeeze_self, squeeze_value, value=None, method=None, axis=None, inplace=False, limit=None, downcast=lib.no_default, ) -> Self | None: """ Fill NA/NaN values using the specified method. Parameters ---------- squeeze_self : bool If True then self contains a Series object, if False then self contains a DataFrame object. squeeze_value : bool If True then value contains a Series object, if False then value contains a DataFrame object. value : scalar, dict, Series, or DataFrame, default: None Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or column (for a DataFrame). 
Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default: None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use next valid observation to fill gap. axis : {None, 0, 1}, default: None Axis along which to fill missing values. inplace : bool, default: False If True, fill in-place. Note: this will modify any other views on this object (e.g., a no-copy slice for a column in a DataFrame). limit : int, default: None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is a gap with more than this number of consecutive NaNs, it will only be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. downcast : dict, default: None A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). Returns ------- Series, DataFrame or None Object with missing values filled or None if ``inplace=True``. """ if method is not None: warnings.warn( f"{type(self).__name__}.fillna with 'method' is deprecated and " + "will raise in a future version. 
Use obj.ffill() or obj.bfill() " + "instead.", FutureWarning, ) downcast = self._deprecate_downcast(downcast, "fillna") inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) if isinstance(value, (list, tuple)): raise TypeError( '"value" parameter must be a scalar or dict, but ' + f'you passed a "{type(value).__name__}"' ) if value is None and method is None: raise ValueError("must specify a fill method or value") if value is not None and method is not None: raise ValueError("cannot specify both a fill method and value") if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]: expecting = "pad (ffill) or backfill (bfill)" msg = "Invalid fill method. Expecting {expecting}. Got {method}".format( expecting=expecting, method=method ) raise ValueError(msg) if limit is not None: if not isinstance(limit, int): raise ValueError("Limit must be an integer") elif limit <= 0: raise ValueError("Limit must be greater than 0") if isinstance(value, BasePandasDataset): value = value._query_compiler new_query_compiler = self._query_compiler.fillna( squeeze_self=squeeze_self, squeeze_value=squeeze_value, value=value, method=method, axis=axis, inplace=False, limit=limit, downcast=downcast, ) return self._create_or_update_from_compiler(new_query_compiler, inplace) def filter( self, items=None, like=None, regex=None, axis=None ) -> Self: # noqa: PR01, RT01, D200 """ Subset the `BasePandasDataset` rows or columns according to the specified index labels. 
""" nkw = count_not_none(items, like, regex) if nkw > 1: raise TypeError( "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if nkw == 0: raise TypeError("Must pass either `items`, `like`, or `regex`") if axis is None: axis = "columns" # This is the default info axis for dataframes axis = self._get_axis_number(axis) labels = self.columns if axis else self.index if items is not None: bool_arr = labels.isin(items) elif like is not None: def f(x): return like in str(x) bool_arr = labels.map(f).tolist() else: def f(x): return matcher.search(str(x)) is not None matcher = re.compile(regex) bool_arr = labels.map(f).tolist() if not axis: return self[bool_arr] return self[self.columns[bool_arr]] def first(self, offset) -> Self | None: # noqa: PR01, RT01, D200 """ Select initial periods of time series data based on a date offset. """ warnings.warn( "first is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", FutureWarning, ) return self._create_or_update_from_compiler( self._query_compiler.first(offset=to_offset(offset)) ) def first_valid_index(self) -> int: # noqa: RT01, D200 """ Return index for first non-NA value or None, if no non-NA value is found. """ return self._query_compiler.first_valid_index() def floordiv( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get integer division of `BasePandasDataset` and `other`, element-wise (binary operator `floordiv`). """ return self._binary_op( "floordiv", other, axis=axis, level=level, fill_value=fill_value ) def ge(self, other, axis="columns", level=None) -> Self: # noqa: PR01, RT01, D200 """ Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). 
""" return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) def get( self, key, default=None ) -> DataFrame | Series | Scalar: # noqa: PR01, RT01, D200 """ Get item from object for given key. """ # Match pandas behavior here try: return self.__getitem__(key) except (KeyError, ValueError, IndexError): return default def gt(self, other, axis="columns", level=None) -> Self: # noqa: PR01, RT01, D200 """ Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). """ return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) def head(self, n=5) -> Self: # noqa: PR01, RT01, D200 """ Return the first `n` rows. """ return self.iloc[:n] @property def iat(self, axis=None) -> _iLocIndexer: # noqa: PR01, RT01, D200 """ Get a single value for a row/column pair by integer position. """ from .indexing import _iLocIndexer return _iLocIndexer(self) def idxmax( self, axis=0, skipna=True, numeric_only=False ) -> Self: # noqa: PR01, RT01, D200 """ Return index of first occurrence of maximum over requested axis. """ axis = self._get_axis_number(axis) return self._reduce_dimension( self._query_compiler.idxmax( axis=axis, skipna=skipna, numeric_only=numeric_only ) ) def idxmin( self, axis=0, skipna=True, numeric_only=False ) -> Self: # noqa: PR01, RT01, D200 """ Return index of first occurrence of minimum over requested axis. """ axis = self._get_axis_number(axis) return self._reduce_dimension( self._query_compiler.idxmin( axis=axis, skipna=skipna, numeric_only=numeric_only ) ) def infer_objects(self, copy=None) -> Self: # noqa: PR01, RT01, D200 """ Attempt to infer better dtypes for object columns. 
""" new_query_compiler = self._query_compiler.infer_objects() return self._create_or_update_from_compiler( new_query_compiler, inplace=False if copy is None else not copy ) def convert_dtypes( self, infer_objects: bool = True, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, convert_floating: bool = True, dtype_backend: DtypeBackend = "numpy_nullable", ) -> Self: # noqa: PR01, RT01, D200 """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. """ return self.__constructor__( query_compiler=self._query_compiler.convert_dtypes( infer_objects=infer_objects, convert_string=convert_string, convert_integer=convert_integer, convert_boolean=convert_boolean, convert_floating=convert_floating, dtype_backend=dtype_backend, ) ) def isin(self, values) -> Self: # noqa: PR01, RT01, D200 """ Whether elements in `BasePandasDataset` are contained in `values`. """ from .series import Series ignore_indices = isinstance(values, Series) values = getattr(values, "_query_compiler", values) return self.__constructor__( query_compiler=self._query_compiler.isin( values=values, ignore_indices=ignore_indices ) ) def isna(self) -> Self: # noqa: RT01, D200 """ Detect missing values. """ return self.__constructor__(query_compiler=self._query_compiler.isna()) isnull: Self = isna @property def iloc(self) -> _iLocIndexer: # noqa: RT01, D200 """ Purely integer-location based indexing for selection by position. """ from .indexing import _iLocIndexer return _iLocIndexer(self) @_inherit_docstrings(pandas.DataFrame.kurt, apilink="pandas.DataFrame.kurt") def kurt(self, axis=0, skipna=True, numeric_only=False, **kwargs) -> Series | float: return self._stat_operation("kurt", axis, skipna, numeric_only, **kwargs) kurtosis: Series | float = kurt def last(self, offset) -> Self: # noqa: PR01, RT01, D200 """ Select final periods of time series data based on a date offset. """ warnings.warn( "last is deprecated and will be removed in a future version. 
" + "Please create a mask and filter using `.loc` instead", FutureWarning, ) return self._create_or_update_from_compiler( self._query_compiler.last(offset=to_offset(offset)) ) def last_valid_index(self) -> int: # noqa: RT01, D200 """ Return index for last non-NA value or None, if no non-NA value is found. """ return self._query_compiler.last_valid_index() def le(self, other, axis="columns", level=None) -> Self: # noqa: PR01, RT01, D200 """ Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). """ return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) def lt(self, other, axis="columns", level=None) -> Self: # noqa: PR01, RT01, D200 """ Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). """ return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) @property def loc(self) -> _LocIndexer: # noqa: RT01, D200 """ Get a group of rows and columns by label(s) or a boolean array. """ from .indexing import _LocIndexer return _LocIndexer(self) def mask( self, cond, other=lib.no_default, *, inplace: bool = False, axis: Optional[Axis] = None, level: Optional[Level] = None, ) -> Self | None: # noqa: PR01, RT01, D200 """ Replace values where the condition is True. """ return self._create_or_update_from_compiler( self._query_compiler.mask( cond, other=other, inplace=False, axis=axis, level=level, ), inplace=inplace, ) def max( self, axis: Axis = 0, skipna=True, numeric_only=False, **kwargs, ) -> Series | None: # noqa: PR01, RT01, D200 """ Return the maximum of the values over the requested axis. 
""" validate_bool_kwarg(skipna, "skipna", none_allowed=False) orig_axis = axis axis = self._get_axis_number(axis) data = self._validate_dtypes_min_max(axis, numeric_only) res = data._reduce_dimension( data._query_compiler.max( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs, ) ) if orig_axis is None: res = res._reduce_dimension( res._query_compiler.max( axis=0, skipna=skipna, numeric_only=False, **kwargs, ) ) return res def min( self, axis: Axis = 0, skipna: bool = True, numeric_only=False, **kwargs, ) -> Series | None: # noqa: PR01, RT01, D200 """ Return the minimum of the values over the requested axis. """ validate_bool_kwarg(skipna, "skipna", none_allowed=False) orig_axis = axis axis = self._get_axis_number(axis) data = self._validate_dtypes_min_max(axis, numeric_only) res = data._reduce_dimension( data._query_compiler.min( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs, ) ) if orig_axis is None: res = res._reduce_dimension( res._query_compiler.min( axis=0, skipna=skipna, numeric_only=False, **kwargs, ) ) return res def _stat_operation( self, op_name: str, axis: Optional[Union[int, str]], skipna: bool, numeric_only: Optional[bool] = False, **kwargs, ): """ Do common statistic reduce operations under frame. Parameters ---------- op_name : str Name of method to apply. axis : int or str Axis to apply method on. skipna : bool Exclude NA/null values when computing the result. numeric_only : bool, default: False Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. **kwargs : dict Additional keyword arguments to pass to `op_name`. Returns ------- scalar, Series or DataFrame `scalar` - self is Series and level is not specified. `Series` - self is Series and level is specified, or self is DataFrame and level is not specified. `DataFrame` - self is DataFrame and level is specified. 
""" axis = self._get_axis_number(axis) if axis is not None else None validate_bool_kwarg(skipna, "skipna", none_allowed=False) if op_name == "median": numpy_compat.function.validate_median((), kwargs) elif op_name in ("sem", "var", "std"): val_kwargs = {k: v for k, v in kwargs.items() if k != "ddof"} numpy_compat.function.validate_stat_ddof_func((), val_kwargs, fname=op_name) else: numpy_compat.function.validate_stat_func((), kwargs, fname=op_name) if not numeric_only: self._validate_dtypes(numeric_only=True) data = ( self._get_numeric_data(axis if axis is not None else 0) if numeric_only else self ) result_qc = getattr(data._query_compiler, op_name)( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs, ) return ( self._reduce_dimension(result_qc) if isinstance(result_qc, type(self._query_compiler)) # scalar case else result_qc ) def memory_usage( self, index=True, deep=False ) -> Series | None: # noqa: PR01, RT01, D200 """ Return the memory usage of the `BasePandasDataset`. """ return self._reduce_dimension( self._query_compiler.memory_usage(index=index, deep=deep) ) def mod( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get modulo of `BasePandasDataset` and `other`, element-wise (binary operator `mod`). """ return self._binary_op( "mod", other, axis=axis, level=level, fill_value=fill_value ) def mode( self, axis=0, numeric_only=False, dropna=True ) -> Self: # noqa: PR01, RT01, D200 """ Get the mode(s) of each element along the selected axis. """ axis = self._get_axis_number(axis) return self.__constructor__( query_compiler=self._query_compiler.mode( axis=axis, numeric_only=numeric_only, dropna=dropna ) ) def mul( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get multiplication of `BasePandasDataset` and `other`, element-wise (binary operator `mul`). 
""" return self._binary_op( "mul", other, axis=axis, level=level, fill_value=fill_value ) multiply: Self = mul def ne(self, other, axis="columns", level=None) -> Self: # noqa: PR01, RT01, D200 """ Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). """ return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) def notna(self) -> Self: # noqa: RT01, D200 """ Detect existing (non-missing) values. """ return self.__constructor__(query_compiler=self._query_compiler.notna()) notnull: Self = notna def nunique(self, axis=0, dropna=True) -> Series | int: # noqa: PR01, RT01, D200 """ Return number of unique elements in the `BasePandasDataset`. """ axis = self._get_axis_number(axis) return self._reduce_dimension( self._query_compiler.nunique(axis=axis, dropna=dropna) ) def pct_change( self, periods=1, fill_method=lib.no_default, limit=lib.no_default, freq=None, **kwargs, ) -> Self: # noqa: PR01, RT01, D200 """ Percentage change between the current and a prior element. """ if fill_method not in (lib.no_default, None) or limit is not lib.no_default: warnings.warn( "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, ) if fill_method is lib.no_default: if self.isna().values.any(): warnings.warn( "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will be " + "removed in a future version. Call ffill before calling " + "pct_change to retain current behavior and silence this warning.", FutureWarning, ) fill_method = "pad" if limit is lib.no_default: limit = None # Attempting to match pandas error behavior here if not isinstance(periods, int): raise ValueError(f"periods must be an int. 
got {type(periods)} instead") # Attempting to match pandas error behavior here for dtype in self._get_dtypes(): if not is_numeric_dtype(dtype): raise TypeError(f"unsupported operand type for /: got {dtype}") return self.__constructor__( query_compiler=self._query_compiler.pct_change( periods=periods, fill_method=fill_method, limit=limit, freq=freq, **kwargs, ) ) def pipe( self, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs ) -> T: # noqa: PR01, RT01, D200 """ Apply chainable functions that expect `BasePandasDataset`. """ return pipe(self, func, *args, **kwargs) def pop(self, item) -> Series | Scalar: # noqa: PR01, RT01, D200 """ Return item and drop from frame. Raise KeyError if not found. """ result = self[item] del self[item] return result def pow( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get exponential power of `BasePandasDataset` and `other`, element-wise (binary operator `pow`). """ return self._binary_op( "pow", other, axis=axis, level=level, fill_value=fill_value ) def quantile( self, q, axis, numeric_only, interpolation, method ) -> DataFrame | Series | Scalar: # noqa: PR01, RT01, D200 """ Return values at the given quantile over requested axis. 
""" axis = self._get_axis_number(axis) def check_dtype(t): return is_numeric_dtype(t) or lib.is_np_dtype(t, "mM") numeric_only_df = self if not numeric_only: # If not numeric_only and columns, then check all columns are either # numeric, timestamp, or timedelta if not axis and not all(check_dtype(t) for t in self._get_dtypes()): raise TypeError("can't multiply sequence by non-int of type 'float'") # If over rows, then make sure that all dtypes are equal for not # numeric_only elif axis: for i in range(1, len(self._get_dtypes())): pre_dtype = self._get_dtypes()[i - 1] curr_dtype = self._get_dtypes()[i] if not is_dtype_equal(pre_dtype, curr_dtype): raise TypeError( "Cannot compare type '{0}' with type '{1}'".format( pre_dtype, curr_dtype ) ) else: numeric_only_df = self.drop( columns=[ i for i in self.dtypes.index if not is_numeric_dtype(self.dtypes[i]) ] ) # check that all qs are between 0 and 1 validate_percentile(q) axis = numeric_only_df._get_axis_number(axis) if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list, tuple)): return numeric_only_df.__constructor__( query_compiler=numeric_only_df._query_compiler.quantile_for_list_of_values( q=q, axis=axis, # `numeric_only=True` has already been processed by using `self.drop` function numeric_only=False, interpolation=interpolation, method=method, ) ) else: result = numeric_only_df._reduce_dimension( numeric_only_df._query_compiler.quantile_for_single_value( q=q, axis=axis, # `numeric_only=True` has already been processed by using `self.drop` function numeric_only=False, interpolation=interpolation, method=method, ) ) if isinstance(result, BasePandasDataset): result.name = q return result @_inherit_docstrings(pandas.DataFrame.rank, apilink="pandas.DataFrame.rank") def rank( self, axis=0, method: str = "average", numeric_only=False, na_option: str = "keep", ascending: bool = True, pct: bool = False, ) -> Self: if axis is None: raise ValueError( f"No axis named None for object type {type(self).__name__}" ) 
def reindex(
    self,
    index=None,
    columns=None,
    copy=True,
    **kwargs,
) -> Self:
    """
    Conform `BasePandasDataset` to new index with optional filling logic.

    Parameters
    ----------
    index : list-like, optional
        New row labels. The row axis is left untouched when this is a
        ``pandas.Index`` equal to ``self.index``.
    columns : list-like, optional
        New column labels. The column axis is left untouched when this is a
        ``pandas.Index`` equal to ``self.columns``.
    copy : bool, default: True
        ``None`` and ``True`` request ``inplace=False`` from
        ``self._create_or_update_from_compiler``; ``False`` requests
        ``inplace=True``.
    **kwargs : dict
        Forwarded to every ``self._query_compiler.reindex`` call.

    Returns
    -------
    Self
        Whatever ``self._create_or_update_from_compiler`` returns.
    """
    query_compiler = self._query_compiler

    # Labels that are not a `pandas.Index` cannot be compared with `.equals`,
    # so they always trigger a reindex; an equal Index is a no-op.
    if index is not None and not (
        isinstance(index, pandas.Index) and index.equals(self.index)
    ):
        query_compiler = query_compiler.reindex(axis=0, labels=index, **kwargs)

    # The type check must look at `columns` itself (it used to look at
    # `index`), otherwise `.equals` is called on plain lists.
    if columns is not None and not (
        isinstance(columns, pandas.Index) and columns.equals(self.columns)
    ):
        query_compiler = query_compiler.reindex(axis=1, labels=columns, **kwargs)

    # NOTE(review): `copy=False` is translated into an in-place update of
    # `self`; kept as is, confirm this is the intended semantics.
    return self._create_or_update_from_compiler(
        query_compiler, inplace=False if copy is None else not copy
    )
def reorder_levels(self, order, axis=0) -> Self:
    """
    Rearrange index levels using input order.

    Parameters
    ----------
    order : list-like
        New level order, forwarded to the axis labels' ``reorder_levels``.
    axis : Axis, default: 0
        Axis whose labels are reordered; resolved with
        ``self._get_axis_number``.

    Returns
    -------
    Self
        Result of ``self.set_axis`` with the reordered labels.
    """
    axis_number = self._get_axis_number(axis)
    reordered_labels = self._get_axis(axis_number).reorder_levels(order)
    return self.set_axis(reordered_labels, axis=axis_number)
Do " + "`frame.T.resample(...)` without axis instead.", FutureWarning, ) else: warnings.warn( f"The 'axis' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version.", FutureWarning, ) else: axis = 0 return Resampler( dataframe=self, rule=rule, axis=axis, closed=closed, label=label, convention=convention, kind=kind, on=on, level=level, origin=origin, offset=offset, group_keys=group_keys, ) def reset_index( self, level: IndexLabel = None, *, drop: bool = False, inplace: bool = False, col_level: Hashable = 0, col_fill: Hashable = "", allow_duplicates=lib.no_default, names: Hashable | Sequence[Hashable] = None, ) -> DataFrame | Series | None: # noqa: PR01, RT01, D200 """ Reset the index, or a level of it. """ inplace = validate_bool_kwarg(inplace, "inplace") # Error checking for matching pandas. Pandas does not allow you to # insert a dropped index into a DataFrame if these columns already # exist. if ( not drop and not ( self._query_compiler.lazy_column_labels or self._query_compiler.lazy_row_labels ) and not self._query_compiler.has_multiindex() and all(n in self.columns for n in ["level_0", "index"]) ): raise ValueError("cannot insert level_0, already exists") new_query_compiler = self._query_compiler.reset_index( drop=drop, level=level, col_level=col_level, col_fill=col_fill, allow_duplicates=allow_duplicates, names=names, ) return self._create_or_update_from_compiler(new_query_compiler, inplace) def radd( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Return addition of `BasePandasDataset` and `other`, element-wise (binary operator `radd`). """ return self._binary_op( "radd", other, axis=axis, level=level, fill_value=fill_value ) def rfloordiv( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get integer division of `BasePandasDataset` and `other`, element-wise (binary operator `rfloordiv`). 
""" return self._binary_op( "rfloordiv", other, axis=axis, level=level, fill_value=fill_value ) def rmod( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get modulo of `BasePandasDataset` and `other`, element-wise (binary operator `rmod`). """ return self._binary_op( "rmod", other, axis=axis, level=level, fill_value=fill_value ) def rmul( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get Multiplication of dataframe and other, element-wise (binary operator `rmul`). """ return self._binary_op( "rmul", other, axis=axis, level=level, fill_value=fill_value ) def rolling( self, window, min_periods: int | None = None, center: bool = False, win_type: str | None = None, on: str | None = None, axis: Axis = lib.no_default, closed: str | None = None, step: int | None = None, method: str = "single", ) -> Rolling | Window: # noqa: PR01, RT01, D200 """ Provide rolling window calculations. """ if axis is not lib.no_default: axis = self._get_axis_number(axis) name = "rolling" if axis == 1: warnings.warn( f"Support for axis=1 in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. " + f"Use obj.T.{name}(...) instead", FutureWarning, ) else: warnings.warn( f"The 'axis' keyword in {type(self).__name__}.{name} is " + "deprecated and will be removed in a future version. 
" + "Call the method without the axis keyword instead.", FutureWarning, ) else: axis = 0 if win_type is not None: from .window import Window return Window( self, window=window, min_periods=min_periods, center=center, win_type=win_type, on=on, axis=axis, closed=closed, step=step, method=method, ) from .window import Rolling return Rolling( self, window=window, min_periods=min_periods, center=center, win_type=win_type, on=on, axis=axis, closed=closed, step=step, method=method, ) def round(self, decimals=0, *args, **kwargs) -> Self: # noqa: PR01, RT01, D200 """ Round a `BasePandasDataset` to a variable number of decimal places. """ # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility # purpose and does not affect the result, we shouldn't pass them to the query compiler. return self.__constructor__( query_compiler=self._query_compiler.round(decimals=decimals, **kwargs) ) def rpow( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get exponential power of `BasePandasDataset` and `other`, element-wise (binary operator `rpow`). """ return self._binary_op( "rpow", other, axis=axis, level=level, fill_value=fill_value ) def rsub( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get subtraction of `BasePandasDataset` and `other`, element-wise (binary operator `rsub`). """ return self._binary_op( "rsub", other, axis=axis, level=level, fill_value=fill_value ) def rtruediv( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get floating division of `BasePandasDataset` and `other`, element-wise (binary operator `rtruediv`). 
""" return self._binary_op( "rtruediv", other, axis=axis, level=level, fill_value=fill_value ) rdiv: Self = rtruediv def sample( self, n: int | None = None, frac: float | None = None, replace: bool = False, weights=None, random_state: RandomState | None = None, axis: Axis | None = None, ignore_index: bool = False, ) -> Self: # noqa: PR01, RT01, D200 """ Return a random sample of items from an axis of object. """ axis = self._get_axis_number(axis) if axis: axis_labels = self.columns axis_length = len(axis_labels) else: # Getting rows requires indices instead of labels. RangeIndex provides this. axis_labels = pandas.RangeIndex(len(self)) axis_length = len(axis_labels) if weights is not None: # Index of the weights Series should correspond to the index of the # Dataframe in order to sample if isinstance(weights, BasePandasDataset): weights = weights.reindex(self._get_axis(axis)) # If weights arg is a string, the weights used for sampling will # the be values in the column corresponding to that string if isinstance(weights, str): if axis == 0: try: weights = self[weights] except KeyError: raise KeyError("String passed to weights not a valid column") else: raise ValueError( "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" ) weights = pandas.Series(weights, dtype="float64") if len(weights) != axis_length: raise ValueError( "Weights and axis to be sampled must be of same length" ) if (weights == np.inf).any() or (weights == -np.inf).any(): raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): raise ValueError("weight vector many not include negative values") # weights cannot be NaN when sampling, so we must set all nan # values to 0 weights = weights.fillna(0) # If passed in weights are not equal to 1, renormalize them # otherwise numpy sampling function will error weights_sum = weights.sum() if weights_sum != 1: if weights_sum != 0: weights = weights / weights_sum else: raise ValueError("Invalid 
weights: weights sum to zero") weights = weights.values if n is None and frac is None: # default to n = 1 if n and frac are both None (in accordance with # pandas specification) n = 1 elif n is not None and frac is None and n % 1 != 0: # n must be an integer raise ValueError("Only integers accepted as `n` values") elif n is None and frac is not None: # compute the number of samples based on frac n = int(round(frac * axis_length)) elif n is not None and frac is not None: # Pandas specification does not allow both n and frac to be passed # in raise ValueError("Please enter a value for `frac` OR `n`, not both") if n < 0: raise ValueError( "A negative number of rows requested. Please provide positive value." ) if n == 0: # This returns an empty object, and since it is a weird edge case that # doesn't need to be distributed, we default to pandas for n=0. # We don't need frac to be set to anything since n is already 0. return self._default_to_pandas( "sample", n=n, frac=None, replace=replace, weights=weights, random_state=random_state, axis=axis, ignore_index=ignore_index, ) if random_state is not None: # Get a random number generator depending on the type of # random_state that is passed in if isinstance(random_state, int): random_num_gen = np.random.RandomState(random_state) elif isinstance(random_state, np.random.RandomState): random_num_gen = random_state else: # random_state must be an int or a numpy RandomState object raise ValueError( "Please enter an `int` OR a " + "np.random.RandomState for random_state" ) # choose random numbers and then get corresponding labels from # chosen axis sample_indices = random_num_gen.choice( np.arange(0, axis_length), size=n, replace=replace, p=weights ) samples = axis_labels[sample_indices] else: # randomly select labels from chosen axis samples = np.random.choice( a=axis_labels, size=n, replace=replace, p=weights ) if axis: query_compiler = self._query_compiler.getitem_column_array(samples) return 
self.__constructor__(query_compiler=query_compiler) else: query_compiler = self._query_compiler.getitem_row_array(samples) return self.__constructor__(query_compiler=query_compiler) def sem( self, axis: Axis = 0, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return unbiased standard error of the mean over requested axis. """ return self._stat_operation( "sem", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def mean( self, axis: Axis = 0, skipna=True, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return the mean of the values over the requested axis. """ return self._stat_operation("mean", axis, skipna, numeric_only, **kwargs) def median( self, axis: Axis = 0, skipna=True, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return the mean of the values over the requested axis. """ return self._stat_operation("median", axis, skipna, numeric_only, **kwargs) def set_axis( self, labels, *, axis: Axis = 0, copy=None, ) -> Self: # noqa: PR01, RT01, D200 """ Assign desired index to given axis. """ if copy is None: copy = True obj = self.copy() if copy else self setattr(obj, pandas.DataFrame._get_axis_name(axis), labels) return obj def set_flags( self, *, copy: bool = False, allows_duplicate_labels: Optional[bool] = None ) -> Self: # noqa: PR01, RT01, D200 """ Return a new `BasePandasDataset` with updated flags. """ return self._default_to_pandas( pandas.DataFrame.set_flags, copy=copy, allows_duplicate_labels=allows_duplicate_labels, ) @property def flags(self): return self._default_to_pandas(lambda df: df.flags) def shift( self, periods: int = 1, freq=None, axis: Axis = 0, fill_value: Hashable = lib.no_default, suffix=None, ) -> Self | DataFrame: # noqa: PR01, RT01, D200 """ Shift index by desired number of periods with an optional time `freq`. 
""" if suffix: return self._default_to_pandas( lambda df: df.shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value, suffix=suffix, ) ) if freq is not None and fill_value is not lib.no_default: raise ValueError( "Cannot pass both 'freq' and 'fill_value' to " + f"{type(self).__name__}.shift" ) if periods == 0: # Check obvious case first return self.copy() return self._create_or_update_from_compiler( new_query_compiler=self._query_compiler.shift( periods, freq, axis, fill_value ), inplace=False, ) def skew( self, axis: Axis = 0, skipna: bool = True, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return unbiased skew over requested axis. """ return self._stat_operation("skew", axis, skipna, numeric_only, **kwargs) def sort_index( self, *, axis=0, level=None, ascending=True, inplace=False, kind="quicksort", na_position="last", sort_remaining=True, ignore_index: bool = False, key: Optional[IndexKeyFunc] = None, ) -> Self | None: # noqa: PR01, RT01, D200 """ Sort object by labels (along an axis). """ # pandas throws this exception. See pandas issie #39434 if ascending is None: raise ValueError( "the `axis` parameter is not supported in the pandas implementation of argsort()" ) axis = self._get_axis_number(axis) inplace = validate_bool_kwarg(inplace, "inplace") new_query_compiler = self._query_compiler.sort_index( axis=axis, level=level, ascending=ascending, inplace=inplace, kind=kind, na_position=na_position, sort_remaining=sort_remaining, ignore_index=ignore_index, key=key, ) return self._create_or_update_from_compiler(new_query_compiler, inplace) def sort_values( self, by, *, axis=0, ascending=True, inplace: bool = False, kind="quicksort", na_position="last", ignore_index: bool = False, key: Optional[IndexKeyFunc] = None, ) -> Self | None: # noqa: PR01, RT01, D200 """ Sort by the values along either axis. 
""" axis = self._get_axis_number(axis) inplace = validate_bool_kwarg(inplace, "inplace") ascending = validate_ascending(ascending) if axis == 0: result = self._query_compiler.sort_rows_by_column_values( by, ascending=ascending, kind=kind, na_position=na_position, ignore_index=ignore_index, key=key, ) else: result = self._query_compiler.sort_columns_by_row_values( by, ascending=ascending, kind=kind, na_position=na_position, ignore_index=ignore_index, key=key, ) return self._create_or_update_from_compiler(result, inplace) def std( self, axis: Axis = 0, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return sample standard deviation over requested axis. """ return self._stat_operation( "std", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def sub( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get subtraction of `BasePandasDataset` and `other`, element-wise (binary operator `sub`). """ return self._binary_op( "sub", other, axis=axis, level=level, fill_value=fill_value ) subtract: Self = sub def swapaxes(self, axis1, axis2, copy=None) -> Self: # noqa: PR01, RT01, D200 """ Interchange axes and swap values axes appropriately. """ if copy is None: copy = True axis1 = self._get_axis_number(axis1) axis2 = self._get_axis_number(axis2) if axis1 != axis2: return self.transpose() if copy: return self.copy() return self def swaplevel(self, i=-2, j=-1, axis=0) -> Self: # noqa: PR01, RT01, D200 """ Swap levels `i` and `j` in a `MultiIndex`. """ axis = self._get_axis_number(axis) idx = self.index if axis == 0 else self.columns return self.set_axis(idx.swaplevel(i, j), axis=axis) def tail(self, n=5) -> Self: # noqa: PR01, RT01, D200 """ Return the last `n` rows. 
""" if n != 0: return self.iloc[-n:] return self.iloc[len(self) :] def take(self, indices, axis=0, **kwargs) -> Self: # noqa: PR01, RT01, D200 """ Return the elements in the given *positional* indices along an axis. """ axis = self._get_axis_number(axis) slice_obj = indices if axis == 0 else (slice(None), indices) return self.iloc[slice_obj] def to_clipboard( self, excel=True, sep=None, **kwargs ): # pragma: no cover # noqa: PR01, RT01, D200 """ Copy object to the system clipboard. """ return self._default_to_pandas("to_clipboard", excel=excel, sep=sep, **kwargs) @expanduser_path_arg("path_or_buf") def to_csv( self, path_or_buf=None, sep=",", na_rep="", float_format=None, columns=None, header=True, index=True, index_label=None, mode="w", encoding=None, compression="infer", quoting=None, quotechar='"', lineterminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal=".", errors: str = "strict", storage_options: StorageOptions = None, ) -> str | None: # pragma: no cover from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) return FactoryDispatcher.to_csv( self._query_compiler, path_or_buf=path_or_buf, sep=sep, na_rep=na_rep, float_format=float_format, columns=columns, header=header, index=index, index_label=index_label, mode=mode, encoding=encoding, compression=compression, quoting=quoting, quotechar=quotechar, lineterminator=lineterminator, chunksize=chunksize, date_format=date_format, doublequote=doublequote, escapechar=escapechar, decimal=decimal, errors=errors, storage_options=storage_options, ) @expanduser_path_arg("excel_writer") def to_excel( self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, inf_rep="inf", freeze_panes=None, storage_options: StorageOptions = None, engine_kwargs=None, ) -> None: # pragma: no cover # noqa: PR01, RT01, D200 """ Write object to an 
Excel sheet. """ return self._default_to_pandas( "to_excel", excel_writer, sheet_name=sheet_name, na_rep=na_rep, float_format=float_format, columns=columns, header=header, index=index, index_label=index_label, startrow=startrow, startcol=startcol, engine=engine, merge_cells=merge_cells, inf_rep=inf_rep, freeze_panes=freeze_panes, storage_options=storage_options, engine_kwargs=engine_kwargs, ) def to_dict(self, orient="dict", into=dict, index=True) -> dict: return self._query_compiler.dataframe_to_dict(orient, into, index) @expanduser_path_arg("path_or_buf") def to_hdf( self, path_or_buf, key: str, mode: Literal["a", "w", "r+"] = "a", complevel: int | None = None, complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None, append: bool = False, format: Literal["fixed", "table"] | None = None, index: bool = True, min_itemsize: int | dict[str, int] | None = None, nan_rep=None, dropna: bool | None = None, data_columns: Literal[True] | list[str] | None = None, errors: str = "strict", encoding: str = "UTF-8", ) -> None: # pragma: no cover # noqa: PR01, RT01, D200 """ Write the contained data to an HDF5 file using HDFStore. """ return self._default_to_pandas( "to_hdf", path_or_buf, key=key, mode=mode, complevel=complevel, complib=complib, append=append, format=format, index=index, min_itemsize=min_itemsize, nan_rep=nan_rep, dropna=dropna, data_columns=data_columns, errors=errors, encoding=encoding, ) @expanduser_path_arg("path_or_buf") def to_json( self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit="ms", default_handler=None, lines=False, compression="infer", index=None, indent=None, storage_options: StorageOptions = None, mode="w", ) -> str | None: # pragma: no cover # noqa: PR01, RT01, D200 """ Convert the object to a JSON string. 
""" from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) return FactoryDispatcher.to_json( self._query_compiler, path_or_buf, orient=orient, date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, default_handler=default_handler, lines=lines, compression=compression, index=index, indent=indent, storage_options=storage_options, mode=mode, ) @expanduser_path_arg("buf") def to_latex( self, buf=None, columns=None, header=True, index=True, na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None, decimal=".", multicolumn=None, multicolumn_format=None, multirow=None, caption=None, label=None, position=None, ) -> str | None: # pragma: no cover # noqa: PR01, RT01, D200 """ Render object to a LaTeX tabular, longtable, or nested table. """ return self._default_to_pandas( "to_latex", buf=buf, columns=columns, header=header, index=index, na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, index_names=index_names, bold_rows=bold_rows, column_format=column_format, longtable=longtable, escape=escape, encoding=encoding, decimal=decimal, multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, caption=caption, label=label, position=position, ) @expanduser_path_arg("buf") def to_markdown( self, buf=None, mode: str = "wt", index: bool = True, storage_options: StorageOptions = None, **kwargs, ) -> str: # noqa: PR01, RT01, D200 """ Print `BasePandasDataset` in Markdown-friendly format. 
""" return self._default_to_pandas( "to_markdown", buf=buf, mode=mode, index=index, storage_options=storage_options, **kwargs, ) @expanduser_path_arg("path") def to_pickle( self, path, compression: CompressionOptions = "infer", protocol: int = pkl.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: # pragma: no cover # noqa: PR01, D200 """ Pickle (serialize) object to file. """ from modin.pandas import to_pickle to_pickle( self, path, compression=compression, protocol=protocol, storage_options=storage_options, ) def _to_bare_numpy( self, dtype=None, copy=False, na_value=lib.no_default ): # noqa: PR01, RT01, D200 """ Convert the `BasePandasDataset` to a NumPy array. """ return self._query_compiler.to_numpy( dtype=dtype, copy=copy, na_value=na_value, ) def to_numpy( self, dtype=None, copy=False, na_value=lib.no_default ) -> np.ndarray: # noqa: PR01, RT01, D200 """ Convert the `BasePandasDataset` to a NumPy array or a Modin wrapper for NumPy array. """ from modin.config import ModinNumpy if ModinNumpy.get(): from ..numpy.arr import array return array(self, copy=copy) return self._to_bare_numpy( dtype=dtype, copy=copy, na_value=na_value, ) # TODO(williamma12): When this gets implemented, have the series one call this. def to_period( self, freq=None, axis=0, copy=None ) -> Self: # pragma: no cover # noqa: PR01, RT01, D200 """ Convert `BasePandasDataset` from DatetimeIndex to PeriodIndex. """ return self._default_to_pandas("to_period", freq=freq, axis=axis, copy=copy) @expanduser_path_arg("buf") def to_string( self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, min_rows=None, max_cols=None, show_dimensions=False, decimal=".", line_width=None, max_colwidth=None, encoding=None, ) -> str | None: # noqa: PR01, RT01, D200 """ Render a `BasePandasDataset` to a console-friendly tabular output. 
""" return self._default_to_pandas( "to_string", buf=buf, columns=columns, col_space=col_space, header=header, index=index, na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, line_width=line_width, max_colwidth=max_colwidth, encoding=encoding, ) def to_sql( self, name, con, schema=None, if_exists="fail", index=True, index_label=None, chunksize=None, dtype=None, method=None, ) -> int | None: # noqa: PR01, D200 """ Write records stored in a `BasePandasDataset` to a SQL database. """ new_query_compiler = self._query_compiler # writing the index to the database by inserting it to the DF if index: new_query_compiler = new_query_compiler.reset_index() if index_label is not None: if not is_list_like(index_label): index_label = [index_label] new_query_compiler.columns = list(index_label) + list( new_query_compiler.columns[len(index_label) :] ) # so pandas._to_sql will not write the index to the database as well index = False from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) FactoryDispatcher.to_sql( new_query_compiler, name=name, con=con, schema=schema, if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype, method=method, ) # TODO(williamma12): When this gets implemented, have the series one call this. def to_timestamp( self, freq=None, how="start", axis=0, copy=None ) -> Self: # noqa: PR01, RT01, D200 """ Cast to DatetimeIndex of timestamps, at *beginning* of period. """ return self._default_to_pandas( "to_timestamp", freq=freq, how=how, axis=axis, copy=copy ) def to_xarray(self): # noqa: PR01, RT01, D200 """ Return an xarray object from the `BasePandasDataset`. 
""" return self._default_to_pandas("to_xarray") def truediv( self, other, axis="columns", level=None, fill_value=None ) -> Self: # noqa: PR01, RT01, D200 """ Get floating division of `BasePandasDataset` and `other`, element-wise (binary operator `truediv`). """ return self._binary_op( "truediv", other, axis=axis, level=level, fill_value=fill_value ) div: Self = truediv divide: Self = truediv def truncate( self, before=None, after=None, axis=None, copy=None ) -> Self: # noqa: PR01, RT01, D200 """ Truncate a `BasePandasDataset` before and after some index value. """ axis = self._get_axis_number(axis) if ( not self._get_axis(axis).is_monotonic_increasing and not self._get_axis(axis).is_monotonic_decreasing ): raise ValueError("truncate requires a sorted index") if before is not None and after is not None and before > after: raise ValueError(f"Truncate: {after} must be after {before}") s = slice(*self._get_axis(axis).slice_locs(before, after)) slice_obj = s if axis == 0 else (slice(None), s) return self.iloc[slice_obj] def transform( self, func, axis=0, *args, **kwargs ) -> Self: # noqa: PR01, RT01, D200 """ Call ``func`` on self producing a `BasePandasDataset` with the same axis shape as self. """ kwargs["is_transform"] = True self._validate_function(func) try: result = self.agg(func, axis=axis, *args, **kwargs) except (TypeError, pandas.errors.SpecificationError): raise except Exception as err: raise ValueError("Transform function failed") from err if getattr(result, "_pandas_class", None) not in ( pandas.Series, pandas.DataFrame, ) or not result.index.equals(self.index): raise ValueError("Function did not transform") return result def tz_convert( self, tz, axis=0, level=None, copy=None ) -> Self: # noqa: PR01, RT01, D200 """ Convert tz-aware axis to target time zone. 
""" if copy is None: copy = True return self._create_or_update_from_compiler( self._query_compiler.tz_convert( tz, axis=self._get_axis_number(axis), level=level, copy=copy ), inplace=(not copy), ) def tz_localize( self, tz, axis=0, level=None, copy=None, ambiguous="raise", nonexistent="raise" ) -> Self: # noqa: PR01, RT01, D200 """ Localize tz-naive index of a `BasePandasDataset` to target time zone. """ if copy is None: copy = True return self._create_or_update_from_compiler( self._query_compiler.tz_localize( tz, axis=self._get_axis_number(axis), level=level, copy=copy, ambiguous=ambiguous, nonexistent=nonexistent, ), inplace=(not copy), ) def interpolate( self, method="linear", *, axis=0, limit=None, inplace=False, limit_direction: Optional[str] = None, limit_area=None, downcast=lib.no_default, **kwargs, ) -> Self: # noqa: PR01, RT01, D200 if downcast is not lib.no_default: warnings.warn( f"The 'downcast' keyword in {type(self).__name__}.interpolate " + "is deprecated and will be removed in a future version. " + "Call result.infer_objects(copy=False) on the result instead.", FutureWarning, ) else: downcast = None return self._create_or_update_from_compiler( self._query_compiler.interpolate( method=method, axis=axis, limit=limit, inplace=False, limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, **kwargs, ), inplace=inplace, ) # TODO: uncomment the following lines when #3331 issue will be closed # @prepend_to_notes( # """ # In comparison with pandas, Modin's ``value_counts`` returns Series with ``MultiIndex`` # only if multiple columns were passed via the `subset` parameter, otherwise, the resulted # Series's index will be a regular single dimensional ``Index``. 
# """ # ) @_inherit_docstrings( pandas.DataFrame.value_counts, apilink="pandas.DataFrame.value_counts" ) def value_counts( self, subset: Sequence[Hashable] | None = None, normalize: bool = False, sort: bool = True, ascending: bool = False, dropna: bool = True, ) -> Series: if subset is None: subset = self._query_compiler.columns with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=".*groupby keys will be sorted anyway.*", category=UserWarning, ) counted_values = self.groupby( by=subset, dropna=dropna, observed=True, sort=False ).size() if sort: if counted_values.name is None: counted_values.name = 0 by = counted_values.name result = counted_values._query_compiler.sort_rows_by_column_values( columns=by, ascending=ascending, ) counted_values = self._create_or_update_from_compiler(result) if isinstance(counted_values, pd.DataFrame): counted_values = counted_values.squeeze(axis=1) if normalize: counted_values = counted_values / counted_values.sum() # TODO: uncomment when strict compability mode will be implemented: # https://github.com/modin-project/modin/issues/3411 # if STRICT_COMPABILITY and not isinstance(counted_values.index, MultiIndex): # counted_values.index = pandas.MultiIndex.from_arrays( # [counted_values.index], names=counted_values.index.names # ) # https://pandas.pydata.org/pandas-docs/version/2.0/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count counted_values.name = "proportion" if normalize else "count" return counted_values def var( self, axis: Axis = 0, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Series | float: # noqa: PR01, RT01, D200 """ Return unbiased variance over requested axis. """ return self._stat_operation( "var", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def __abs__(self) -> Self: """ Return a `BasePandasDataset` with absolute numeric value of each element. Returns ------- BasePandasDataset Object containing the absolute value of each element. 
""" return self.abs() @_doc_binary_op( operation="union", bin_op="and", right="other", **_doc_binary_op_kwargs ) def __and__(self, other) -> Self: return self._binary_op("__and__", other, axis=0) @_doc_binary_op( operation="union", bin_op="rand", right="other", **_doc_binary_op_kwargs ) def __rand__(self, other) -> Self: return self._binary_op("__rand__", other, axis=0) def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None ) -> np.ndarray: """ Return the values as a NumPy array. Parameters ---------- dtype : str or np.dtype, optional The dtype of returned array. copy : bool, default: None This parameter has no effect; the method always returns a copy of the data. Returns ------- arr : np.ndarray NumPy representation of Modin object. """ return self._to_bare_numpy(dtype) def __copy__(self, deep=True) -> Self: """ Return the copy of the `BasePandasDataset`. Parameters ---------- deep : bool, default: True Whether the copy should be deep or not. Returns ------- BasePandasDataset """ return self.copy(deep=deep) def __deepcopy__(self, memo=None) -> Self: """ Return the deep copy of the `BasePandasDataset`. Parameters ---------- memo : Any, optional Deprecated parameter. Returns ------- BasePandasDataset """ return self.copy(deep=True) @_doc_binary_op( operation="equality comparison", bin_op="eq", right="other", **_doc_binary_op_kwargs, ) def __eq__(self, other) -> Self: return self.eq(other) def __finalize__(self, other, method=None, **kwargs) -> Self: """ Propagate metadata from `other` to `self`. Parameters ---------- other : BasePandasDataset The object from which to get the attributes that we are going to propagate. method : str, optional A passed method name providing context on where `__finalize__` was called. **kwargs : dict Additional keywords arguments to be passed to `__finalize__`. 
Returns ------- BasePandasDataset """ return self._default_to_pandas("__finalize__", other, method=method, **kwargs) @_doc_binary_op( operation="greater than or equal comparison", bin_op="ge", right="right", **_doc_binary_op_kwargs, ) def __ge__(self, right) -> Self: return self.ge(right) def __getitem__(self, key) -> Self: """ Retrieve dataset according to `key`. Parameters ---------- key : callable, scalar, slice, str or tuple The global row index to retrieve data from. Returns ------- BasePandasDataset Located dataset. """ if not self._query_compiler.lazy_row_count and len(self) == 0: return self._default_to_pandas("__getitem__", key) # see if we can slice the rows # This lets us reuse code in pandas to error check indexer = None if isinstance(key, slice): indexer = self.index._convert_slice_indexer(key, kind="getitem") if indexer is not None: return self._getitem_slice(indexer) else: return self._getitem(key) def xs( self, key, axis=0, level=None, drop_level: bool = True, ) -> Self: # noqa: PR01, RT01, D200 """ Return cross-section from the Series/DataFrame. """ axis = self._get_axis_number(axis) labels = self.columns if axis else self.index if isinstance(key, list): # deprecated in pandas, to be removed in 2.0 warnings.warn( "Passing lists as key for xs is deprecated and will be removed in a " + "future version. 
Pass key as a tuple instead.", FutureWarning, ) if level is not None: if not isinstance(labels, pandas.MultiIndex): raise TypeError("Index must be a MultiIndex") loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) # create the tuple of the indexer _indexer = [slice(None)] * self.ndim _indexer[axis] = loc indexer = tuple(_indexer) result = self.iloc[indexer] setattr(result, self._pandas_class._get_axis_name(axis), new_ax) return result if axis == 1: if drop_level: return self[key] index = self.columns else: index = self.index new_index = None if isinstance(index, pandas.MultiIndex): loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if is_integer(loc): new_index = index[loc : loc + 1] else: new_index = index[loc] else: loc = index.get_loc(key) if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: (loc,) = loc.nonzero() # Note: pandas uses self._take_with_is_copy here return self.take(loc, axis=axis) if not is_scalar(loc): new_index = index[loc] if is_scalar(loc) and axis == 0: # In this case loc should be an integer if self.ndim == 1: # if we encounter an array-like and we only have 1 dim # that means that their are list/ndarrays inside the Series! # so just return them (pandas GH 6394) return self.iloc[loc] result = self.iloc[loc] elif is_scalar(loc): result = self.iloc[:, slice(loc, loc + 1)] elif axis == 1: result = self.iloc[:, loc] else: result = self.iloc[loc] if new_index is None: raise RuntimeError( "`new_index` variable shouldn't be equal to None here, something went wrong." ) result.index = new_index # Note: pandas does result._set_is_copy here return result __hash__ = None def _setitem_slice(self, key: slice, value) -> None: """ Set rows specified by `key` slice with `value`. Parameters ---------- key : location or index-based slice Key that points rows to modify. value : object Value to assing to the rows. 
""" indexer = self.index._convert_slice_indexer(key, kind="getitem") self.iloc[indexer] = value def _getitem_slice(self, key: slice) -> Self: """ Get rows specified by `key` slice. Parameters ---------- key : location or index-based slice Key that points to rows to retrieve. Returns ------- modin.pandas.BasePandasDataset Selected rows. """ if is_full_grab_slice( key, # Avoid triggering shape computation for lazy executions sequence_len=(None if self._query_compiler.lazy_row_count else len(self)), ): return self.copy() return self.iloc[key] @_doc_binary_op( operation="greater than comparison", bin_op="gt", right="right", **_doc_binary_op_kwargs, ) def __gt__(self, right) -> Self: return self.gt(right) def __invert__(self) -> Self: """ Apply bitwise inverse to each element of the `BasePandasDataset`. Returns ------- BasePandasDataset New BasePandasDataset containing bitwise inverse to each value. """ if not all(is_bool_dtype(d) or is_integer_dtype(d) for d in self._get_dtypes()): raise TypeError( "bad operand type for unary ~: '{}'".format( next( d for d in self._get_dtypes() if not (is_bool_dtype(d) or is_integer_dtype(d)) ) ) ) return self.__constructor__(query_compiler=self._query_compiler.invert()) @_doc_binary_op( operation="less than or equal comparison", bin_op="le", right="right", **_doc_binary_op_kwargs, ) def __le__(self, right) -> Self: return self.le(right) def __len__(self) -> int: """ Return length of info axis. Returns ------- int """ return self._query_compiler.get_axis_len(0) @_doc_binary_op( operation="less than comparison", bin_op="lt", right="right", **_doc_binary_op_kwargs, ) def __lt__(self, right) -> Self: return self.lt(right) def __matmul__(self, other) -> Self | np.ndarray | Scalar: """ Compute the matrix multiplication between the `BasePandasDataset` and `other`. Parameters ---------- other : BasePandasDataset or array-like The other object to compute the matrix product with. 
@property
def size(self) -> int:  # noqa: RT01, D200
    """
    Return an int representing the number of elements in this `BasePandasDataset` object.
    """
    # Ask the query compiler for the axis lengths (the same call ``__len__``
    # uses) rather than taking ``len()`` of the index and columns labels.
    query_compiler = self._query_compiler
    return query_compiler.get_axis_len(0) * query_compiler.get_axis_len(1)
def __array_ufunc__(
    self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
) -> DataFrame | Series | Any:
    """
    Apply the `ufunc` to the `BasePandasDataset`.

    Parameters
    ----------
    ufunc : np.ufunc
        The NumPy ufunc to apply.
    method : str
        The method to apply.
    *inputs : tuple
        The inputs to the ufunc.
    **kwargs : dict
        Additional keyword arguments.

    Returns
    -------
    BasePandasDataset
        The result of the ufunc applied to the `BasePandasDataset`.
    """
    # The query compiler owns the ufunc implementation; this object is
    # forwarded as the first argument together with the untouched inputs.
    compiler = self._query_compiler
    return compiler.do_array_ufunc_implementation(
        self, ufunc, method, *inputs, **kwargs
    )
# NOTE: no inline docstring here; the ``doc`` decorator is given
# ``SET_BACKEND_DOC`` and ``class_name`` and presumably builds the
# user-facing docstring from that template, so explanations are kept in
# ``#`` comments.
@doc(SET_BACKEND_DOC, class_name=__qualname__)
def set_backend(
    self, backend: str, inplace: bool = False, *, switch_operation: str = None
) -> Optional[Self]:
    # TODO(https://github.com/modin-project/modin/issues/7467): refactor
    # to avoid this cyclic import in most places we do I/O, e.g. in
    # modin/pandas/io.py
    from modin.core.execution.dispatching.factories.dispatcher import (
        FactoryDispatcher,
    )

    # The transfer below advances `progress_iter` in two steps; by default it
    # is a plain range iterator, so advancing it has no visible effect.
    progress_split_count = 2
    progress_iter = iter(range(progress_split_count))
    self_backend = self.get_backend()
    normalized_backend = Backend.normalize(backend)
    if normalized_backend != self_backend:
        max_rows, max_cols = self._query_compiler._max_shape()

        # Format the transfer string to be relatively short, but informative. Each
        # backend is given an allowable width of 10 and the shape integers use the
        # general format to use scientific notation when needed.
        std_field_length = 10
        operation_str = switch_operation
        self_backend_str = self_backend
        normalized_backend_str = normalized_backend
        if switch_operation is None:
            operation_str = ""
        # Provide the switch_operation; and specifically only the method, so
        # DataFrame.merge would become "merge"
        operation_str = operation_str.split(".")[-1]
        # truncate all strings to the field length if needed
        # (the operation name gets a wider field of 15 characters)
        if len(operation_str) > 15:
            operation_str = operation_str[: 15 - 3] + "..."
        if len(self_backend_str) > std_field_length:
            self_backend_str = self_backend_str[: std_field_length - 3] + "..."
        if len(normalized_backend_str) > std_field_length:
            normalized_backend_str = (
                normalized_backend_str[: std_field_length - 3] + "..."
            )

        # format the estimated max shape
        max_shape_str = f"({max_rows:.0g}, {max_cols:.0g})"
        desc = (
            f"Transfer: {self_backend_str:>10.10} → {normalized_backend_str:<10.10} "
            + f" | {operation_str:^15.15} ≃ {max_shape_str:<10}"
        )

        if ShowBackendSwitchProgress.get():
            try:
                from tqdm.auto import trange

                # Swap the silent iterator for a tqdm bar labelled with `desc`.
                progress_iter = iter(
                    trange(
                        progress_split_count, desc=desc, bar_format="{desc} [{bar}]"
                    )
                )
            except ImportError:
                # Fallback to simple print statement when tqdm is not available.
                # Print to stderr to match tqdm's behavior.
                print(desc, file=sys.stderr)  # noqa: T201
        else:
            # Use a dummy progress iterator with no side effects if we do
            # not want to show the progress.
            progress_iter = iter(range(progress_split_count))
    else:
        # Already on the requested backend: nothing is moved and the pin
        # state is left untouched.
        return None if inplace else self
    # If tqdm is imported and a conversion is necessary, then display a progress bar.
    # Otherwise, use fallback print statements.
    next(progress_iter)

    # Attempt to transfer data based on the following preference order.
    # 1. The `self._query_compiler.move_to()`, if implemented.
    # 2. Otherwise, tries the other `query_compiler`'s `move_from()` method.
    # 3. If both methods return `NotImplemented`, it falls back to materializing
    #    as a pandas DataFrame, and then creates a new `query_compiler` on the
    #    specified backend using `from_pandas`.
    query_compiler = self._query_compiler.move_to(backend)
    if query_compiler is NotImplemented:
        query_compiler = FactoryDispatcher._get_prepared_factory_for_backend(
            backend
        ).io_cls.query_compiler_cls.move_from(
            self._query_compiler,
        )
    if query_compiler is NotImplemented:
        pandas_self = self._query_compiler.to_pandas()
        # Second progress step is taken between materializing and re-importing.
        next(progress_iter)
        query_compiler = FactoryDispatcher.from_pandas(
            df=pandas_self, backend=backend
        )
    else:
        next(progress_iter)
    try:
        next(progress_iter)
    except StopIteration:
        # Last call to next informs tqdm that the operation is done
        pass
    if inplace:
        self._update_inplace(query_compiler)
        # Always unpin after an explicit set_backend operation
        self._pinned = False
        return None
    else:
        return self.__constructor__(query_compiler=query_compiler)
@disable_logging
@_inherit_docstrings(QueryCompilerCaster._get_query_compiler)
def _get_query_compiler(self):
    # The attribute may not exist yet, so a missing attribute yields None
    # instead of propagating AttributeError.
    try:
        return self._query_compiler
    except AttributeError:
        return None
"""Module houses ``DataFrame`` class, that is distributed version of ``pandas.DataFrame``.""" from __future__ import annotations import datetime import functools import itertools import os import re import sys import warnings from typing import ( IO, TYPE_CHECKING, Any, Hashable, Iterable, Iterator, Optional, Sequence, Union, ) import numpy as np import pandas from pandas import Categorical from pandas._libs import lib from pandas._typing import ( CompressionOptions, FilePath, IndexLabel, Scalar, StorageOptions, WriteBuffer, ) from pandas.core.common import apply_if_callable, get_cython_func from pandas.core.dtypes.common import ( infer_dtype_from_object, is_dict_like, is_list_like, is_numeric_dtype, ) from pandas.core.indexes.frozen import FrozenList from pandas.io.formats.info import DataFrameInfo from pandas.util._decorators import doc from pandas.util._validators import validate_bool_kwarg from modin.config import PersistentPickle from modin.core.storage_formats.pandas.query_compiler_caster import ( EXTENSION_DICT_TYPE, EXTENSION_NO_LOOKUP, ) from modin.error_message import ErrorMessage from modin.logging import disable_logging from modin.pandas.io import from_non_pandas, from_pandas, to_pandas from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, expanduser_path_arg, hashable, import_optional_dependency, sentinel, try_cast_to_pandas, ) from .accessor import CachedAccessor, SparseFrameAccessor from .base import _ATTRS_NO_LOOKUP, BasePandasDataset from .groupby import DataFrameGroupBy from .iterator import PartitionIterator from .series import Series from .utils import ( GET_BACKEND_DOC, SET_BACKEND_DOC, SET_DATAFRAME_ATTRIBUTE_WARNING, _doc_binary_op, cast_function_modin2pandas, ) if TYPE_CHECKING: from typing_extensions import Self from modin.core.storage_formats import BaseQueryCompiler @_inherit_docstrings( pandas.DataFrame, excluded=[pandas.DataFrame.__init__], apilink="pandas.DataFrame" ) class DataFrame(BasePandasDataset): """ Modin 
distributed representation of ``pandas.DataFrame``. Internally, the data can be divided into partitions along both columns and rows in order to parallelize computations and utilize the user's hardware as much as possible. Inherit common for ``DataFrame``-s and ``Series`` functionality from the `BasePandasDataset` class. Parameters ---------- data : DataFrame, Series, pandas.DataFrame, ndarray, Iterable or dict, optional Dict can contain ``Series``, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order. index : Index or array-like, optional Index to use for resulting frame. Will default to ``RangeIndex`` if no indexing information part of input data and no index provided. columns : Index or array-like, optional Column labels to use for resulting frame. Will default to ``RangeIndex`` if no column labels are provided. dtype : str, np.dtype, or pandas.ExtensionDtype, optional Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default: False Copy data from inputs. Only affects ``pandas.DataFrame`` / 2d ndarray input. query_compiler : BaseQueryCompiler, optional A query compiler object to create the ``DataFrame`` from. Notes ----- ``DataFrame`` can be created either from passed `data` or `query_compiler`. If both parameters are provided, data source will be prioritized in the following order: 1) Modin ``DataFrame`` or ``Series`` passed with `data` parameter. 2) Query compiler from the `query_compiler` parameter. 3) Various pandas/NumPy/Python data structures passed with `data` parameter. The last option is less desirable since import of such data structures is very inefficient, please use previously created Modin structures from the first two options or import data using highly efficient Modin IO tools (for example ``pd.read_csv``).
""" _pandas_class = pandas.DataFrame _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) def __init__( self, data=None, index=None, columns=None, dtype=None, copy=None, query_compiler: BaseQueryCompiler = None, ) -> None: from modin.numpy import array # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. self._siblings = [] if isinstance(data, (DataFrame, Series)): self._query_compiler = data._query_compiler.copy() if index is not None and any(i not in data.index for i in index): raise NotImplementedError( "Passing non-existant columns or index values to constructor not" + " yet implemented." ) if isinstance(data, Series): # We set the column name if it is not in the provided Series if data.name is None: self.columns = [0] if columns is None else columns # If the columns provided are not in the named Series, pandas clears # the DataFrame and sets columns to the columns provided. elif columns is not None and data.name not in columns: self._query_compiler = from_pandas( pandas.DataFrame(columns=columns) )._query_compiler if index is not None: self._query_compiler = data.loc[index]._query_compiler elif columns is None and index is None: data._add_sibling(self) else: if columns is not None and any(i not in data.columns for i in columns): raise NotImplementedError( "Passing non-existant columns or index values to constructor not" + " yet implemented." 
) if index is None: index = slice(None) if columns is None: columns = slice(None) self._query_compiler = data.loc[index, columns]._query_compiler elif isinstance(data, array): self._query_compiler = data._query_compiler.copy() if copy is not None and not copy: data._add_sibling(self) if columns is not None and not isinstance(columns, pandas.Index): columns = pandas.Index(columns) if columns is not None: obj_with_new_columns = self.set_axis(columns, axis=1, copy=False) self._update_inplace(obj_with_new_columns._query_compiler) if index is not None: obj_with_new_index = self.set_axis(index, axis=0, copy=False) self._update_inplace(obj_with_new_index._query_compiler) if dtype is not None: casted_obj = self.astype(dtype, copy=False) self._query_compiler = casted_obj._query_compiler # Check type of data and use appropriate constructor elif query_compiler is None: distributed_frame = from_non_pandas(data, index, columns, dtype) if distributed_frame is not None: self._query_compiler = distributed_frame._query_compiler return if isinstance(data, pandas.Index): pass elif ( is_list_like(data) and not is_dict_like(data) and not isinstance(data, np.ndarray) ): old_dtype = getattr(data, "dtype", None) values = [ obj._to_pandas() if isinstance(obj, Series) else obj for obj in data ] try: data = type(data)(values, dtype=old_dtype) except TypeError: data = values elif is_dict_like(data) and not isinstance( data, (pandas.Series, Series, pandas.DataFrame, DataFrame) ): if columns is not None: data = {key: value for key, value in data.items() if key in columns} if len(data) and all(isinstance(v, Series) for v in data.values()): from .general import concat new_qc = concat( data.values(), axis=1, keys=data.keys() )._query_compiler if dtype is not None: new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) if index is not None: new_qc = new_qc.reindex(axis=0, labels=index) if columns is not None: new_qc = new_qc.reindex(axis=1, labels=columns) self._query_compiler = new_qc 
def _repr_html_(self) -> str:  # pragma: no cover
    """
    Return a html representation for a particular ``DataFrame``.

    Returns
    -------
    str
    """
    num_rows = pandas.get_option("display.max_rows") or 60
    num_cols = pandas.get_option("display.max_columns") or 20

    # We use pandas _repr_html_ to get a string of the HTML representation
    # of the dataframe.
    result = self._build_repr_df(num_rows, num_cols)._repr_html_()
    if len(self) > num_rows or self._query_compiler.get_axis_len(1) > num_cols:
        # We split so that we insert our correct dataframe dimensions: the
        # preview's own "<p>… rows × … columns</p>" footer describes only the
        # truncated frame, so it is cut off and rebuilt from ``self.shape``.
        table_html = result.split("<p>")[0]
        footer = "<p>{0} rows x {1} columns</p>\n</div>".format(*self.shape)
        return table_html + footer
    else:
        return result
def map(self, func, na_action: Optional[str] = None, **kwargs) -> DataFrame:
    # No inline docstring: presumably supplied by the class-level
    # ``_inherit_docstrings(pandas.DataFrame, ...)`` decorator.
    if callable(func):
        mapped_qc = self._query_compiler.map(func, na_action=na_action, **kwargs)
        return self.__constructor__(query_compiler=mapped_qc)
    # Anything that cannot be called is rejected before touching the data.
    raise ValueError("'{0}' object is not callable".format(type(func)))
def groupby(
    self,
    by=None,
    axis=lib.no_default,
    level=None,
    as_index=True,
    sort=True,
    group_keys=True,
    observed=lib.no_default,
    dropna: bool = True,
):  # noqa: PR01, RT01, D200
    """
    Group ``DataFrame`` using a mapper or by a ``Series`` of columns.
    """
    # An explicitly passed `axis` always triggers a FutureWarning (a different
    # message for axis=1); when it is omitted, axis 0 is used silently.
    if axis is not lib.no_default:
        axis = self._get_axis_number(axis)
        if axis == 1:
            warnings.warn(
                "DataFrame.groupby with axis=1 is deprecated. Do "
                + "`frame.T.groupby(...)` without axis instead.",
                FutureWarning,
            )
        else:
            warnings.warn(
                "The 'axis' keyword in DataFrame.groupby is deprecated and "
                + "will be removed in a future version.",
                FutureWarning,
            )
    else:
        axis = 0

    axis = self._get_axis_number(axis)
    # `idx_name` is forwarded to DataFrameGroupBy; it is only set when `by` is
    # a single hashable label or a Series (then it is the Series name).
    idx_name = None
    # Drop here indicates whether or not to drop the data column before doing the
    # groupby. The typical pandas behavior is to drop when the data came from this
    # dataframe. When a string, Series directly from this dataframe, or list of
    # strings is passed in, the data used for the groupby is dropped before the
    # groupby takes place.
    drop = False

    # A one-element list-like `by` is unwrapped to its single element, and the
    # flag records that this happened for DataFrameGroupBy.
    return_tuple_when_iterating = False
    if (
        not isinstance(by, (pandas.Series, Series))
        and is_list_like(by)
        and len(by) == 1
    ):
        by = by[0]
        return_tuple_when_iterating = True

    if callable(by):
        # A callable is applied to the index labels to produce the keys.
        by = self.index.map(by)
    elif hashable(by) and not isinstance(by, (pandas.Grouper, FrozenList)):
        drop = by in self.columns
        idx_name = by
        if by is not None and by in self._query_compiler.get_index_names(axis):
            # In this case we pass the string value of the name through to the
            # partitions. This is more efficient than broadcasting the values.
            level, by = by, None
        elif level is None:
            by = self.__getitem__(by)._query_compiler
    elif isinstance(by, Series):
        # Only drop the key column when the Series was taken from this frame.
        drop = by._parent is self
        idx_name = by.name
        by = by._query_compiler
    elif isinstance(by, pandas.Grouper):
        drop = by.key in self
    elif is_list_like(by):
        # fastpath for multi column groupby
        if axis == 0 and all(
            (
                (hashable(o) and (o in self))
                or isinstance(o, Series)
                or (isinstance(o, pandas.Grouper) and o.key in self)
                or (is_list_like(o) and len(o) == len(self._get_axis(axis)))
            )
            for o in by
        ):
            # `has_external` becomes True as soon as one key is not a plain
            # column label of this frame (a Grouper, a Series from another
            # frame, or a raw list-like of values).
            has_external = False
            processed_by = []

            for current_by in by:
                if isinstance(current_by, pandas.Grouper):
                    processed_by.append(current_by)
                    has_external = True
                elif hashable(current_by):
                    processed_by.append(current_by)
                elif isinstance(current_by, Series):
                    if current_by._parent is self:
                        # A Series from this frame is replaced by its name.
                        processed_by.append(current_by.name)
                    else:
                        processed_by.append(current_by._query_compiler)
                        has_external = True
                else:
                    has_external = True
                    processed_by.append(current_by)

            by = processed_by

            if not has_external:
                # All keys are labels of this frame: select them in one go and
                # pass the resulting query compiler as `by`.
                by = self[processed_by]._query_compiler

            drop = True
        else:
            mismatch = len(by) != len(self._get_axis(axis))
            if mismatch and all(
                hashable(obj)
                and (
                    obj in self or obj in self._query_compiler.get_index_names(axis)
                )
                for obj in by
            ):
                # In the future, we will need to add logic to handle this, but for now
                # we default to pandas in this case.
                pass
            elif mismatch and any(
                hashable(obj) and obj not in self.columns for obj in by
            ):
                # Report the first requested key that this frame does not contain.
                names = [o.name if isinstance(o, Series) else o for o in by]
                raise KeyError(next(x for x in names if x not in self))
    return DataFrameGroupBy(
        self,
        by,
        axis,
        level,
        as_index,
        sort,
        group_keys,
        idx_name,
        observed=observed,
        drop=drop,
        dropna=dropna,
        return_tuple_when_iterating=return_tuple_when_iterating,
        backend_pinned=self.is_backend_pinned(),
    )
def corr(
    self, method="pearson", min_periods=1, numeric_only=False
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Compute pairwise correlation of columns, excluding NA/null values.
    """
    # The query compiler does the work; the result is wrapped back into a
    # frame of the same kind as `self`.
    corr_qc = self._query_compiler.corr(
        method=method,
        min_periods=min_periods,
        numeric_only=numeric_only,
    )
    return self.__constructor__(query_compiler=corr_qc)
def cov(
    self, min_periods=None, ddof: Optional[int] = 1, numeric_only=False
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Compute pairwise covariance of columns, excluding NA/null values.
    """
    cov_df = self
    if numeric_only:
        # Fetch the dtypes once instead of once per column.
        dtypes = self.dtypes
        cov_df = self.drop(
            columns=[
                label for label in dtypes.index if not is_numeric_dtype(dtypes[label])
            ]
        )

    if min_periods is not None and min_periods > len(cov_df):
        # Too few rows for any pair of columns: the result is all-NaN. Label
        # it with the column names on both axes, as pandas does, instead of
        # letting the constructor fall back to default integer labels.
        labels = cov_df.columns
        n_cols = cov_df.shape[1]
        result = np.full((n_cols, n_cols), np.nan)
        return cov_df.__constructor__(result, index=labels, columns=labels)

    return cov_df.__constructor__(
        query_compiler=cov_df._query_compiler.cov(
            min_periods=min_periods, ddof=ddof
        )
    )
def equals(self, other) -> bool:  # noqa: PR01, RT01, D200
    """
    Test whether two objects contain the same elements.
    """
    if isinstance(other, pandas.DataFrame):
        # Copy into a Modin DataFrame to simplify logic below
        other = self.__constructor__(other)

    # Cheap checks first: same class, then identical row and column labels.
    # Evaluation stops at the first failing check, so `other.index` is never
    # read when the types already differ.
    comparable = (
        type(self) is type(other)
        and self.index.equals(other.index)
        and self.columns.equals(other.columns)
    )
    if not comparable:
        return False

    elementwise_qc = self._query_compiler.equals(other._query_compiler)
    elementwise = self.__constructor__(query_compiler=elementwise_qc)
    return elementwise.all(axis=None)
kwargs : dict See the documentation for eval() for complete details on the keyword arguments accepted by query(). """ if "@" not in expr: return frame = sys._getframe() try: # TODO(https://github.com/modin-project/modin/issues/4478): fix this f_locals = frame.f_back.f_back.f_back.f_back.f_back.f_back.f_locals f_globals = frame.f_back.f_back.f_back.f_back.f_back.f_back.f_globals finally: del frame local_names = set(re.findall(r"@([\w]+)", expr)) local_dict = {} global_dict = {} for name in local_names: for dct_out, dct_in in ((local_dict, f_locals), (global_dict, f_globals)): try: dct_out[name] = dct_in[name] except KeyError: pass if local_dict: local_dict.update(kwargs.get("local_dict") or {}) kwargs["local_dict"] = local_dict if global_dict: global_dict.update(kwargs.get("global_dict") or {}) kwargs["global_dict"] = global_dict def eval(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200 """ Evaluate a string describing operations on ``DataFrame`` columns. """ from modin.core.computation.eval import _check_engine self._update_var_dicts_in_kwargs(expr, kwargs) inplace = validate_bool_kwarg(inplace, "inplace") if _check_engine(kwargs.get("engine", None)) == "numexpr": # on numexpr engine, pandas.eval returns np.array if input is not of pandas # type, so we can't use pandas eval [1]. Even if we could, pandas eval seems # to convert all the data to numpy and then do the numexpr add, which is # slow for modin. The user would not really be getting the benefit of # numexpr. 
# [1] https://github.com/pandas-dev/pandas/blob/934eebb532cf50e872f40638a788000be6e4dda4/pandas/core/computation/align.py#L78
            return self._default_to_pandas(
                pandas.DataFrame.eval, expr, inplace=inplace, **kwargs
            )

        from modin.core.computation.eval import eval as _eval

        # One extra stack level is added to the caller-supplied `level`.
        kwargs["level"] = kwargs.pop("level", 0) + 1
        index_resolvers = self._get_index_resolvers()
        column_resolvers = self._get_cleaned_column_resolvers()
        resolvers = column_resolvers, index_resolvers
        if "target" not in kwargs:
            kwargs["target"] = self
        # Caller-supplied resolvers come first, followed by column and index ones.
        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

        return _eval(expr, inplace=inplace, **kwargs)

    def fillna(
        self,
        value=None,
        *,
        method=None,
        axis=None,
        inplace=False,
        limit=None,
        downcast=lib.no_default,
    ) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
        """
        Fill NA/NaN values using the specified method.
        """
        return super(DataFrame, self).fillna(
            squeeze_self=False,
            squeeze_value=isinstance(value, Series),
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )

    def floordiv(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get integer division of ``DataFrame`` and `other`, element-wise (binary operator `floordiv`).
        """
        return self._binary_op(
            "floordiv",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    @classmethod
    def from_dict(
        cls, data, orient="columns", dtype=None, columns=None
    ) -> DataFrame:  # pragma: no cover # noqa: PR01, RT01, D200
        """
        Construct ``DataFrame`` from dict of array-like or dicts.
        """
        # Built with pandas and then converted, hence the default-to-pandas notice.
        ErrorMessage.default_to_pandas("`from_dict`")
        return from_pandas(
            pandas.DataFrame.from_dict(
                data, orient=orient, dtype=dtype, columns=columns
            )
        )

    @classmethod
    def from_records(
        cls,
        data,
        index=None,
        exclude=None,
        columns=None,
        coerce_float=False,
        nrows=None,
    ) -> DataFrame:  # pragma: no cover # noqa: PR01, RT01, D200
        """
        Convert structured or record ndarray to ``DataFrame``.
        """
        # Built with pandas and then converted, hence the default-to-pandas notice.
        ErrorMessage.default_to_pandas("`from_records`")
        return from_pandas(
            pandas.DataFrame.from_records(
                data,
                index=index,
                exclude=exclude,
                columns=columns,
                coerce_float=coerce_float,
                nrows=nrows,
            )
        )

    def ge(
        self, other, axis="columns", level=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get greater than or equal comparison of ``DataFrame`` and `other`, element-wise (binary operator `ge`).
        """
        return self._binary_op(
            "ge", other, axis=axis, level=level, broadcast=isinstance(other, Series)
        )

    def gt(
        self, other, axis="columns", level=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get greater than comparison of ``DataFrame`` and `other`, element-wise (binary operator `gt`).
        """
        return self._binary_op(
            "gt", other, axis=axis, level=level, broadcast=isinstance(other, Series)
        )

    def hist(
        data,
        column: IndexLabel | None = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):  # pragma: no cover # noqa: PR01, RT01, D200
        """
        Make a histogram of the ``DataFrame``.
        """
        # NOTE: the instance parameter is named `data` rather than `self` here.
        return data._default_to_pandas(
            pandas.DataFrame.hist,
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )

    def info(
        self,
        verbose: Optional[bool] = None,
        buf: Optional[IO[str]] = None,
        max_cols: Optional[int] = None,
        memory_usage: Optional[Union[bool, str]] = None,
        show_counts: Optional[bool] = None,
    ) -> None:  # noqa: PR01, D200
        """
        Print a concise summary of the ``DataFrame``.
"""
        info = DataFrameInfo(
            data=self,
            memory_usage=memory_usage,
        )
        info.render(
            buf=buf,
            max_cols=max_cols,
            verbose=verbose,
            show_counts=show_counts,
        )

    def insert(
        self, loc, column, value, allow_duplicates=lib.no_default
    ) -> None:  # noqa: PR01, D200
        """
        Insert column into ``DataFrame`` at specified location.
        """
        from modin.numpy import array

        # Two-dimensional inputs are accepted only when they hold exactly one
        # column; they are then squeezed to one dimension.
        if (
            isinstance(value, (DataFrame, pandas.DataFrame))
            or isinstance(value, (array, np.ndarray))
            and len(value.shape) > 1
        ):
            if isinstance(value, (array, np.ndarray)) and value.shape[1] != 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.shape}"
                )
            elif (
                isinstance(value, (DataFrame, pandas.DataFrame)) and value.shape[1] != 1
            ):
                raise ValueError(
                    "Expected a one-dimensional object, got a DataFrame with "
                    + f"{len(value.columns)} columns instead."
                )
            value = value.squeeze(axis=1)
        if not self._query_compiler.lazy_row_count and len(self) == 0:
            # The frame has no rows: the new frame takes its index from `value`.
            if not hasattr(value, "index"):
                try:
                    value = pandas.Series(value)
                except (TypeError, ValueError, IndexError):
                    raise ValueError(
                        "Cannot insert into a DataFrame with no defined index "
                        + "and a value that cannot be converted to a "
                        + "Series"
                    )
            new_index = value.index.copy()
            new_columns = self.columns.insert(loc, column)
            new_query_compiler = self.__constructor__(
                value, index=new_index, columns=new_columns
            )._query_compiler
        elif self._query_compiler.get_axis_len(1) == 0 and loc == 0:
            # The frame has no columns: build a single-column frame, reusing
            # the current index unless it is empty.
            new_index = self.index
            new_query_compiler = self.__constructor__(
                data=value,
                columns=[column],
                index=None if len(new_index) == 0 else new_index,
            )._query_compiler
        else:
            # General case: validate length, duplicates and position, then
            # delegate to the query compiler.
            if (
                is_list_like(value)
                and not isinstance(value, (pandas.Series, Series))
                and len(value) != len(self)
            ):
                raise ValueError(
                    "Length of values ({}) does not match length of index ({})".format(
                        len(value), len(self)
                    )
                )
            if allow_duplicates is not True and column in self.columns:
                raise ValueError(f"cannot insert {column}, already exists")
            columns_len = self._query_compiler.get_axis_len(1)
            if not -columns_len <= loc <= columns_len:
                raise IndexError(
                    f"index {loc} is out of bounds for axis 0 with size {columns_len}"
                )
            elif loc < 0:
                raise ValueError("unbounded slice")
            if isinstance(value, (Series, array)):
                value = value._query_compiler
            new_query_compiler = self._query_compiler.insert(loc, column, value)

        self._update_inplace(new_query_compiler=new_query_compiler)

    def isna(self) -> DataFrame:
        """
        Detect missing values.

        Returns
        -------
        DataFrame
            The result of detecting missing values.
        """
        return super(DataFrame, self).isna()

    def isnull(self) -> DataFrame:
        """
        Detect missing values.

        Returns
        -------
        DataFrame
            The result of detecting missing values.
        """
        return super(DataFrame, self).isnull()

    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:  # noqa: D200
        """
        Iterate over ``DataFrame`` rows as (index, ``Series``) pairs.
        """

        def iterrow_builder(s):
            """Return tuple of the given `s` parameter name and the parameter themself."""
            return s.name, s

        # Axis 0: one item is produced per row.
        partition_iterator = PartitionIterator(self, 0, iterrow_builder)
        for v in partition_iterator:
            yield v

    def items(self) -> Iterable[tuple[Hashable, Series]]:  # noqa: D200
        """
        Iterate over (column name, ``Series``) pairs.
        """

        def items_builder(s):
            """Return tuple of the given `s` parameter name and the parameter themself."""
            return s.name, s

        # Axis 1: one item is produced per column.
        partition_iterator = PartitionIterator(self, 1, items_builder)
        for v in partition_iterator:
            yield v

    def itertuples(
        self, index=True, name="Pandas"
    ) -> Iterable[tuple[Any, ...]]:  # noqa: PR01, D200
        """
        Iterate over ``DataFrame`` rows as ``namedtuple``-s.
        """

        def itertuples_builder(s):
            """Return the next ``namedtuple``."""
            # Turn the row into a one-row pandas frame and let pandas build
            # the namedtuple for it.
            return next(s._to_pandas().to_frame().T.itertuples(index=index, name=name))

        partition_iterator = PartitionIterator(self, 0, itertuples_builder)
        for v in partition_iterator:
            yield v

    def join(
        self,
        other,
        on=None,
        how="left",
        lsuffix="",
        rsuffix="",
        sort=False,
        validate=None,
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Join columns of another ``DataFrame``.
"""
        if on is not None and not isinstance(other, (Series, DataFrame)):
            raise ValueError(
                "Joining multiple DataFrames only supported for joining on index"
            )
        if validate is not None:
            # Whenever `validate` is given, the join defaults to pandas.
            return self._default_to_pandas(
                pandas.DataFrame.join,
                other,
                on=on,
                how=how,
                lsuffix=lsuffix,
                rsuffix=rsuffix,
                sort=sort,
                validate=validate,
            )
        if isinstance(other, Series):
            if other.name is None:
                raise ValueError("Other Series must have a name")
            other = self.__constructor__(other)
        if on is not None or how == "cross":
            return self.__constructor__(
                query_compiler=self._query_compiler.join(
                    other._query_compiler,
                    on=on,
                    how=how,
                    lsuffix=lsuffix,
                    rsuffix=rsuffix,
                    sort=sort,
                    validate=validate,
                )
            )
        if isinstance(other, DataFrame):
            # Joining the empty DataFrames with either index or columns is
            # fast. It gives us proper error checking for the edge cases that
            # would otherwise require a lot more logic.
            new_columns = (
                pandas.DataFrame(columns=self.columns)
                .join(
                    pandas.DataFrame(columns=other.columns),
                    lsuffix=lsuffix,
                    rsuffix=rsuffix,
                )
                .columns
            )
            other = [other]
        else:
            # `other` is an iterable of frames: same empty-frame trick, applied
            # to every element.
            new_columns = (
                pandas.DataFrame(columns=self.columns)
                .join(
                    [pandas.DataFrame(columns=obj.columns) for obj in other],
                    lsuffix=lsuffix,
                    rsuffix=rsuffix,
                )
                .columns
            )
        # The index join itself is a column-wise concat; the column labels
        # computed above are assigned afterwards.
        new_frame = self.__constructor__(
            query_compiler=self._query_compiler.concat(
                1, [obj._query_compiler for obj in other], join=how, sort=sort
            )
        )
        new_frame.columns = new_columns
        return new_frame

    def isetitem(self, loc, value) -> None:
        # Delegates to ``pandas.DataFrame.isetitem`` via the default-to-pandas path.
        return self._default_to_pandas(
            pandas.DataFrame.isetitem,
            loc=loc,
            value=value,
        )

    def le(
        self, other, axis="columns", level=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get less than or equal comparison of ``DataFrame`` and `other`, element-wise (binary operator `le`).
        """
        return self._binary_op(
            "le", other, axis=axis, level=level, broadcast=isinstance(other, Series)
        )

    def lt(
        self, other, axis="columns", level=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get less than comparison of ``DataFrame`` and `other`, element-wise (binary operator `lt`).
        """
        return self._binary_op(
            "lt", other, axis=axis, level=level, broadcast=isinstance(other, Series)
        )

    def melt(
        self,
        id_vars=None,
        value_vars=None,
        var_name=None,
        value_name="value",
        col_level=None,
        ignore_index=True,
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Unpivot a ``DataFrame`` from wide to long format, optionally leaving identifiers set.
        """
        # Normalise the defaults before delegating to the query compiler.
        if id_vars is None:
            id_vars = []
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        if value_vars is None:
            # By default every non-identifier column is unpivoted.
            value_vars = self.columns.drop(id_vars)
        if var_name is None:
            columns_name = self._query_compiler.get_index_name(axis=1)
            var_name = columns_name if columns_name is not None else "variable"
        return self.__constructor__(
            query_compiler=self._query_compiler.melt(
                id_vars=id_vars,
                value_vars=value_vars,
                var_name=var_name,
                value_name=value_name,
                col_level=col_level,
                ignore_index=ignore_index,
            )
        )

    def merge(
        self,
        right,
        how="inner",
        on=None,
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        sort=False,
        suffixes=("_x", "_y"),
        copy=None,
        indicator=False,
        validate=None,
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Merge ``DataFrame`` or named ``Series`` objects with a database-style join.
"""
        if copy is None:
            copy = True
        if isinstance(right, Series):
            if right.name is None:
                raise ValueError("Cannot merge a Series without a name")
            else:
                right = right.to_frame()
        if not isinstance(right, DataFrame):
            raise TypeError(
                f"Can only merge Series or DataFrame objects, a {type(right)} was passed"
            )

        # If we are joining on the index and we are using
        # default parameters we can map this to a join
        if left_index and right_index and not indicator:
            return self.join(
                right, how=how, lsuffix=suffixes[0], rsuffix=suffixes[1], sort=sort
            )

        return self.__constructor__(
            query_compiler=self._query_compiler.merge(
                right._query_compiler,
                how=how,
                on=on,
                left_on=left_on,
                right_on=right_on,
                left_index=left_index,
                right_index=right_index,
                sort=sort,
                suffixes=suffixes,
                copy=copy,
                indicator=indicator,
                validate=validate,
            )
        )

    def mod(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get modulo of ``DataFrame`` and `other`, element-wise (binary operator `mod`).
        """
        return self._binary_op(
            "mod",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def mul(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get multiplication of ``DataFrame`` and `other`, element-wise (binary operator `mul`).
        """
        return self._binary_op(
            "mul",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    # `multiply` is an alias of `mul`.
    multiply = mul

    def rmul(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get multiplication of ``DataFrame`` and `other`, element-wise (binary operator `rmul`).
        """
        return self._binary_op(
            "rmul",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def ne(
        self, other, axis="columns", level=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get not equal comparison of ``DataFrame`` and `other`, element-wise (binary operator `ne`).
        """
        return self._binary_op(
            "ne", other, axis=axis, level=level, broadcast=isinstance(other, Series)
        )

    def nlargest(self, n, columns, keep="first") -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Return the first `n` rows ordered by `columns` in descending order.
        """
        return self.__constructor__(
            query_compiler=self._query_compiler.nlargest(n, columns, keep)
        )

    def nsmallest(
        self, n, columns, keep="first"
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Return the first `n` rows ordered by `columns` in ascending order.
        """
        return self.__constructor__(
            query_compiler=self._query_compiler.nsmallest(
                n=n, columns=columns, keep=keep
            )
        )

    def unstack(
        self, level=-1, fill_value=None, sort=True
    ) -> Union[DataFrame, Series]:  # noqa: PR01, RT01, D200
        """
        Pivot a level of the (necessarily hierarchical) index labels.
        """
        if not sort:
            # TODO: it should be easy to add support for sort == False
            return self._default_to_pandas(
                pandas.DataFrame.unstack, level=level, fill_value=fill_value, sort=sort
            )

        # This ensures that non-pandas MultiIndex objects are caught.
        is_multiindex = len(self.index.names) > 1
        # Unstacking a flat index, or every level of a MultiIndex, goes through
        # `_reduce_dimension`; otherwise the result stays a ``DataFrame``.
        if not is_multiindex or (
            is_multiindex and is_list_like(level) and len(level) == self.index.nlevels
        ):
            return self._reduce_dimension(
                query_compiler=self._query_compiler.unstack(level, fill_value)
            )
        else:
            return self.__constructor__(
                query_compiler=self._query_compiler.unstack(level, fill_value)
            )

    def pivot(
        self, *, columns, index=lib.no_default, values=lib.no_default
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Return reshaped ``DataFrame`` organized by given index / column values.
"""
        # Map the "not passed" sentinel to None for the logic below.
        if index is lib.no_default:
            index = None
        if values is lib.no_default:
            values = None

        # if values is not specified, it should be the remaining columns not in
        # index or columns
        if values is None:
            values = list(self.columns)
            if index is not None:
                values = [v for v in values if v not in index]
            if columns is not None:
                values = [v for v in values if v not in columns]

        return self.__constructor__(
            query_compiler=self._query_compiler.pivot(
                index=index, columns=columns, values=values
            )
        )

    def pivot_table(
        self,
        values=None,
        index=None,
        columns=None,
        aggfunc="mean",
        fill_value=None,
        margins=False,
        dropna=True,
        margins_name="All",
        observed=lib.no_default,
        sort=True,
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Create a spreadsheet-style pivot table as a ``DataFrame``.
        """
        # Convert callable to a string aggregation name if possible
        if hashable(aggfunc):
            aggfunc = get_cython_func(aggfunc) or aggfunc

        result = self.__constructor__(
            query_compiler=self._query_compiler.pivot_table(
                index=index,
                values=values,
                columns=columns,
                aggfunc=aggfunc,
                fill_value=fill_value,
                margins=margins,
                dropna=dropna,
                margins_name=margins_name,
                observed=observed,
                sort=sort,
            )
        )
        return result

    @property
    def plot(
        self,
        x=None,
        y=None,
        kind="line",
        ax=None,
        subplots=False,
        sharex=None,
        sharey=False,
        layout=None,
        figsize=None,
        use_index=True,
        title=None,
        grid=None,
        legend=True,
        style=None,
        logx=False,
        logy=False,
        loglog=False,
        xticks=None,
        yticks=None,
        xlim=None,
        ylim=None,
        rot=None,
        fontsize=None,
        colormap=None,
        table=False,
        yerr=None,
        xerr=None,
        secondary_y=False,
        sort_columns=False,
        **kwargs,
    ):  # noqa: PR01, RT01, D200
        """
        Make plots of ``DataFrame``.
        """
        # This is a property: the parameters above are never used here. The
        # frame is converted to pandas and its plotting accessor is returned.
        return self._to_pandas().plot

    def pow(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get exponential power of ``DataFrame`` and `other`, element-wise (binary operator `pow`).
        """
        # A ``Series`` operand always takes the default-to-pandas path, so
        # `broadcast` below evaluates to False whenever it is reached.
        if isinstance(other, Series):
            return self._default_to_pandas(
                "pow", other, axis=axis, level=level, fill_value=fill_value
            )
        return self._binary_op(
            "pow",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def prod(
        self,
        axis=0,
        skipna=True,
        numeric_only=False,
        min_count=0,
        **kwargs,
    ):  # noqa: PR01, RT01, D200
        """
        Return the product of the values over the requested axis.
        """
        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
        axis = self._get_axis_number(axis)
        axis_to_apply = self.columns if axis else self.index
        # Shortcut: when `min_count` exceeds the number of values along the
        # reduced axis, the result is all-NaN and no computation is needed.
        if (
            skipna is not False
            and numeric_only is False
            and min_count > len(axis_to_apply)
            # This fast path is only suitable for the default backend
            and self._query_compiler.get_pandas_backend() is None
        ):
            new_index = self.columns if not axis else self.index
            # >>> pd.DataFrame([1,2,3,4], dtype="int64[pyarrow]").prod(min_count=10)
            # 0    <NA>
            # dtype: int64[pyarrow]
            return Series(
                [np.nan] * len(new_index),
                index=new_index,
                dtype=pandas.api.types.pandas_dtype("float64"),
            )

        data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True)
        # `min_count` values above 1 use a dedicated query compiler method.
        if min_count > 1:
            return data._reduce_dimension(
                data._query_compiler.prod_min_count(
                    axis=axis,
                    skipna=skipna,
                    numeric_only=numeric_only,
                    min_count=min_count,
                    **kwargs,
                )
            )
        return data._reduce_dimension(
            data._query_compiler.prod(
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
                min_count=min_count,
                **kwargs,
            )
        )

    # `product` is an alias of `prod`.
    product = prod

    def quantile(
        self,
        q=0.5,
        axis=0,
        numeric_only=False,
        interpolation="linear",
        method="single",
    ) -> Union[DataFrame, Series]:
        # Forwards all arguments unchanged to the parent implementation.
        return super(DataFrame, self).quantile(
            q=q,
            axis=axis,
            numeric_only=numeric_only,
            interpolation=interpolation,
            method=method,
        )

    # methods and fields we need to use pandas.DataFrame.query
    _AXIS_ORDERS = ["index", "columns"]
    _get_index_resolvers = pandas.DataFrame._get_index_resolvers

    def _get_axis_resolvers(self, axis: str) -> dict:  # noqa: GL08
        # forked from pandas because we only want to update the index if there's
# more than one level of the index.

        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            if axis_index.nlevels > 1:
                s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        # NOTE(review): the threshold here is `> 2` while the per-level branch
        # above uses `> 1` - confirm this difference is intentional.
        if axis_index.nlevels > 2:
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d

    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:  # noqa: RT01
        """
        Return the special character free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in `DataFrame.eval`.

        Notes
        -----
        Copied from pandas.
        """
        from modin.core.computation.parsing import clean_column_name

        # Columns whose label is an ``int`` are left out of the resolvers.
        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }

    def query(
        self, expr, inplace=False, **kwargs
    ) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
        """
        Query the columns of a ``DataFrame`` with a boolean expression.
        """
        self._update_var_dicts_in_kwargs(expr, kwargs)
        self._validate_eval_query(expr, **kwargs)
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(expr, str):
            msg = f"expr must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)
        # HACK: this condition kind of breaks the idea of backend agnostic API as all queries
        # _should_ work fine for all of the engines using `pandas.DataFrame.query(...)` approach.
        # However, at this point we know that we can execute simple queries way more efficiently
        # using the QC's API directly in case of pandas backend. Ideally, we have to make it work
        # with the 'pandas.query' approach the same as good the direct QC call is. But investigating
        # and fixing the root cause of the perf difference appears to be much more complicated
        # than putting this hack here. Hopefully, we'll get rid of it soon:
        # https://github.com/modin-project/modin/issues/6499
        try:
            new_query_compiler = self._query_compiler.rowwise_query(expr, **kwargs)
        except NotImplementedError:
            # a non row-wise query was passed, falling back to the
            # implementation forked from pandas.DataFrame.query. This
            # implementation will effectively evaluate the condition at the
            # modin.pandas API level, so that e.g. we interpret
            # df.query("col > 0") as df.loc[df.col > 0]
            kwargs["target"] = None
            res = self.eval(expr, **kwargs)

            try:
                result = self.loc[res]
            except ValueError:
                # when res is multi-dimensional loc raises, but this is
                # sometimes a valid query.
                result = self[res]
            new_query_compiler = result._query_compiler
        return self._create_or_update_from_compiler(new_query_compiler, inplace)

    def rename(
        self,
        mapper=None,
        index=None,
        columns=None,
        axis=None,
        copy=None,
        inplace=False,
        level=None,
        errors="ignore",
    ) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
        """
        Alter axes labels.
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")
        # We have to do this with the args because of how rename handles kwargs. It
        # doesn't ignore None values passed in, so we have to filter them ourselves.
        # NOTE: `locals()` must be captured here, before any other local is
        # created, so that it holds only the method parameters.
        args = locals()
        kwargs = {k: v for k, v in args.items() if v is not None and k != "self"}
        # `inplace` is forced to False for the helper calls below: the rename is
        # applied to a throwaway pandas frame built from the axis labels only,
        # and the renamed labels are read back from the frame it returns.
        kwargs["inplace"] = False
        axis = self._get_axis_number(axis)
        if index is not None or (mapper is not None and axis == 0):
            new_index = pandas.DataFrame(index=self.index).rename(**kwargs).index
        else:
            new_index = None
        if columns is not None or (mapper is not None and axis == 1):
            new_columns = (
                pandas.DataFrame(columns=self.columns).rename(**kwargs).columns
            )
        else:
            new_columns = None

        # The user's `inplace` choice decides whether the new labels are set on
        # this frame or on a copy.
        if inplace:
            obj = self
        else:
            obj = self.copy()
        if new_index is not None:
            obj.index = new_index
        if new_columns is not None:
            obj.columns = new_columns

        if not inplace:
            return obj

    def reindex(
        self,
        labels=None,
        *,
        index=None,
        columns=None,
        axis=None,
        method=None,
        copy=None,
        level=None,
        fill_value=np.nan,
        limit=None,
        tolerance=None,
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        # Translate the `labels`/`axis` calling convention into explicit
        # `index`/`columns` arguments before delegating to the parent class.
        axis = self._get_axis_number(axis)
        if axis == 0 and labels is not None:
            index = labels
        elif labels is not None:
            columns = labels
        return super(DataFrame, self).reindex(
            index=index,
            columns=columns,
            method=method,
            copy=copy,
            level=level,
            fill_value=fill_value,
            limit=limit,
            tolerance=tolerance,
        )

    def replace(
        self,
        to_replace=None,
        value=lib.no_default,
        *,
        inplace: bool = False,
        limit=None,
        regex: bool = False,
        method: str | lib.NoDefault = lib.no_default,
    ) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
        """
        Replace values given in `to_replace` with `value`.
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        # The query compiler always builds a new compiler (inplace=False); the
        # user's `inplace` choice is applied in the final call below.
        new_query_compiler = self._query_compiler.replace(
            to_replace=to_replace,
            value=value,
            inplace=False,
            limit=limit,
            regex=regex,
            method=method,
        )
        return self._create_or_update_from_compiler(new_query_compiler, inplace)

    def rfloordiv(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get integer division of ``DataFrame`` and `other`, element-wise (binary operator `rfloordiv`).
        """
        return self._binary_op(
            "rfloordiv",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def radd(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get addition of ``DataFrame`` and `other`, element-wise (binary operator `radd`).
        """
        return self._binary_op(
            "radd",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def rmod(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get modulo of ``DataFrame`` and `other`, element-wise (binary operator `rmod`).
        """
        return self._binary_op(
            "rmod",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def rpow(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get exponential power of ``DataFrame`` and `other`, element-wise (binary operator `rpow`).
        """
        # A ``Series`` operand always takes the default-to-pandas path, so
        # `broadcast` below evaluates to False whenever it is reached.
        if isinstance(other, Series):
            return self._default_to_pandas(
                "rpow", other, axis=axis, level=level, fill_value=fill_value
            )
        return self._binary_op(
            "rpow",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def rsub(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get subtraction of ``DataFrame`` and `other`, element-wise (binary operator `rsub`).
        """
        return self._binary_op(
            "rsub",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    def rtruediv(
        self, other, axis="columns", level=None, fill_value=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Get floating division of ``DataFrame`` and `other`, element-wise (binary operator `rtruediv`).
"""
        return self._binary_op(
            "rtruediv",
            other,
            axis=axis,
            level=level,
            fill_value=fill_value,
            broadcast=isinstance(other, Series),
        )

    # `rdiv` is an alias of `rtruediv`.
    rdiv = rtruediv

    def select_dtypes(
        self, include=None, exclude=None
    ) -> DataFrame:  # noqa: PR01, RT01, D200
        """
        Return a subset of the ``DataFrame``'s columns based on the column dtypes.
        """
        # Validates arguments for whether both include and exclude are None or
        # if they are disjoint. Also invalidates string dtypes.
        pandas.DataFrame().select_dtypes(include, exclude)

        # Normalise both selections to (possibly empty) list-likes.
        if include and not is_list_like(include):
            include = [include]
        elif include is None:
            include = []
        if exclude and not is_list_like(exclude):
            exclude = [exclude]
        elif exclude is None:
            exclude = []

        sel = tuple(map(set, (include, exclude)))
        include, exclude = map(lambda x: set(map(infer_dtype_from_object, x)), sel)
        # An empty selection leaves every column switched on for that side.
        include_these = pandas.Series(not bool(include), index=self.columns)
        exclude_these = pandas.Series(not bool(exclude), index=self.columns)

        def is_dtype_instance_mapper(column, dtype):
            # Pair the column with a predicate testing whether the column's
            # dtype type is a subclass of a given type.
            return column, functools.partial(issubclass, dtype.type)

        for column, f in itertools.starmap(
            is_dtype_instance_mapper, self.dtypes.items()
        ):
            if include:  # checks for the case of empty include or exclude
                include_these[column] = any(map(f, include))
            if exclude:
                exclude_these[column] = not any(map(f, exclude))

        dtype_indexer = include_these & exclude_these
        # Positions of the columns that were NOT selected; those get dropped.
        indicate = [
            i for i in range(len(dtype_indexer.values)) if not dtype_indexer.values[i]
        ]
        return self.drop(columns=self.columns[indicate], inplace=False)

    def set_index(
        self, keys, *, drop=True, append=False, inplace=False, verify_integrity=False
    ) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
        """
        Set the ``DataFrame`` index using existing columns.
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(keys, list):
            keys = [keys]

        # Path 1: at least one key is array-like data rather than a column label.
        if any(
            isinstance(col, (pandas.Index, Series, np.ndarray, list, Iterator))
            for col in keys
        ):
            if inplace:
                frame = self
            else:
                frame = self.copy()
            if drop:
                # Label keys are popped out of the frame; array-like keys are
                # kept as they are.
                keys = [k if is_list_like(k) else frame.pop(k) for k in keys]
            keys = try_cast_to_pandas(keys)
            # These are single-threaded objects, so we might as well let pandas do the
            # calculation so that it matches.
            frame.index = (
                pandas.DataFrame(index=self.index)
                .set_index(keys, append=append, verify_integrity=verify_integrity)
                .index
            )
            if not inplace:
                return frame
            else:
                return

        # Path 2: every key is expected to be a column label.
        missing = []
        for col in keys:
            # everything else gets tried as a key;
            # see https://github.com/pandas-dev/pandas/issues/24969
            try:
                found = col in self.columns
            except TypeError as err:
                raise TypeError(
                    'The parameter "keys" may be a column key, one-dimensional '
                    + "array, or a list containing only valid column keys and "
                    + f"one-dimensional arrays. Received column of type {type(col)}"
                ) from err
            else:
                if not found:
                    missing.append(col)
        # If the missing column is a "primitive", return the errors.
        # Otherwise we let the query compiler figure out what to do with
        # the keys
        if missing and not hasattr(missing[0], "__dict__"):
            # The keys are a primitive type
            raise KeyError(f"None of {missing} are in the columns")

        new_query_compiler = self._query_compiler.set_index_from_columns(
            keys, drop=drop, append=append
        )

        if verify_integrity and not new_query_compiler.index.is_unique:
            duplicates = new_query_compiler.index[
                new_query_compiler.index.duplicated()
            ].unique()
            raise ValueError(f"Index has duplicate keys: {duplicates}")

        return self._create_or_update_from_compiler(new_query_compiler, inplace=inplace)

    sparse = CachedAccessor("sparse", SparseFrameAccessor)

    def squeeze(
        self, axis=None
    ) -> Union[DataFrame, Series, Scalar]:  # noqa: PR01, RT01, D200
        """
        Squeeze 1 dimensional axis objects into scalars.
""" axis = self._get_axis_number(axis) if axis is not None else None if axis is None and ( self._query_compiler.get_axis_len(1) == 1 or len(self) == 1 ): return Series(query_compiler=self._query_compiler).squeeze() if axis == 1 and self._query_compiler.get_axis_len(1) == 1: self._query_compiler._shape_hint = "column" return Series(query_compiler=self._query_compiler) if axis == 0 and len(self) == 1: qc = self.T._query_compiler qc._shape_hint = "column" return Series(query_compiler=qc) else: return self.copy() def stack( self, level=-1, dropna=lib.no_default, sort=lib.no_default, future_stack=False ) -> Union[DataFrame, Series]: # noqa: PR01, RT01, D200 """ Stack the prescribed level(s) from columns to index. """ if future_stack: return self._default_to_pandas( pandas.DataFrame.stack, level=level, dropna=dropna, sort=sort, future_stack=future_stack, ) # FutureWarnings only needed if future_stack == True if dropna is lib.no_default: dropna = True if sort is lib.no_default: sort = True # This ensures that non-pandas MultiIndex objects are caught. is_multiindex = len(self.columns.names) > 1 if not is_multiindex or ( is_multiindex and is_list_like(level) and len(level) == self.columns.nlevels ): return self._reduce_dimension( query_compiler=self._query_compiler.stack(level, dropna, sort) ) else: return self.__constructor__( query_compiler=self._query_compiler.stack(level, dropna, sort) ) def sub( self, other, axis="columns", level=None, fill_value=None ) -> DataFrame: # noqa: PR01, RT01, D200 """ Get subtraction of ``DataFrame`` and `other`, element-wise (binary operator `sub`). """ return self._binary_op( "sub", other, axis=axis, level=level, fill_value=fill_value, broadcast=isinstance(other, Series), ) subtract = sub def sum( self, axis=0, skipna=True, numeric_only=False, min_count=0, **kwargs, ) -> Series: # noqa: PR01, RT01, D200 validate_bool_kwarg(skipna, "skipna", none_allowed=False) """ Return the sum of the values over the requested axis. 
""" axis = self._get_axis_number(axis) axis_to_apply = self.columns if axis else self.index if ( skipna is not False and numeric_only is False and min_count > len(axis_to_apply) # This fast path is only suitable for the default backend and self._query_compiler.get_pandas_backend() is None ): new_index = self.columns if not axis else self.index return Series( [np.nan] * len(new_index), index=new_index, dtype=pandas.api.types.pandas_dtype("float64"), ) # We cannot add datetime types, so if we are summing a column with # dtype datetime64 and cannot ignore non-numeric types, we must throw a # TypeError. if numeric_only is False and any( dtype == pandas.api.types.pandas_dtype("datetime64[ns]") for dtype in self.dtypes ): raise TypeError( "'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'" ) data = self._get_numeric_data(axis) if numeric_only else self if min_count > 1: return data._reduce_dimension( data._query_compiler.sum_min_count( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) return data._reduce_dimension( data._query_compiler.sum( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) @expanduser_path_arg("path") def to_feather( self, path, **kwargs ) -> None: # pragma: no cover # noqa: PR01, RT01, D200 """ Write a ``DataFrame`` to the binary Feather format. """ return self._default_to_pandas(pandas.DataFrame.to_feather, path, **kwargs) def to_gbq( self, destination_table, project_id=None, chunksize=None, reauth=False, if_exists="fail", auth_local_webserver=True, table_schema=None, location=None, progress_bar=True, credentials=None, ) -> None: # pragma: no cover # noqa: PR01, RT01, D200 """ Write a ``DataFrame`` to a Google BigQuery table. 
""" return self._default_to_pandas( pandas.DataFrame.to_gbq, destination_table, project_id=project_id, chunksize=chunksize, reauth=reauth, if_exists=if_exists, auth_local_webserver=auth_local_webserver, table_schema=table_schema, location=location, progress_bar=progress_bar, credentials=credentials, ) @expanduser_path_arg("path") def to_orc( self, path=None, *, engine="pyarrow", index=None, engine_kwargs=None ) -> Union[bytes, None]: return self._default_to_pandas( pandas.DataFrame.to_orc, path=path, engine=engine, index=index, engine_kwargs=engine_kwargs, ) @expanduser_path_arg("buf") def to_html( self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, max_cols=None, show_dimensions=False, decimal=".", bold_rows=True, classes=None, escape=True, notebook=False, border=None, table_id=None, render_links=False, encoding=None, ) -> Union[str, None]: # noqa: PR01, RT01, D200 """ Render a ``DataFrame`` as an HTML table. 
""" return self._default_to_pandas( pandas.DataFrame.to_html, buf=buf, columns=columns, col_space=col_space, header=header, index=index, na_rep=na_rep, formatters=formatters, float_format=float_format, sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook, border=border, table_id=table_id, render_links=render_links, encoding=None, ) @expanduser_path_arg("path") def to_parquet( self, path=None, engine="auto", compression="snappy", index=None, partition_cols=None, storage_options: StorageOptions = None, **kwargs, ) -> Union[bytes, None]: from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) return FactoryDispatcher.to_parquet( self._query_compiler, path=path, engine=engine, compression=compression, index=index, partition_cols=partition_cols, storage_options=storage_options, **kwargs, ) def to_period( self, freq=None, axis=0, copy=None ) -> DataFrame: # pragma: no cover # noqa: PR01, RT01, D200 """ Convert ``DataFrame`` from ``DatetimeIndex`` to ``PeriodIndex``. """ return super(DataFrame, self).to_period(freq=freq, axis=axis, copy=copy) def to_records( self, index=True, column_dtypes=None, index_dtypes=None ) -> np.rec.recarray: # noqa: PR01, RT01, D200 """ Convert ``DataFrame`` to a NumPy record array. 
""" return self._default_to_pandas( pandas.DataFrame.to_records, index=index, column_dtypes=column_dtypes, index_dtypes=index_dtypes, ) @expanduser_path_arg("path") def to_stata( self, path: FilePath | WriteBuffer[bytes], *, convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, byteorder: str | None = None, time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, version: int | None = 114, convert_strl: Sequence[Hashable] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, value_labels: dict[Hashable, dict[float | int, str]] | None = None, ) -> None: return self._default_to_pandas( pandas.DataFrame.to_stata, path, convert_dates=convert_dates, write_index=write_index, byteorder=byteorder, time_stamp=time_stamp, data_label=data_label, variable_labels=variable_labels, version=version, convert_strl=convert_strl, compression=compression, storage_options=storage_options, value_labels=value_labels, ) @expanduser_path_arg("path_or_buffer") def to_xml( self, path_or_buffer=None, index=True, root_name="data", row_name="row", na_rep=None, attr_cols=None, elem_cols=None, namespaces=None, prefix=None, encoding="utf-8", xml_declaration=True, pretty_print=True, parser="lxml", stylesheet=None, compression="infer", storage_options=None, ) -> Union[str, None]: from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) return FactoryDispatcher.to_xml( self._query_compiler, path_or_buffer=path_or_buffer, index=index, root_name=root_name, row_name=row_name, na_rep=na_rep, attr_cols=attr_cols, elem_cols=elem_cols, namespaces=namespaces, prefix=prefix, encoding=encoding, xml_declaration=xml_declaration, pretty_print=pretty_print, parser=parser, stylesheet=stylesheet, compression=compression, storage_options=storage_options, ) def to_timestamp( self, freq=None, how="start", axis=0, copy=None ) -> DataFrame: # noqa: PR01, 
def where(
    self,
    cond,
    other=np.nan,
    *,
    inplace=False,
    axis=None,
    level=None,
) -> Union[DataFrame, None]:  # noqa: PR01, RT01, D200
    """
    Replace values where the condition is False.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if isinstance(other, Series) and axis is None:
        raise ValueError("Must specify axis=0 or 1")
    if level is not None:
        # With `level` given, materialize Modin arguments and default to pandas.
        if isinstance(other, DataFrame):
            other = other._query_compiler.to_pandas()
        if isinstance(cond, DataFrame):
            cond = cond._query_compiler.to_pandas()
        # `_create_or_update_from_compiler` asserts that it receives a query
        # compiler, so unwrap the frame produced by `_default_to_pandas`.
        new_query_compiler = self._default_to_pandas(
            pandas.DataFrame.where,
            cond,
            other=other,
            inplace=False,
            axis=axis,
            level=level,
        )._query_compiler
        return self._create_or_update_from_compiler(new_query_compiler, inplace)
    cond = cond(self) if callable(cond) else cond

    if not isinstance(cond, DataFrame):
        # Wrap array-like conditions into a frame labelled like `self`.
        if not hasattr(cond, "shape"):
            cond = np.asanyarray(cond)
        if cond.shape != self.shape:
            raise ValueError("Array conditional must be same shape as self")
        cond = self.__constructor__(cond, index=self.index, columns=self.columns)
    if isinstance(other, DataFrame):
        other = other._query_compiler
    else:
        # Only infer the axis number when ``other`` will be made into a series.
        # When ``other`` is a dataframe, axis=None has a meaning distinct from
        # 0 and 1, e.g. at pandas 1.4.3:
        #
        # import pandas as pd
        # df = pd.DataFrame([[1,2], [3, 4]], index=[1, 0])
        # cond = pd.DataFrame([[True,False], [False, True]], columns=[1, 0])
        # other = pd.DataFrame([[5,6], [7,8]], columns=[1, 0])
        #
        # print(df.where(cond, other, axis=None))
        #    0  1
        # 1  1  7
        # 0  6  4
        #
        # print(df.where(cond, other, axis=0))
        #    0  1
        # 1  1  8
        # 0  5  4
        #
        # print(df.where(cond, other, axis=1))
        #    0  1
        # 1  1  5
        # 0  8  4

        # _get_axis_number interprets lib.no_default as None, but where doesn't
        # accept lib.no_default.
        if axis is lib.no_default:
            raise ValueError(
                "No axis named NoDefault.no_default for object type DataFrame"
            )
        axis = self._get_axis_number(axis)
        if isinstance(other, Series):
            other = other.reindex(
                self.index if axis == 0 else self.columns
            )._query_compiler
            if other._shape_hint is None:
                # To make the query compiler recognizable as a Series at lower levels
                other._shape_hint = "column"
        elif is_list_like(other):
            index = self.index if axis == 0 else self.columns
            other = pandas.Series(other, index=index)
    query_compiler = self._query_compiler.where(
        cond._query_compiler, other, axis=axis, level=level
    )
    return self._create_or_update_from_compiler(query_compiler, inplace)
""" # NOTE that to get an attribute, python calls __getattribute__() first and # then falls back to __getattr__() if the former raises an AttributeError. if key not in _ATTRS_NO_LOOKUP and key in self.columns: return self[key] raise AttributeError(f"'DataFrame' object has no attribute '{key}'") def __setattr__(self, key, value) -> None: """ Set attribute `value` identified by `key`. Parameters ---------- key : hashable Key to set. value : Any Value to set. Returns ------- None """ # While we let users assign to a column labeled "x" with "df.x" , there # are some attributes that we should assume are NOT column names and # therefore should follow the default Python object assignment # behavior. These are: # - anything in self.__dict__. This includes any attributes that the # user has added to the dataframe with, e.g., `df.c = 3`, and # any attribute that Modin has added to the frame, e.g. # `_query_compiler` and `_siblings` # - `_query_compiler`, which Modin initializes before it appears in # __dict__ # - `_siblings`, which Modin initializes before it appears in __dict__ # before it appears in __dict__. if key in ("_query_compiler", "_siblings") or key in self.__dict__: pass elif self._get_extension(key, __class__._extensions) is not sentinel: return self._get_extension(key, __class__._extensions).__set__(self, value) # we have to check for the key in `dir(self)` first in order not to trigger columns computation elif key not in dir(self) and key in self: self.__setitem__(key, value) # Note: return immediately so we don't keep this `key` as dataframe state. # `__getattr__` will return the columns not present in `dir(self)`, so we do not need # to manually track this state in the `dir`. return elif is_list_like(value) and key not in ["index", "columns"]: warnings.warn( SET_DATAFRAME_ATTRIBUTE_WARNING, UserWarning, ) super().__setattr__(key, value) def __setitem__(self, key, value) -> None: """ Set attribute `value` identified by `key`. 
Parameters ---------- key : Any Key to set. value : Any Value to set. Returns ------- None """ if isinstance(key, slice): return self._setitem_slice(key, value) if hashable(key) and key not in self.columns: if isinstance(value, Series) and self._query_compiler.get_axis_len(1) == 0: # Note: column information is lost when assigning a query compiler prev_index = self.columns self._query_compiler = value._query_compiler.copy() # Now that the data is appended, we need to update the column name for # that column to `key`, otherwise the name could be incorrect. self.columns = prev_index.insert(0, key) return # Do new column assignment after error checks and possible value modifications self.insert( loc=self._query_compiler.get_axis_len(1), column=key, value=value ) return if not hashable(key): if isinstance(key, DataFrame) or isinstance(key, np.ndarray): if isinstance(key, np.ndarray): if key.shape != self.shape: raise ValueError("Array must be same shape as DataFrame") key = self.__constructor__(key, columns=self.columns) return self.mask(key, value, inplace=True) if isinstance(key, (list, pandas.Index)) and all( (x in self.columns for x in key) ): if is_list_like(value): if not (hasattr(value, "shape") and hasattr(value, "ndim")): value = np.array(value) if len(key) != value.shape[-1]: raise ValueError("Columns must be same length as key") if isinstance(value, type(self)): # importing here to avoid circular import from .general import concat if not value.columns.equals(pandas.Index(key)): # we only need to change the labels, so shallow copy here value = value.copy(deep=False) value.columns = key # here we iterate over every column in the 'self' frame, then check if it's in the 'key' # and so has to be taken from either from the 'value' or from the 'self'. 
After that, # we concatenate those mixed column chunks and get a dataframe with updated columns to_concat = [] # columns to take for this chunk to_take = [] # whether columns in this chunk are in the 'key' and has to be taken from the 'value' get_cols_from_value = False # an object to take columns from for this chunk src_obj = self for col in self.columns: if (col in key) != get_cols_from_value: if len(to_take): to_concat.append(src_obj[to_take]) to_take = [col] get_cols_from_value = not get_cols_from_value src_obj = value if get_cols_from_value else self else: to_take.append(col) if len(to_take): to_concat.append(src_obj[to_take]) new_qc = concat(to_concat, axis=1)._query_compiler else: new_qc = self._query_compiler.write_items( slice(None), self.columns.get_indexer_for(key), value, need_columns_reindex=False, ) self._update_inplace(new_qc) # self.loc[:, key] = value return elif ( isinstance(key, list) and isinstance(value, type(self)) # Mixed case is more complicated, it's defaulting to pandas for now and all((x not in self.columns for x in key)) ): if len(key) != len(value.columns): raise ValueError("Columns must be same length as key") # Aligning the value's columns with the key if not np.array_equal(value.columns, key): value = value.set_axis(key, axis=1) new_qc = self._query_compiler.insert_item( axis=1, loc=self._query_compiler.get_axis_len(1), value=value._query_compiler, how="left", ) self._update_inplace(new_qc) return def setitem_unhashable_key(df, value): df[key] = value return df return self._update_inplace( self._default_to_pandas(setitem_unhashable_key, value)._query_compiler ) if is_list_like(value): if isinstance(value, (pandas.DataFrame, DataFrame)): value = value[value.columns[0]].values elif isinstance(value, np.ndarray): assert ( len(value.shape) < 3 ), "Shape of new values must be compatible with manager shape" value = value.T.reshape(-1) if len(self) > 0: value = value[: len(self)] if not isinstance(value, (Series, Categorical, np.ndarray, 
def __delitem__(self, key) -> None:
    """
    Remove the column labelled `key` from this frame in place.

    Parameters
    ----------
    key : hashable
        Key to delete.
    """
    if key in self:
        # Let the query compiler drop the column, then adopt the result.
        remaining = self._query_compiler.delitem(key)
        self._update_inplace(new_query_compiler=remaining)
        return
    raise KeyError(key)
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
    """
    Get a Modin DataFrame that implements the dataframe exchange protocol.

    See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.

    Parameters
    ----------
    nan_as_null : bool, default: False
        A keyword intended for the consumer to tell the producer
        to overwrite null values in the data with ``NaN`` (or ``NaT``).
        This currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
    allow_copy : bool, default: True
        A keyword that defines whether or not the library is allowed
        to make a copy of the data. For example, copying data would be necessary
        if a library supports strided buffers, given that this protocol
        specifies contiguous buffers. Currently, if the flag is set to ``False``
        and a copy is needed, a ``RuntimeError`` will be raised.

    Returns
    -------
    ProtocolDataframe
        A dataframe object following the dataframe protocol specification.
    """
    # The conversion is delegated to the query compiler with both flags forwarded.
    compiler = self._query_compiler
    interchange_frame = compiler.to_interchange_dataframe(
        nan_as_null=nan_as_null, allow_copy=allow_copy
    )
    return interchange_frame
""" def attrs(df): return df.attrs return self._default_to_pandas(attrs) @property def style(self): # noqa: RT01, D200 """ Return a Styler object. """ def style(df): """Define __name__ attr because properties do not have it.""" return df.style return self._default_to_pandas(style) def reindex_like( self: DataFrame, other, method=None, copy: Optional[bool] = None, limit=None, tolerance=None, ) -> DataFrame: if copy is None: copy = True # docs say "Same as calling .reindex(index=other.index, columns=other.columns,...).": # https://pandas.pydata.org/pandas-docs/version/1.4/reference/api/pandas.DataFrame.reindex_like.html return self.reindex( index=other.index, columns=other.columns, method=method, copy=copy, limit=limit, tolerance=tolerance, ) def _create_or_update_from_compiler( self, new_query_compiler, inplace=False ) -> Union[DataFrame, None]: """ Return or update a ``DataFrame`` with given `new_query_compiler`. Parameters ---------- new_query_compiler : PandasQueryCompiler QueryCompiler to use to manage the data. inplace : bool, default: False Whether or not to perform update or creation inplace. Returns ------- DataFrame or None None if update was done, ``DataFrame`` otherwise. """ assert isinstance( new_query_compiler, self._query_compiler.__class__.__bases__ ), "Invalid Query Compiler object: {}".format(type(new_query_compiler)) if not inplace: return self.__constructor__(query_compiler=new_query_compiler) else: self._update_inplace(new_query_compiler=new_query_compiler) def _get_numeric_data(self, axis: int) -> DataFrame: """ Grab only numeric data from ``DataFrame``. Parameters ---------- axis : {0, 1} Axis to inspect on having numeric types only. Returns ------- DataFrame ``DataFrame`` with numeric data. """ # Pandas ignores `numeric_only` if `axis` is 1, but we do have to drop # non-numeric columns if `axis` is 0. 
def _validate_dtypes(self, numeric_only=False) -> None:
    """
    Check that all the dtypes are the same.

    Parameters
    ----------
    numeric_only : bool, default: False
        Whether or not to allow only numeric data.
        If True and non-numeric data is found, exception
        will be raised.

    Raises
    ------
    TypeError
        If `numeric_only` is True and a non-numeric dtype is present, or if
        `numeric_only` is False and the dtypes are not all equal.
    """
    # Series.__getitem__ treating keys as positions is deprecated. In a future version,
    # integer keys will always be treated as labels (consistent with DataFrame behavior).
    # To access a value by position, use `ser.iloc[pos]`
    dtypes = self._query_compiler.get_dtypes_set()
    # Supply a default so an empty dtype set does not leak ``StopIteration``;
    # with nothing to compare the loop below is simply skipped.
    dtype = next(iter(dtypes), None)
    for t in dtypes:
        if numeric_only and not is_numeric_dtype(t):
            raise TypeError("{0} is not a numeric data type".format(t))
        elif not numeric_only and t != dtype:
            raise TypeError(
                "Cannot compare type '{0}' with type '{1}'".format(t, dtype)
            )
def _validate_eval_query(self, expr, **kwargs) -> None:
    """
    Check the arguments passed to ``eval`` and ``query``.

    Parameters
    ----------
    expr : str
        The expression to evaluate. This string cannot contain any
        Python statements, only Python expressions.
    **kwargs : dict
        Optional arguments of ``eval`` and ``query`` functions.
    """
    # Only string expressions are inspected.
    if not isinstance(expr, str):
        return
    if expr == "":
        raise ValueError("expr cannot be an empty string")
    uses_python_parser = kwargs.get("parser") == "python"
    if "not" in expr and uses_python_parser:
        ErrorMessage.not_implemented(
            "'Not' nodes are not implemented."
        )  # pragma: no cover
def _set_axis_name(self, name, axis=0, inplace=False) -> Union[DataFrame, None]:
    """
    Rename the labels' name (or names) on one axis.

    Parameters
    ----------
    name : str or list of str
        Name for the Index, or list of names for the MultiIndex.
    axis : str or int, default: 0
        The axis to set the label.
        0 or 'index' for the index, 1 or 'columns' for the columns.
    inplace : bool, default: False
        Whether to modify `self` directly or return a copy.

    Returns
    -------
    DataFrame or None
        The renamed copy, or None when `inplace` is True.
    """
    axis_number = self._get_axis_number(axis)
    target = self if inplace else self.copy()
    # Axis 0 addresses the index; any other axis number addresses the columns.
    axis_attr = "index" if axis_number == 0 else "columns"
    relabeled = getattr(target, axis_attr).set_names(name)
    setattr(target, axis_attr, relabeled)
    return None if inplace else target
""" key = apply_if_callable(key, self) # Shortcut if key is an actual column is_mi_columns = self._query_compiler.has_multiindex(axis=1) try: if key in self.columns and not is_mi_columns: return self._getitem_column(key) except (KeyError, ValueError, TypeError): pass if isinstance(key, Series): return self.__constructor__( query_compiler=self._query_compiler.getitem_array(key._query_compiler) ) elif isinstance(key, (np.ndarray, pandas.Index, list)): return self.__constructor__( query_compiler=self._query_compiler.getitem_array(key) ) elif isinstance(key, DataFrame): return self.where(key) elif is_mi_columns: return self._default_to_pandas(pandas.DataFrame.__getitem__, key) # return self._getitem_multilevel(key) else: return self._getitem_column(key) # Persistance support methods - BEGIN @classmethod def _inflate_light(cls, query_compiler, source_pid) -> DataFrame: """ Re-creates the object from previously-serialized lightweight representation. The method is used for faster but not disk-storable persistence. Parameters ---------- query_compiler : BaseQueryCompiler Query compiler to use for object re-creation. source_pid : int Determines whether a Modin or pandas object needs to be created. Modin objects are created only on the main process. Returns ------- DataFrame New ``DataFrame`` based on the `query_compiler`. """ if os.getpid() != source_pid: return query_compiler.to_pandas() # The current logic does not involve creating Modin objects # and manipulation with them in worker processes return cls(query_compiler=query_compiler) @classmethod def _inflate_full(cls, pandas_df, source_pid) -> DataFrame: """ Re-creates the object from previously-serialized disk-storable representation. Parameters ---------- pandas_df : pandas.DataFrame Data to use for object re-creation. source_pid : int Determines whether a Modin or pandas object needs to be created. Modin objects are created only on the main process. 
def __reduce__(self):
    # Pickle support: returns the (callable, args) pair used to rebuild the frame.
    compiler = self._query_compiler
    compiler.finalize()
    current_pid = os.getpid()
    # The lightweight form ships the query compiler itself; it is used only
    # when persistent pickling is off and the compiler reports support for
    # materialization in a worker process.
    use_light = not (
        PersistentPickle.get()
        or not compiler.support_materialization_in_worker_process()
    )
    if use_light:
        return self._inflate_light, (compiler, current_pid)
    return self._inflate_full, (self._to_pandas(), current_pid)
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """The module is needed to allow the following import `import modin.pandas.errors`.""" from pandas.errors import * # noqa: F403, F401 from pandas.errors import __all__ # noqa: F401 ================================================ FILE: modin/pandas/general.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Implement pandas general API.""" from __future__ import annotations import warnings from typing import Hashable, Iterable, Mapping, Optional, Union import numpy as np import pandas from pandas._libs.lib import NoDefault, no_default from pandas._typing import ArrayLike, DtypeBackend, Scalar, npt from pandas.core.dtypes.common import is_list_like from modin.core.storage_formats import BaseQueryCompiler from modin.core.storage_formats.pandas.query_compiler_caster import ( wrap_free_function_in_argument_caster, ) from modin.logging import enable_logging from modin.pandas.io import to_pandas from modin.utils import _inherit_docstrings, _maybe_warn_on_default from .base import BasePandasDataset from .dataframe import DataFrame from .series import Series @enable_logging def _isna( obj, ) -> bool | npt.NDArray[np.bool_] | Series | DataFrame: # noqa: PR01, RT01, D200 """ Detect missing values for an array-like object. """ if isinstance(obj, BasePandasDataset): return obj.isna() else: return pandas.isna(obj) _inherit_isna_docstring = _inherit_docstrings(pandas.isnull, apilink="pandas.isna") isna = _inherit_isna_docstring(wrap_free_function_in_argument_caster("isna")(_isna)) isnull = _inherit_isna_docstring(wrap_free_function_in_argument_caster("isnull")(_isna)) @enable_logging def _notna( obj, ) -> bool | npt.NDArray[np.bool_] | Series | DataFrame: # noqa: PR01, RT01, D200 """ Detect non-missing values for an array-like object. 
""" if isinstance(obj, BasePandasDataset): return obj.notna() else: return pandas.notna(obj) _inherit_notna_docstring = _inherit_docstrings(pandas.notna, apilink="pandas.notna") notnull = _inherit_notna_docstring( wrap_free_function_in_argument_caster("notnull")(_notna) ) notna = _inherit_notna_docstring(wrap_free_function_in_argument_caster("notna")(_notna)) @_inherit_docstrings(pandas.merge, apilink="pandas.merge") @enable_logging @wrap_free_function_in_argument_caster("merge") def merge( left, right, how: str = "inner", on=None, left_on=None, right_on=None, left_index: bool = False, right_index: bool = False, sort: bool = False, suffixes=("_x", "_y"), copy: Optional[bool] = None, indicator: bool = False, validate=None, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Merge DataFrame or named Series objects with a database-style join. """ if isinstance(left, Series): if left.name is None: raise ValueError("Cannot merge a Series without a name") else: left = left.to_frame() if not isinstance(left, DataFrame): raise TypeError( f"Can only merge Series or DataFrame objects, a {type(left)} was passed" ) return left.merge( right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, copy=copy, indicator=indicator, validate=validate, ) @_inherit_docstrings(pandas.merge_ordered, apilink="pandas.merge_ordered") @enable_logging @wrap_free_function_in_argument_caster("merge_ordered") def merge_ordered( left, right, on=None, left_on=None, right_on=None, left_by=None, right_by=None, fill_method=None, suffixes=("_x", "_y"), how: str = "outer", ) -> DataFrame: # noqa: PR01, RT01, D200 """ Perform a merge for ordered data with optional filling/interpolation. 
""" for operand in (left, right): if not isinstance(operand, (Series, DataFrame)): raise TypeError( f"Can only merge Series or DataFrame objects, a {type(operand)} was passed" ) return DataFrame( query_compiler=left._query_compiler.merge_ordered( right._query_compiler, on=on, left_on=left_on, right_on=right_on, left_by=left_by, right_by=right_by, fill_method=fill_method, suffixes=suffixes, how=how, ) ) @_inherit_docstrings(pandas.merge_asof, apilink="pandas.merge_asof") @enable_logging @wrap_free_function_in_argument_caster("merge_asof") def merge_asof( left, right, on=None, left_on=None, right_on=None, left_index: bool = False, right_index: bool = False, by=None, left_by=None, right_by=None, suffixes=("_x", "_y"), tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", ) -> DataFrame: # noqa: PR01, RT01, D200 """ Perform a merge by key distance. """ if not isinstance(left, DataFrame): raise ValueError( "can not merge DataFrame with instance of type {}".format(type(right)) ) left._query_compiler._maybe_warn_on_default(message="`merge_asof`") # As of Pandas 1.2 these should raise an error; before that it did # something likely random: if ( (on and (left_index or right_index)) or (left_on and left_index) or (right_on and right_index) ): raise ValueError("Can't combine left/right_index with left/right_on or on.") if on is not None: if left_on is not None or right_on is not None: raise ValueError("If 'on' is set, 'left_on' and 'right_on' can't be set.") left_on = on right_on = on if by is not None: if left_by is not None or right_by is not None: raise ValueError("Can't have both 'by' and 'left_by' or 'right_by'") left_by = right_by = by if left_on is None and not left_index: raise ValueError("Must pass on, left_on, or left_index=True") if right_on is None and not right_index: raise ValueError("Must pass on, right_on, or right_index=True") return DataFrame( query_compiler=left._query_compiler.merge_asof( right._query_compiler, left_on, right_on, 
left_index, right_index, left_by, right_by, suffixes, tolerance, allow_exact_matches, direction, ) ) @_inherit_docstrings(pandas.pivot_table, apilink="pandas.pivot_table") @enable_logging @wrap_free_function_in_argument_caster("pivot_table") def pivot_table( data, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=no_default, sort=True, ) -> DataFrame: if not isinstance(data, DataFrame): raise ValueError( "can not create pivot table with instance of type {}".format(type(data)) ) return data.pivot_table( values=values, index=index, columns=columns, aggfunc=aggfunc, fill_value=fill_value, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, sort=sort, ) @_inherit_docstrings(pandas.pivot, apilink="pandas.pivot") @enable_logging @wrap_free_function_in_argument_caster("pivot") def pivot( data, *, columns, index=no_default, values=no_default ) -> DataFrame: # noqa: PR01, RT01, D200 """ Return reshaped DataFrame organized by given index / column values. """ if not isinstance(data, DataFrame): raise ValueError("can not pivot with instance of type {}".format(type(data))) return data.pivot(index=index, columns=columns, values=values) @_inherit_docstrings(pandas.to_numeric, apilink="pandas.to_numeric") @enable_logging @wrap_free_function_in_argument_caster("to_numeric") def to_numeric( arg, errors="raise", downcast=None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> Scalar | np.ndarray | Series: # noqa: PR01, RT01, D200 """ Convert argument to a numeric type. 
""" if not isinstance(arg, Series): return pandas.to_numeric( arg, errors=errors, downcast=downcast, dtype_backend=dtype_backend ) return arg._to_numeric( errors=errors, downcast=downcast, dtype_backend=dtype_backend ) @_inherit_docstrings(pandas.qcut, apilink="pandas.qcut") @enable_logging @wrap_free_function_in_argument_caster("qcut") def qcut( x, q, labels=None, retbins=False, precision=3, duplicates="raise" ): # noqa: PR01, RT01, D200 """ Quantile-based discretization function. """ kwargs = { "labels": labels, "retbins": retbins, "precision": precision, "duplicates": duplicates, } if not isinstance(x, Series): return pandas.qcut(x, q, **kwargs) return x._qcut(q, **kwargs) @_inherit_docstrings(pandas.cut, apilink="pandas.cut") @enable_logging @wrap_free_function_in_argument_caster("cut") def cut( x, bins, right: bool = True, labels=None, retbins: bool = False, precision: int = 3, include_lowest: bool = False, duplicates: str = "raise", ordered: bool = True, ): if isinstance(x, DataFrame): raise ValueError("Input array must be 1 dimensional") if not isinstance(x, Series): _maybe_warn_on_default( reason=f"pd.cut is not supported on objects of type {type(x)}" ) import pandas return pandas.cut( x, bins, right=right, labels=labels, retbins=retbins, precision=precision, include_lowest=include_lowest, duplicates=duplicates, ordered=ordered, ) def _wrap_in_series_object(qc_result): if isinstance(qc_result, type(x._query_compiler)): return Series(query_compiler=qc_result) if isinstance(qc_result, (tuple, list)): return tuple([_wrap_in_series_object(result) for result in qc_result]) return qc_result return _wrap_in_series_object( x._query_compiler.cut( bins, right=right, labels=labels, retbins=retbins, precision=precision, include_lowest=include_lowest, duplicates=duplicates, ordered=ordered, ) ) @_inherit_docstrings(pandas.unique, apilink="pandas.unique") @enable_logging @wrap_free_function_in_argument_caster("unique") def unique(values) -> ArrayLike: # noqa: PR01, RT01, 
D200 """ Return unique values based on a hash table. """ return Series(values).unique() # Adding docstring since pandas docs don't have web section for this function. @enable_logging @wrap_free_function_in_argument_caster("value_counts") def value_counts( values, sort=True, ascending=False, normalize=False, bins=None, dropna=True ) -> Series: """ Compute a histogram of the counts of non-null values. Parameters ---------- values : ndarray (1-d) Values to perform computation. sort : bool, default: True Sort by values. ascending : bool, default: False Sort in ascending order. normalize : bool, default: False If True then compute a relative histogram. bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data. dropna : bool, default: True Don't include counts of NaN. Returns ------- Series """ warnings.warn( "pandas.value_counts is deprecated and will be removed in a " + "future version. Use pd.Series(obj).value_counts() instead.", FutureWarning, ) return Series(values).value_counts( sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, ) @_inherit_docstrings(pandas.concat, apilink="pandas.concat") @enable_logging @wrap_free_function_in_argument_caster(name="concat") def concat( objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", *, axis=0, join="outer", ignore_index: bool = False, keys=None, levels=None, names=None, verify_integrity: bool = False, sort: bool = False, copy: Optional[bool] = None, ) -> DataFrame | Series: # noqa: PR01, RT01, D200 """ Concatenate Modin objects along a particular axis. 
""" if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)): raise TypeError( "first argument must be an iterable of pandas " + "objects, you passed an object of type " + f'"{type(objs).__name__}"' ) axis = pandas.DataFrame()._get_axis_number(axis) if isinstance(objs, dict): input_list_of_objs = list(objs.values()) else: input_list_of_objs = list(objs) if len(input_list_of_objs) == 0: raise ValueError("No objects to concatenate") list_of_objs = [obj for obj in input_list_of_objs if obj is not None] if len(list_of_objs) == 0: raise ValueError("All objects passed were None") try: type_check = next( obj for obj in list_of_objs if not isinstance(obj, (pandas.Series, Series, pandas.DataFrame, DataFrame)) ) except StopIteration: type_check = None if type_check is not None: raise ValueError( 'cannot concatenate object of type "{0}"; only ' + "modin.pandas.Series " + "and modin.pandas.DataFrame objs are " + "valid", type(type_check), ) all_series = all(isinstance(obj, Series) for obj in list_of_objs) if all_series and axis == 0: return Series( query_compiler=list_of_objs[0]._query_compiler.concat( axis, [o._query_compiler for o in list_of_objs[1:]], join=join, join_axes=None, ignore_index=ignore_index, keys=None, levels=None, names=None, verify_integrity=False, copy=True, sort=sort, ) ) if join == "outer": # Filter out empties list_of_objs = [ obj for obj in list_of_objs if ( isinstance(obj, (Series, pandas.Series)) or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape) or sum(obj.shape) > 0 ) ] elif join != "inner": raise ValueError( "Only can inner (intersect) or outer (union) join the other axis" ) list_of_objs = [ ( obj._query_compiler if isinstance(obj, DataFrame) else DataFrame(obj)._query_compiler ) for obj in list_of_objs ] if keys is None and isinstance(objs, dict): keys = list(objs.keys()) if keys is not None: if all_series: new_idx = keys else: list_of_objs = [ list_of_objs[i] for i in range(min(len(list_of_objs), len(keys))) ] 
new_idx_labels = { k: v.index if axis == 0 else v.columns for k, v in zip(keys, list_of_objs) } tuples = [ (k, *o) if isinstance(o, tuple) else (k, o) for k, obj in new_idx_labels.items() for o in obj ] new_idx = pandas.MultiIndex.from_tuples(tuples) if names is not None: new_idx.names = names else: old_name = _determine_name(list_of_objs, axis) if old_name is not None: new_idx.names = [None] + old_name else: new_idx = None if len(list_of_objs) == 0: return DataFrame( index=input_list_of_objs[0].index.append( [f.index for f in input_list_of_objs[1:]] ) ) new_query_compiler = list_of_objs[0].concat( axis, list_of_objs[1:], join=join, join_axes=None, ignore_index=ignore_index, keys=None, levels=None, names=None, verify_integrity=False, copy=True, sort=sort, ) result_df = DataFrame(query_compiler=new_query_compiler) if new_idx is not None: if axis == 0: result_df.index = new_idx else: result_df.columns = new_idx return result_df @_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_datetime") @enable_logging @wrap_free_function_in_argument_caster("to_datetime") def to_datetime( arg, errors="raise", dayfirst=False, yearfirst=False, utc=False, format=None, exact=no_default, unit=None, infer_datetime_format=no_default, origin="unix", cache=True, ) -> Scalar | ArrayLike | Series | DataFrame: # noqa: PR01, RT01, D200 """ Convert argument to datetime. 
""" if not hasattr(arg, "_to_datetime"): return pandas.to_datetime( arg, errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, utc=utc, format=format, exact=exact, unit=unit, infer_datetime_format=infer_datetime_format, origin=origin, cache=cache, ) return arg._to_datetime( errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, utc=utc, format=format, exact=exact, unit=unit, infer_datetime_format=infer_datetime_format, origin=origin, cache=cache, ) @_inherit_docstrings(pandas.get_dummies, apilink="pandas.get_dummies") @enable_logging @wrap_free_function_in_argument_caster("get_dummies") def get_dummies( data, prefix=None, prefix_sep="_", dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Convert categorical variable into dummy/indicator variables. """ if sparse: raise NotImplementedError( "SparseArray is not implemented. " + "To contribute to Modin, please visit " + "github.com/modin-project/modin." ) if not isinstance(data, DataFrame): _maybe_warn_on_default("`get_dummies` on non-DataFrame") if isinstance(data, Series): data = data._to_pandas() return DataFrame( pandas.get_dummies( data, prefix=prefix, prefix_sep=prefix_sep, dummy_na=dummy_na, columns=columns, sparse=sparse, drop_first=drop_first, dtype=dtype, ) ) else: new_manager = data._query_compiler.get_dummies( columns, prefix=prefix, prefix_sep=prefix_sep, dummy_na=dummy_na, drop_first=drop_first, dtype=dtype, ) return DataFrame(query_compiler=new_manager) @_inherit_docstrings(pandas.melt, apilink="pandas.melt") @enable_logging @wrap_free_function_in_argument_caster("melt") def melt( frame, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, ignore_index: bool = True, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 
""" return frame.melt( id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name, col_level=col_level, ignore_index=ignore_index, ) @_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab") @enable_logging @wrap_free_function_in_argument_caster("crosstab") def crosstab( index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name: str = "All", dropna: bool = True, normalize=False, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Compute a simple cross tabulation of two (or more) factors. """ _maybe_warn_on_default("`crosstab`") pandas_crosstab = pandas.crosstab( index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize, ) return DataFrame(pandas_crosstab) # Adding docstring since pandas docs don't have web section for this function. @enable_logging @wrap_free_function_in_argument_caster("lreshape") def lreshape(data: DataFrame, groups, dropna=True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``. Accepts a dictionary, `groups`, in which each key is a new column name and each value is a list of old column names that will be "melted" under the new column name as part of the reshape. Parameters ---------- data : DataFrame The wide-format DataFrame. groups : dict Dictionary in the form: `{new_name : list_of_columns}`. dropna : bool, default: True Whether include columns whose entries are all NaN or not. Returns ------- DataFrame Reshaped DataFrame. 
""" if not isinstance(data, DataFrame): raise ValueError("can not lreshape with instance of type {}".format(type(data))) data._query_compiler._maybe_warn_on_default(message="`lreshape`") return DataFrame(pandas.lreshape(to_pandas(data), groups, dropna=dropna)) @_inherit_docstrings(pandas.wide_to_long, apilink="pandas.wide_to_long") @enable_logging @wrap_free_function_in_argument_caster("wide_to_long") def wide_to_long( df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" ) -> DataFrame: # noqa: PR01, RT01, D200 """ Unpivot a DataFrame from wide to long format. """ if not isinstance(df, DataFrame): raise ValueError( "can not wide_to_long with instance of type {}".format(type(df)) ) return DataFrame( query_compiler=df._query_compiler.wide_to_long( stubnames=stubnames, i=i, j=j, sep=sep, suffix=suffix, ) ) @wrap_free_function_in_argument_caster("_determine_name") def _determine_name(objs: Iterable[BaseQueryCompiler], axis: Union[int, str]): """ Determine names of index after concatenation along passed axis. Parameters ---------- objs : iterable of QueryCompilers Objects to concatenate. axis : int or str The axis to concatenate along. Returns ------- list with single element Computed index name, `None` if it could not be determined. """ axis = pandas.DataFrame()._get_axis_number(axis) def get_names(obj): return obj.columns.names if axis else obj.index.names names = np.array([get_names(obj) for obj in objs]) # saving old name, only if index names of all objs are the same if np.all(names == names[0]): # we must do this check to avoid this calls `list(str_like_name)` return list(names[0]) if is_list_like(names[0]) else [names[0]] else: return None @_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_timedelta") @enable_logging @wrap_free_function_in_argument_caster("to_timedelta") def to_timedelta( arg, unit=None, errors="raise" ) -> Scalar | pandas.Index | Series: # noqa: PR01, RT01, D200 """ Convert argument to timedelta. 
Accepts str, timedelta, list-like or Series for arg parameter. Returns a Series if and only if arg is provided as a Series. """ if isinstance(arg, Series): query_compiler = arg._query_compiler.to_timedelta(unit=unit, errors=errors) return Series(query_compiler=query_compiler) return pandas.to_timedelta(arg, unit=unit, errors=errors) ================================================ FILE: modin/pandas/groupby.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Implement GroupBy public API as pandas does.""" from __future__ import annotations import warnings from collections.abc import Iterable from functools import cached_property from types import BuiltinFunctionType from typing import TYPE_CHECKING, Any, Hashable, Optional, Union import numpy as np import pandas import pandas.core.common as com import pandas.core.groupby from pandas._libs import lib from pandas.api.types import is_scalar from pandas.core.apply import reconstruct_func from pandas.core.dtypes.common import ( is_datetime64_any_dtype, is_integer, is_list_like, is_numeric_dtype, ) from pandas.errors import SpecificationError from typing_extensions import Self from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.core.storage_formats.pandas.query_compiler_caster import ( EXTENSION_DICT_TYPE, EXTENSION_NO_LOOKUP, QueryCompilerCaster, visit_nested_args, ) from modin.error_message import ErrorMessage from modin.logging import ClassLogger, disable_logging from modin.pandas.utils import cast_function_modin2pandas from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, hashable, sentinel, try_cast_to_pandas, wrap_into_list, wrap_udf_function, ) from .series import Series from .utils import is_label from .window import RollingGroupby if TYPE_CHECKING: from modin.pandas import DataFrame _DEFAULT_BEHAVIOUR = EXTENSION_NO_LOOKUP | { "__class__", "__getitem__", "__init__", "__iter__", "_as_index", "_axis", "_by", "_check_index", "_columns", "_compute_index_grouped", "_default_to_pandas", "_df", "_drop", "_idx_name", "_index", "_internal_by", "_is_multi_by", "_iter", "_kwargs", "_level", "_pandas_class", "_query_compiler", "_sort", "_wrap_aggregation", } GROUPBY_EXTENSION_NO_LOOKUP = EXTENSION_NO_LOOKUP | { "_axis", "_idx_name", "_df", "_query_compiler", "_columns", "_by", "_drop", "_return_tuple_when_iterating", "_is_multi_by", "_level", 
"_kwargs", "_get_query_compiler", } @_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy) class DataFrameGroupBy(ClassLogger, QueryCompilerCaster): # noqa: GL08 _pandas_class = pandas.core.groupby.DataFrameGroupBy _return_tuple_when_iterating = False _df: Union[DataFrame, Series] _query_compiler: BaseQueryCompiler _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) def __init__( self, df: Union[DataFrame, Series], by, axis, level, as_index, sort, group_keys, idx_name, drop, backend_pinned: bool, **kwargs, ): self._axis = axis self._idx_name = idx_name self._df = df self._query_compiler = self._df._query_compiler self._columns = self._query_compiler.columns self._by = by self._drop = drop # When providing a list of columns of length one to DataFrame.groupby(), # the keys that are returned by iterating over the resulting DataFrameGroupBy # object will now be tuples of length one (pandas#GH47761) self._return_tuple_when_iterating = kwargs.pop( "return_tuple_when_iterating", False ) # Whether the backend of this groupby object has been pinned. 
self._backend_pinned = backend_pinned if ( level is None and is_list_like(by) or isinstance(by, type(self._query_compiler)) ): # This tells us whether or not there are multiple columns/rows in the groupby self._is_multi_by = ( isinstance(by, type(self._query_compiler)) and len(by.columns) > 1 ) or ( not isinstance(by, type(self._query_compiler)) and axis == 0 and all( (hashable(obj) and obj in self._query_compiler.columns) or isinstance(obj, type(self._query_compiler)) or is_list_like(obj) for obj in self._by ) ) else: self._is_multi_by = False self._level = level self._kwargs = { "level": level, "sort": sort, "as_index": as_index, "group_keys": group_keys, } self._kwargs.update(kwargs) @disable_logging @_inherit_docstrings(QueryCompilerCaster._get_query_compiler) def _get_query_compiler(self) -> Optional[BaseQueryCompiler]: if hasattr(self, "_df"): return self._df._query_compiler return None @disable_logging @_inherit_docstrings(QueryCompilerCaster.get_backend) def get_backend(self) -> str: return self._df.get_backend() @disable_logging def set_backend( self, backend: str, inplace: bool = False, *, switch_operation: Optional[str] = None, ) -> Optional[Self]: """ Move the data in this groupby object to a different backend. Parameters ---------- backend : str The name of the backend to switch to. inplace : bool, default: False Whether to perform the operation in-place. switch_operation : str, optional The operation being performed that triggered the backend switch. Returns ------- DataFrameGroupBy or None If inplace=False, returns a new groupby object with the specified backend. If inplace=True, returns None and changes the backend of the current object. Notes ----- When `inplace=True`, this method will move the data between backends for all parent objects (the DataFrame/Series used to create this groupby, and any DataFrames/Series in the `by` list). 
When `inplace=False`, new copies of the parent objects are created with their data in the target backend for the returned groupby object, leaving the original parent objects unchanged. """ def set_instance_variable_backend(arg: Any) -> Any: # groupby object _by and _df fields may include both # QueryCompilerCaster objects and BaseQueryCompiler objects, # so we have to be able to set the backend on both of those. if isinstance(arg, QueryCompilerCaster): result = arg.set_backend( backend=backend, inplace=inplace, switch_operation=switch_operation ) return arg if inplace else result if isinstance(arg, BaseQueryCompiler): # Use a cyclic import here because query compilers themselves # do not implement set_backend(). from modin.pandas import DataFrame return ( DataFrame(query_compiler=arg) .set_backend(backend=backend, inplace=False) ._query_compiler ) return arg new_by = visit_nested_args([self._by], set_instance_variable_backend)[0] new_df = visit_nested_args([self._df], set_instance_variable_backend)[0] if inplace: self._df = new_df self._query_compiler = new_df._query_compiler self._by = new_by return None return type(self)( df=new_df, by=new_by, axis=self._axis, level=self._level, as_index=self._as_index, sort=self._sort, group_keys=self._kwargs["group_keys"], idx_name=self._idx_name, drop=self._drop, backend_pinned=self._backend_pinned, # We have added as_index, sort, group_keys, and level to the kwargs # dictionary, so we need to remove them from the keyword arguments # that we pass to the new DataFrameGroupBy object. 
**{ k: v for k, v in self._kwargs.items() if k not in ["as_index", "sort", "group_keys", "level"] }, ) @_inherit_docstrings(QueryCompilerCaster.is_backend_pinned) def is_backend_pinned(self) -> bool: return self._backend_pinned @_inherit_docstrings(QueryCompilerCaster._set_backend_pinned) def _set_backend_pinned(self, pinned: bool, inplace: bool) -> Optional[Self]: if inplace: self._backend_pinned = pinned return None else: # Create a new groupby object with the updated pinned status new_obj = self._override(backend_pinned=pinned) # Force the correct pinned status since the automatic pinning logic # in query_compiler_caster.py might override it new_obj._backend_pinned = pinned return new_obj @disable_logging @_inherit_docstrings(QueryCompilerCaster._get_query_compiler) def _copy_into(self, other: Self) -> None: # TODO(https://github.com/modin-project/modin/issues/7544): implement # this method to support automatic pre-operation backend switch for # groupby methods. ErrorMessage.not_implemented() def _override(self, **kwargs): """ Override groupby parameters. Parameters ---------- **kwargs : dict Parameters to override. Returns ------- DataFrameGroupBy A groupby object with new parameters. """ new_kw = dict( df=self._df, by=self._by, axis=self._axis, idx_name=self._idx_name, drop=self._drop, backend_pinned=self._backend_pinned, **self._kwargs, ) new_kw.update(kwargs) return type(self)(**new_kw) @disable_logging def __getattr__(self, key): """ Alter regular attribute access, looks up the name in the columns. Parameters ---------- key : str Attribute name. Returns ------- The value of the attribute. 
""" try: return self._getattr__from_extension_impl( key=key, default_behavior_attributes=GROUPBY_EXTENSION_NO_LOOKUP, extensions=__class__._extensions, ) except AttributeError as err: if key != "_columns" and key in self._columns: return self.__getitem__(key) raise err @disable_logging def __getattribute__(self, item: str) -> Any: """ Override __getattribute__, which python calls to access any attribute of an object of this class. We override this method 1) to default to pandas for empty dataframes on non-lazy engines. 2) to get non-method extensions (e.g. properties) Parameters ---------- item : str The name of the attribute to access. Returns ------- Any The value of the attribute. """ if item not in GROUPBY_EXTENSION_NO_LOOKUP: extensions_result = self._getattribute__from_extension_impl( item, __class__._extensions ) if extensions_result is not sentinel: return extensions_result attr = super().__getattribute__(item) if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape: # We default to pandas on empty DataFrames. This avoids a large amount of # pain in underlying implementation and returns a result immediately rather # than dealing with the edge cases that empty DataFrames have. if callable(attr) and self._df.empty and hasattr(self._pandas_class, item): def default_handler(*args, **kwargs): return self._default_to_pandas(item, *args, **kwargs) return default_handler return attr @disable_logging def __setattr__(self, key: str, value) -> None: """ Set an attribute on the object. We override this method to set extension properties. Parameters ---------- key : str The name of the attribute to set. value : Any The value to set the attribute to. Returns ------- None """ # An extension property is only accessible if the backend supports it. 
extension = self._get_extension(key, __class__._extensions) if extension is not sentinel and hasattr(extension, "__set__"): return extension.__set__(self, value) return super().__setattr__(key, value) @disable_logging def __delattr__(self, name: str) -> None: """ Delete an attribute on the object. We override this method to delete extension properties. Parameters ---------- name : str The name of the attribute to delete. Returns ------- None """ # An extension property is only accessible if the backend supports it. extension = self._get_extension(name, __class__._extensions) if extension is not sentinel and hasattr(extension, "__delete__"): return extension.__delete__(self) return super().__delattr__(name) @property def ngroups(self): # noqa: GL08 return len(self) def skew(self, axis=lib.no_default, skipna=True, numeric_only=False, **kwargs): # default behaviour for aggregations; for the reference see # `_op_via_apply` func in pandas==2.0.2 if axis is None or axis is lib.no_default: axis = self._axis if axis != 0 or not skipna: return self._default_to_pandas( lambda df: df.skew( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) ) return self._wrap_aggregation( type(self._query_compiler).groupby_skew, agg_kwargs=kwargs, numeric_only=numeric_only, ) def ffill(self, limit=None): ErrorMessage.single_warning( ".ffill() is implemented using .fillna() in Modin, " + "which can be impacted by pandas bug https://github.com/pandas-dev/pandas/issues/43412 " + "on dataframes with duplicated indices" ) with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=".*fillna with 'method' is deprecated.*", category=FutureWarning, ) return self.fillna(limit=limit, method="ffill") def sem(self, ddof=1, numeric_only=False): return self._wrap_aggregation( type(self._query_compiler).groupby_sem, agg_kwargs=dict(ddof=ddof), numeric_only=numeric_only, ) def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None): return 
def mean(self, numeric_only=False, engine=None, engine_kwargs=None):
    # Per-group mean. The call is delegated to pandas only when a
    # non-cython engine is requested together with explicit engine_kwargs;
    # otherwise the query compiler computes the aggregation.
    # NOTE(review): a non-cython engine passed without engine_kwargs stays on
    # the query-compiler path, so the engine request is not forwarded --
    # confirm this is intended.
    non_default_engine = engine not in ("cython", None)
    if non_default_engine and engine_kwargs is not None:
        return self._default_to_pandas(
            lambda df: df.mean(
                numeric_only=numeric_only,
                engine=engine,
                engine_kwargs=engine_kwargs,
            )
        )

    aggregated = self._wrap_aggregation(
        type(self._query_compiler).groupby_mean,
        agg_kwargs={"numeric_only": numeric_only},
        numeric_only=numeric_only,
    )
    return self._check_index(aggregated)
def idxmax(self, axis=lib.no_default, skipna=True, numeric_only=False):
    # Index label of the per-group maximum.
    #
    # An explicitly passed axis is converted to its axis number before it is
    # used, matching what `shift`, `cumsum`, `rank` and the other methods of
    # this class do. Without that step a label such as "columns" would never
    # compare equal to 1 inside `_deprecate_axis`, so the wrong deprecation
    # message would be emitted and a non-numeric axis would be forwarded to
    # the query compiler.
    if axis is not lib.no_default:
        # An explicit None means "use the groupby's own axis"; for the
        # reference see `_op_via_apply` func in pandas==2.0.2
        if axis is None:
            axis = self._axis
        axis = self._df._get_axis_number(axis)
        self._deprecate_axis(axis, "idxmax")
    else:
        axis = self._axis
    return self._wrap_aggregation(
        type(self._query_compiler).groupby_idxmax,
        agg_kwargs=dict(axis=axis, skipna=skipna),
        numeric_only=numeric_only,
    )

def idxmin(self, axis=lib.no_default, skipna=True, numeric_only=False):
    # Index label of the per-group minimum; kept parallel to `idxmax`.
    #
    # The explicitly passed axis is converted to its axis number first so
    # that `_deprecate_axis` picks the right message and the query compiler
    # always receives a numeric axis.
    if axis is not lib.no_default:
        # An explicit None means "use the groupby's own axis"; for the
        # reference see `_op_via_apply` func in pandas==2.0.2
        if axis is None:
            axis = self._axis
        axis = self._df._get_axis_number(axis)
        self._deprecate_axis(axis, "idxmin")
    else:
        axis = self._axis
    return self._wrap_aggregation(
        type(self._query_compiler).groupby_idxmin,
        agg_kwargs=dict(axis=axis, skipna=skipna),
        numeric_only=numeric_only,
    )
def shift(
    self,
    periods=1,
    freq=None,
    axis=lib.no_default,
    fill_value=lib.no_default,
    suffix=None,
):
    # Shift values within groups. Three strategies are used, chosen from
    # `freq`, the requested `axis`, the groupby axis and the kind of `by`:
    # a plain frame-level shift, a frame-level shift with post-processing,
    # or the query compiler's `groupby_shift`.

    # A truthy `suffix` is not handled here; the whole call goes to pandas.
    if suffix:
        return self._default_to_pandas(
            lambda df: df.shift(
                periods=periods,
                freq=freq,
                axis=axis,
                fill_value=fill_value,
                suffix=suffix,
            )
        )

    # An explicitly passed axis is converted to its number and triggers the
    # deprecation warning; an omitted axis means axis 0.
    if axis is not lib.no_default:
        axis = self._df._get_axis_number(axis)
        self._deprecate_axis(axis, "shift")
    else:
        axis = 0

    def _shift(data, periods, freq, axis, fill_value, is_set_nan_rows=True):
        # Shift `data` as an un-grouped frame. When `is_set_nan_rows` is set,
        # `by` is a query compiler whose columns are all present in `data`,
        # and `by` contains any NaN, rows with a NaN in one of the `by`
        # columns are removed from the shifted result.
        from .dataframe import DataFrame

        result = data.shift(periods, freq, axis, fill_value)

        if (
            is_set_nan_rows
            and isinstance(self._by, BaseQueryCompiler)
            and (
                # Check using `issubset` is effective only in case of MultiIndex
                set(self._by.columns).issubset(list(data.columns))
                if isinstance(self._by.columns, pandas.MultiIndex)
                else len(
                    self._by.columns.unique()
                    .sort_values()
                    .difference(data.columns.unique().sort_values())
                )
                == 0
            )
            and DataFrame(query_compiler=self._by.isna()).any(axis=None)
        ):
            mask_nan_rows = data[self._by.columns].isna().any(axis=1)
            result = result.loc[~mask_nan_rows]
        return result

    if freq is None and axis == 1 and self._axis == 0:
        # Shifting along columns on a row-wise groupby: the frame is shifted
        # as a whole (presumably because group membership does not affect a
        # column-wise shift -- TODO confirm).
        result = _shift(self._df, periods, freq, axis, fill_value)
    elif (
        freq is not None
        and axis == 0
        and self._axis == 0
        and isinstance(self._by, BaseQueryCompiler)
    ):
        # Frequency-based shift along rows with `by` given as a query
        # compiler: shift the whole frame, drop rows with NaN in the `by`
        # columns, then order by the `by` columns when `sort` is set and by
        # index otherwise.
        result = _shift(
            self._df, periods, freq, axis, fill_value, is_set_nan_rows=False
        )
        result = result.dropna(subset=self._by.columns)
        if self._sort:
            result = result.sort_values(list(self._by.columns), axis=axis)
        else:
            result = result.sort_index()
    else:
        # Every remaining combination goes through the query compiler.
        result = self._wrap_aggregation(
            type(self._query_compiler).groupby_shift,
            numeric_only=False,
            agg_kwargs=dict(
                periods=periods, freq=freq, axis=axis, fill_value=fill_value
            ),
        )
    return result
def cumsum(self, axis=lib.no_default, *args, **kwargs):
    # Cumulative sum within each group, computed by the query compiler.
    # Passing `axis` explicitly is deprecated: the value is converted to its
    # axis number and a warning is issued; when omitted, axis 0 is used.
    if axis is lib.no_default:
        axis = 0
    else:
        axis = self._df._get_axis_number(axis)
        self._deprecate_axis(axis, "cumsum")

    forwarded_kwargs = {"axis": axis, **kwargs}
    return self._wrap_aggregation(
        type(self._query_compiler).groupby_cumsum,
        agg_args=args,
        agg_kwargs=forwarded_kwargs,
    )
def filter(self, func, dropna=True, *args, **kwargs):
    # Group filtering is not implemented natively; the call is delegated to
    # the pandas groupby object.
    #
    # `dropna` is forwarded positionally on purpose. It is the second
    # positional parameter of the pandas method, so passing it by keyword
    # ahead of `*args` would make the first extra positional argument bind
    # to `dropna` as well and fail with "got multiple values for argument
    # 'dropna'" whenever extra positional arguments for `func` are given.
    return self._default_to_pandas(
        lambda df: df.filter(func, dropna, *args, **kwargs)
    )
def apply(self, func, *args, include_groups=True, **kwargs):
    # Apply `func` group-wise through the query compiler's `groupby_agg`.
    # The callable is first converted with `cast_function_modin2pandas`;
    # anything that is not a builtin function is additionally wrapped with
    # `wrap_udf_function`.
    pandas_func = cast_function_modin2pandas(func)
    if not isinstance(pandas_func, BuiltinFunctionType):
        pandas_func = wrap_udf_function(pandas_func)

    forwarded_kwargs = dict(kwargs, include_groups=include_groups)
    outcome = self._wrap_aggregation(
        qc_method=type(self._query_compiler).groupby_agg,
        numeric_only=False,
        agg_func=pandas_func,
        agg_args=args,
        agg_kwargs=forwarded_kwargs,
        how="group_wise",
    )

    if isinstance(outcome, Series):
        return self._check_index(outcome)

    # A frame whose only column is the unnamed-series placeholder is really a
    # Series, so collapse it along the column axis.
    unnamed_only = pandas.Index([MODIN_UNNAMED_SERIES_LABEL])
    if outcome.columns.equals(unnamed_only):
        outcome = outcome.squeeze(axis=1)
    return self._check_index(outcome)
def __getitem__(self, key):
    """
    Select a subset of columns from the grouped object.

    Parameters
    ----------
    key : list or str
        Names of columns to use as subset of original object.

    Returns
    -------
    DataFrameGroupBy or SeriesGroupBy
        Result of indexing operation.

    Raises
    ------
    NotImplementedError
        Column lookups on GroupBy with arbitrary Series in by is not yet supported.
    """
    # Constructor arguments shared by both possible result types.
    kwargs = {
        **self._kwargs,
        "by": self._by,
        "axis": self._axis,
        "idx_name": self._idx_name,
    }

    # Result type deduction:
    #   - a list-like `key` always yields a DataFrameGroupBy;
    #   - a single label yields a DataFrameGroupBy when `as_index` is False
    #     (the label is wrapped into a list), and a SeriesGroupBy otherwise;
    #   - where `by` came from plays no role in the decision.
    wants_frame = is_list_like(key)
    if not wants_frame and not self._as_index:
        key = [key]
        wants_frame = True

    if not wants_frame:
        if (
            self._is_multi_by
            and isinstance(self._by, list)
            and not all(hashable(o) and o in self._df for o in self._by)
        ):
            raise NotImplementedError(
                "Column lookups on GroupBy with arbitrary Series in by"
                + " is not yet supported."
            )
        return SeriesGroupBy(
            self._df[key],
            drop=False,
            backend_pinned=self._backend_pinned,
            **kwargs,
        )

    by_labels = frozenset(self._internal_by)
    if by_labels.intersection(key):
        ErrorMessage.mismatch_with_pandas(
            operation="GroupBy.__getitem__",
            message=(
                "intersection of the selection and 'by' columns is not yet supported, "
                + "to achieve the desired result rewrite the original code from:\n"
                + "df.groupby('by_column')['by_column']\n"
                + "to the:\n"
                + "df.groupby(df['by_column'].copy())['by_column']"
            ),
        )

    # The `by` labels come first, followed by the requested columns, in the
    # order given. A dict is used instead of a set because it keeps
    # insertion order (guaranteed since Python 3.7) while still removing
    # duplicates.
    ordered_labels = dict.fromkeys(self._internal_by)
    ordered_labels.update(dict.fromkeys(key))
    key = [label for label in ordered_labels if label in self._df.columns]
    return DataFrameGroupBy(
        self._df[key],
        drop=self._drop,
        backend_pinned=self._backend_pinned,
        **kwargs,
    )
func.__name__ warnings.warn( f"The provided callable {func} is currently using " + f"{type(self).__name__}.{func_name}. In a future version of pandas, " + "the provided callable will be used directly. To keep current " + f"behavior pass the string {func_name} instead.", category=FutureWarning, ) func = func_name do_relabel = None if isinstance(func, dict) or func is None: # the order from `reconstruct_func` cannot be used correctly if there # is more than one columnar partition, since for correct use all columns # must be available within one partition. old_kwargs = dict(kwargs) relabeling_required, func_dict, new_columns, _ = reconstruct_func( func, **kwargs ) if relabeling_required: def do_relabel(obj_to_relabel): # noqa: F811 # unwrap nested labels into one level tuple result_labels = [None] * len(old_kwargs) for idx, labels in enumerate(old_kwargs.values()): if is_scalar(labels) or callable(labels): result_labels[idx] = ( labels if not callable(labels) else labels.__name__ ) continue new_elem = [] for label in labels: if is_scalar(label) or callable(label): new_elem.append( label if not callable(label) else label.__name__ ) else: new_elem.extend(label) result_labels[idx] = tuple(new_elem) new_order = obj_to_relabel.columns.get_indexer(result_labels) new_columns_idx = pandas.Index(new_columns) if not self._as_index: nby_cols = len(obj_to_relabel.columns) - len(new_columns_idx) new_order = np.concatenate([np.arange(nby_cols), new_order]) by_cols = obj_to_relabel.columns[:nby_cols] if by_cols.nlevels != new_columns_idx.nlevels: by_cols = by_cols.remove_unused_levels() empty_levels = [ i for i, level in enumerate(by_cols.levels) if len(level) == 1 and level[0] == "" ] by_cols = by_cols.droplevel(empty_levels) new_columns_idx = by_cols.append(new_columns_idx) result = obj_to_relabel.iloc[:, new_order] result.columns = new_columns_idx return result if any(isinstance(fn, list) for fn in func_dict.values()): # multicolumn case # putting functions in a `list` allows to 
achieve multicolumn in each partition func_dict = { col: fn if isinstance(fn, list) else [fn] for col, fn in func_dict.items() } if ( relabeling_required and not self._as_index and any(col in func_dict for col in self._internal_by) ): ErrorMessage.mismatch_with_pandas( operation="GroupBy.aggregate(**dictionary_renaming_aggregation)", message=( "intersection of the columns to aggregate and 'by' is not yet supported when 'as_index=False', " + "columns with group names of the intersection will not be presented in the result. " + "To achieve the desired result rewrite the original code from:\n" + "df.groupby('by_column', as_index=False).agg(agg_func=('by_column', agg_func))\n" + "to the:\n" + "df.groupby('by_column').agg(agg_func=('by_column', agg_func)).reset_index()" ), ) if any(i not in self._df.columns for i in func_dict.keys()): raise SpecificationError("nested renamer is not supported") if func is None: kwargs = {} func = func_dict elif is_list_like(func): # for list-list aggregation pandas always puts # groups as index in the result, ignoring as_index, # so we have to reset it to default value res = self._override(as_index=True)._wrap_aggregation( qc_method=type(self._query_compiler).groupby_agg, numeric_only=False, agg_func=func, agg_args=args, agg_kwargs=kwargs, how="axis_wise", ) if not self._kwargs["as_index"]: res.reset_index(inplace=True) return res elif callable(func): return self._check_index( self._wrap_aggregation( qc_method=type(self._query_compiler).groupby_agg, numeric_only=False, agg_func=func, agg_args=args, agg_kwargs=kwargs, how="axis_wise", ) ) elif isinstance(func, str): # Using "getattr" here masks possible AttributeError which we throw # in __getattr__, so we should call __getattr__ directly instead. 
def rank(
    self,
    method="average",
    ascending=True,
    na_option="keep",
    pct=False,
    axis=lib.no_default,
):
    # Rank values within each group via the query compiler.
    # `na_option` is validated up front; an explicitly passed `axis` is
    # converted to its axis number and triggers the deprecation warning,
    # while an omitted one means axis 0.
    if na_option not in {"keep", "top", "bottom"}:
        raise ValueError("na_option must be one of 'keep', 'top', or 'bottom'")

    if axis is lib.no_default:
        axis = 0
    else:
        axis = self._df._get_axis_number(axis)
        self._deprecate_axis(axis, "rank")

    rank_options = {
        "method": method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
        "axis": axis,
    }
    return self._wrap_aggregation(
        type(self._query_compiler).groupby_rank,
        agg_kwargs=rank_options,
        numeric_only=False,
    )
def ngroup(self, ascending=True):
    # Number each group via the query compiler's `groupby_ngroup`.
    numbered = self._wrap_aggregation(
        type(self._query_compiler).groupby_ngroup,
        numeric_only=False,
        agg_kwargs={"ascending": ascending},
    )
    if isinstance(numbered, Series):
        return numbered
    # The result should always be a Series with name None and type int64,
    # so a frame-shaped result is collapsed along the column axis.
    return numbered.squeeze(axis=1)
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
    # Group-wise transform. The call is delegated to pandas only when a
    # non-cython engine is requested together with explicit engine_kwargs;
    # otherwise the query compiler's `groupby_agg` runs in "transform" mode.
    non_default_engine = engine not in ("cython", None)
    if non_default_engine and engine_kwargs is not None:
        return self._default_to_pandas(
            lambda df: df.transform(
                func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
            )
        )

    qc_method = type(self._query_compiler).groupby_agg
    return self._wrap_aggregation(
        qc_method=qc_method,
        numeric_only=False,
        agg_func=func,
        agg_args=args,
        agg_kwargs=kwargs,
        how="transform",
    )
def tail(self, n=5):
    # groupby().head()/.tail() ignore as_index, so the aggregation runs on a
    # copy of this object with `as_index` overridden to True.
    work_object = self._override(as_index=True)
    qc_method = type(work_object._query_compiler).groupby_tail
    last_rows = work_object._wrap_aggregation(
        qc_method,
        agg_kwargs={"n": n},
        numeric_only=False,
    )
    return work_object._check_index(last_rows)
def quantile(self, q=0.5, interpolation="linear", numeric_only=False):
    # NOTE(review): kept without a docstring like the neighbouring pandas-API
    # methods -- presumably the docstring is inherited from pandas; confirm.
    #
    # TODO: handle list-like cases properly
    if is_list_like(q):
        # List-like `q` is not supported by the query-compiler path yet, so
        # pandas computes it.  `numeric_only` has to be forwarded as well:
        # previously it was silently dropped here, which made
        # `quantile([..], numeric_only=True)` behave differently from the
        # scalar-`q` path below.
        return self._default_to_pandas(
            lambda df: df.quantile(
                q=q, interpolation=interpolation, numeric_only=numeric_only
            )
        )

    # Scalar `q`: run the aggregation through the query compiler and let
    # `_check_index` align the resulting index with pandas.
    return self._check_index(
        self._wrap_aggregation(
            type(self._query_compiler).groupby_quantile,
            numeric_only=numeric_only,
            agg_kwargs=dict(q=q, interpolation=interpolation),
        )
    )
@property
def _iter(self):
    """
    Build a lazy iterator of (group key, DataFrame) pairs.

    Returns
    -------
    generator
        Generator yielding one ``(key, DataFrame)`` tuple per group. The key is
        wrapped into a 1-tuple when ``self._return_tuple_when_iterating`` is truthy.
        Groups come in sorted key order when ``self._sort`` is truthy, otherwise
        in the order of ``self.indices``.
    """
    from .dataframe import DataFrame

    indices = self.indices
    group_ids = indices.keys()

    # Groups are cut out along rows for axis 0 and along columns otherwise;
    # in both cases `indices` holds the positions that belong to each group.
    if self._axis == 0:

        def select(positions):
            return self._query_compiler.getitem_row_array(positions)

    else:

        def select(positions):
            return self._query_compiler.getitem_column_array(
                positions, numeric=True
            )

    # The iteration order is fixed here, when the property is read; the
    # per-group frames are only materialized while the generator is consumed.
    ordered_ids = sorted(group_ids) if self._sort else group_ids
    return (
        (
            (group_id,) if self._return_tuple_when_iterating else group_id,
            DataFrame(query_compiler=select(indices[group_id])),
        )
        for group_id in ordered_ids
    )
def _wrap_aggregation(
    self,
    qc_method,
    numeric_only=False,
    agg_args=None,
    agg_kwargs=None,
    **kwargs,
):
    """
    Perform common metadata transformations and apply groupby functions.

    Parameters
    ----------
    qc_method : callable
        The query compiler method to call.
    numeric_only : bool, default: False
        If truthy and ``self.ndim == 2``, the aggregation only sees columns with
        a numeric dtype plus the columns listed in ``self._internal_by``.
        Any falsy value (including None) keeps all columns.
    agg_args : list-like, optional
        Positional arguments to pass to the aggregation function.
    agg_kwargs : dict-like, optional
        Keyword arguments to pass to the aggregation function.
    **kwargs : dict
        Keyword arguments to pass to the specified query compiler's method.

    Returns
    -------
    DataFrame or Series
        Returns the same type as `self._df`.
    """
    if agg_args is None:
        agg_args = tuple()
    if agg_kwargs is None:
        agg_kwargs = dict()

    if numeric_only and self.ndim == 2:
        by_cols = self._internal_by

        def keep(label, dtype):
            # Grouping columns are kept even when they are not numeric.
            return is_numeric_dtype(dtype) or label in by_cols

        kept_labels = [
            label
            for label, dtype in self._query_compiler.dtypes.items()
            if keep(label, dtype)
        ]
        source_qc = self._query_compiler.getitem_column_array(kept_labels)
    else:
        source_qc = self._query_compiler

    # The result is wrapped into the same class as the grouped object.
    result_cls = type(self._df)
    result_qc = qc_method(
        source_qc,
        by=self._by,
        axis=self._axis,
        groupby_kwargs=self._kwargs,
        agg_args=agg_args,
        agg_kwargs=agg_kwargs,
        drop=self._drop,
        **kwargs,
    )
    return result_cls(query_compiler=result_qc)
""" if ( isinstance(self._by, type(self._query_compiler)) and len(self._by.columns) == 1 ): by = self._by.columns[0] if self._drop else self._by.to_pandas().squeeze() # converting QC 'by' to a list of column labels only if this 'by' comes from the self (if drop is True) elif self._drop and isinstance(self._by, type(self._query_compiler)): by = list(self._by.columns) else: by = self._by by = try_cast_to_pandas(by, squeeze=True) # Since 'by' may be a 2D query compiler holding columns to group by, # to_pandas will also produce a pandas DataFrame containing them. # So splitting 2D 'by' into a list of 1D Series using 'GroupBy.validate_by': by = GroupBy.validate_by(by) def groupby_on_multiple_columns(df, *args, **kwargs): groupby_obj = df.groupby(by=by, axis=self._axis, **self._kwargs) if callable(f): return f(groupby_obj, *args, **kwargs) else: ErrorMessage.catch_bugs_and_request_email( failure_condition=not isinstance(f, str) ) attribute = getattr(groupby_obj, f) if callable(attribute): return attribute(*args, **kwargs) return attribute return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs) @_inherit_docstrings(pandas.core.groupby.SeriesGroupBy) class SeriesGroupBy(DataFrameGroupBy): # noqa: GL08 _pandas_class = pandas.core.groupby.SeriesGroupBy _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) @disable_logging def __getattribute__(self, item: str) -> Any: """ Get an attribute of the object. Python calls this method for every attribute access. We override it to get extension attributes. Parameters ---------- item : str Attribute name. Returns ------- Any The value of the attribute. 
""" if item not in GROUPBY_EXTENSION_NO_LOOKUP: extensions_result = self._getattribute__from_extension_impl( item, __class__._extensions ) if extensions_result is not sentinel: return extensions_result return super().__getattribute__(item) @_inherit_docstrings(QueryCompilerCaster._getattr__from_extension_impl) def __getattr__(self, key: str) -> Any: return self._getattr__from_extension_impl( key=key, default_behavior_attributes=GROUPBY_EXTENSION_NO_LOOKUP, extensions=__class__._extensions, ) @disable_logging def __setattr__(self, key: str, value: Any) -> None: """ Set an attribute of the object. We override this method to support settable extension attributes. Parameters ---------- key : str Attribute name. value : Any Value to set the attribute to. Returns ------- None """ # An extension property is only accessible if the backend supports it. extension = self._get_extension(key, __class__._extensions) if extension is not sentinel and hasattr(extension, "__set__"): return extension.__set__(self, value) return super().__setattr__(key, value) @disable_logging def __delattr__(self, name: str) -> None: """ Delete an attribute of the object. We override this method to support deletable extension attributes. Parameters ---------- name : str Attribute name. Returns ------- None """ # An extension property is only accessible if the backend supports it. extension = self._get_extension(name, __class__._extensions) if extension is not sentinel and hasattr(extension, "__delete__"): return extension.__delete__(self) return super().__delattr__(name) @property def ndim(self): """ Return 1. Returns ------- int Returns 1. Notes ----- Deprecated and removed in pandas and will be likely removed in Modin. """ return 1 # ndim is always 1 for Series @property def _iter(self): """ Construct a tuple of (group_id, Series) tuples to allow iteration over groups. Returns ------- generator Generator expression of GroupBy object broken down into tuples for iteration. 
""" indices = self.indices group_ids = indices.keys() if self._axis == 0: return ( ( k, Series( query_compiler=self._query_compiler.getitem_row_array( indices[k] ) ), ) for k in (sorted(group_ids) if self._sort else group_ids) ) else: return ( ( k, Series( query_compiler=self._query_compiler.getitem_column_array( indices[k], numeric=True ) ), ) for k in (sorted(group_ids) if self._sort else group_ids) ) def _try_get_str_func(self, fn): """ Try to convert a groupby aggregation function to a string or list of such. Parameters ---------- fn : callable, str, or Iterable Returns ------- str, list If `fn` is a callable, return its name, otherwise return `fn` itself. If `fn` is a string, return it. If `fn` is an Iterable, return a list of _try_get_str_func applied to each element of `fn`. """ if not isinstance(fn, str) and isinstance(fn, Iterable): return [self._try_get_str_func(f) for f in fn] return fn.__name__ if callable(fn) else fn def value_counts( self, normalize: bool = False, sort: bool = True, ascending: bool = False, bins=None, dropna: bool = True, ): # noqa: GL08 return self._default_to_pandas( lambda ser: ser.value_counts( normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna, ) ) def corr(self, other, method="pearson", min_periods=None): return self._wrap_aggregation( type(self._query_compiler).groupby_corr, agg_kwargs=dict(other=other, method=method, min_periods=min_periods), ) def cov(self, other, min_periods=None, ddof=1): return self._wrap_aggregation( type(self._query_compiler).groupby_cov, agg_kwargs=dict(other=other, min_periods=min_periods, ddof=ddof), ) def describe(self, percentiles=None, include=None, exclude=None): return self._default_to_pandas( lambda df: df.describe( percentiles=percentiles, include=include, exclude=exclude ) ) def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) def idxmax(self, axis=lib.no_default, skipna=True): if axis is not lib.no_default: axis = 
def idxmin(self, axis=lib.no_default, skipna=True):
    # NOTE(review): kept without a docstring; the enclosing class is decorated
    # with `_inherit_docstrings`, so it is presumably filled in from pandas.
    if axis is lib.no_default:
        # `axis` was not passed at all: use axis 0 and emit no deprecation.
        axis_number = 0
    else:
        # An explicitly passed `axis` is normalized to its number and then
        # reported through `_deprecate_axis`.
        axis_number = self._df._get_axis_number(axis)
        self._deprecate_axis(axis_number, "idxmin")

    reducer = type(self._query_compiler).groupby_idxmin
    return self._wrap_aggregation(
        reducer,
        agg_kwargs={"axis": axis_number, "skipna": skipna},
    )
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
    # NOTE(review): kept without a docstring; the enclosing class is decorated
    # with `_inherit_docstrings`, so it is presumably filled in from pandas.
    engine_default = engine is None and engine_kwargs is None

    # if func is None, will switch to user-provided "named aggregation" kwargs
    func_is_none = func is None
    if func_is_none:
        columns, func = self._validate_func_kwargs(kwargs)
        kwargs = {}

    # Dicts and list-likes get special treatment only with the default engine;
    # everything else is delegated to the parent implementation.
    if engine_default:
        if isinstance(func, dict):
            raise SpecificationError("nested renamer is not supported")

        if is_list_like(func):
            from .dataframe import DataFrame

            result = DataFrame(
                query_compiler=self._query_compiler.groupby_agg(
                    by=self._by,
                    agg_func=func,
                    axis=self._axis,
                    groupby_kwargs=self._kwargs,
                    agg_args=args,
                    agg_kwargs=kwargs,
                )
            )
            # query compiler always gives result a multiindex on the axis with the
            # function names, but series always gets a regular index on the columns
            # because there is no need to identify which original column's aggregation
            # the new column represents. alternatively we could give the query compiler
            # a hint that it's for a series, not a dataframe.
            # Named aggregation labels the columns with the user's keyword
            # names, a plain list with the (names of the) functions themselves.
            labels = columns if func_is_none else self._try_get_str_func(func)
            return result.set_axis(labels=labels, axis=1, copy=False)

    return super().aggregate(
        func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
    )
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# noqa: MD02
"""
Details about how Indexing Helper Class works.

_LocationIndexerBase provides the method framework for __getitem__ and __setitem__
that work with Modin DataFrame's internal index. The base class's __{get,set}item__
take in partitions & idx_in_partition data and perform the lookup/item write.

_LocIndexer and _iLocIndexer are responsible for indexer-specific logic and
lookup computation. Loc will take care of enlarging the DataFrame. Both indexers
will take care of translating pandas's lookup to Modin DataFrame's internal
lookup.

def compute_sliced_len(slc, sequence_len):
    """
    Count the elements a slice selects from a sequence of a given length.

    Parameters
    ----------
    slc : slice
        Slice object.
    sequence_len : int
        Length of sequence, to which slice will be applied.

    Returns
    -------
    int
        Length of object after applying slice object on it.
    """
    # `slice.indices` resolves omitted/negative/out-of-bounds values against
    # `sequence_len`; the matching range then has exactly the sliced length.
    first, last, stride = slc.indices(sequence_len)
    return len(range(first, last, stride))
def is_integer_slice(x):
    """
    Check that argument is a slice with integer (or omitted) bounds and step.

    Parameters
    ----------
    x : object
        Object to check.

    Returns
    -------
    bool
        True if `x` is a slice whose ``start``, ``stop`` and ``step`` are each
        either None or an integer, False otherwise.
    """
    if not is_slice(x):
        return False
    # A single position that is neither None nor an integer disqualifies the slice.
    return all(
        part is None or is_integer(part) for part in (x.start, x.stop, x.step)
    )
""" if isinstance(indexer, (np.ndarray, Series, pandas.Series)): return np.where(indexer)[0] else: # It's faster to build the resulting numpy array from the reduced amount of data via # `compress` iterator than convert non-numpy-like `indexer` to numpy and apply `np.where`. return np.fromiter( # `itertools.compress` masks `data` with the `selectors` mask, # works about ~10% faster than a pure list comprehension itertools.compress(data=range(len(indexer)), selectors=indexer), dtype=np.int64, ) _ILOC_INT_ONLY_ERROR = """ Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types. """ _one_ellipsis_message = "indexer may only contain one '...' entry" def _compute_ndim(row_loc, col_loc): """ Compute the number of dimensions of result from locators. Parameters ---------- row_loc : list or scalar Row locator. col_loc : list or scalar Column locator. Returns ------- {0, 1, 2} Number of dimensions in located dataset. """ row_scalar = is_scalar(row_loc) or is_tuple(row_loc) col_scalar = is_scalar(col_loc) or is_tuple(col_loc) if row_scalar and col_scalar: ndim = 0 elif row_scalar ^ col_scalar: ndim = 1 else: ndim = 2 return ndim class _LocationIndexerBase(QueryCompilerCaster, ClassLogger): """ Base class for location indexer like loc and iloc. Parameters ---------- modin_df : Union[DataFrame, Series] DataFrame to operate on. """ df: Union[DataFrame, Series] qc: BaseQueryCompiler _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) def is_backend_pinned(self) -> bool: """ Get whether this object's data is pinned to a particular backend. Returns ------- bool True if the data is pinned. """ return self.df.is_backend_pinned() def _set_backend_pinned(self, pinned: bool, inplace: bool = False): """ Update whether this object's data is pinned to a particular backend. Parameters ---------- pinned : bool Whether the data is pinned. 
def _validate_key_length(self, key: tuple) -> tuple:  # noqa: GL08
    # Implementation copied from pandas (written here as a loop instead of
    # recursion).  A key may only be longer than the number of dimensions of
    # `self.df` because of a single leading `...`, which is dropped.
    while len(key) > self.df.ndim:
        if key[0] is not Ellipsis:
            raise IndexingError("Too many indexers")
        # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3]
        key = key[1:]
        if Ellipsis in key:
            raise IndexingError(_one_ellipsis_message)
    return key
def _get_pandas_object_from_qc_view(
    self,
    qc_view,
    row_multiindex_full_lookup: bool,
    col_multiindex_full_lookup: bool,
    row_scalar: bool,
    col_scalar: bool,
    ndim: int,
):
    """
    Wrap the query compiler view into an object of the expected dimensionality.

    Parameters
    ----------
    qc_view : BaseQueryCompiler
        Query compiler to convert.
    row_multiindex_full_lookup : bool
        See _multiindex_possibly_contains_key.__doc__.
    col_multiindex_full_lookup : bool
        See _multiindex_possibly_contains_key.__doc__.
    row_scalar : bool
        Whether indexer for rows is scalar.
    col_scalar : bool
        Whether indexer for columns is scalar.
    ndim : {0, 1, 2}
        Number of dimensions in dataset to be retrieved.

    Returns
    -------
    modin.pandas.DataFrame or modin.pandas.Series
        Object built with ``self.df.__constructor__`` from `qc_view`; returned
        as is for 2D results and non-scalar row lookups on a Series, squeezed
        along the chosen axis otherwise.

    Notes
    -----
    Usage of `slice(None)` as a lookup is a hack to pass information about
    full-axis grab without computing actual indices that triggers lazy computations.
    Ideally, this API should get rid of using slices as indexers and either use a
    common ``Indexer`` object or range and ``np.ndarray`` only.
    """
    result = self.df.__constructor__(query_compiler=qc_view)
    owner_is_series = isinstance(self.df, Series)

    # Nothing to squeeze: a 2D result, or a Series indexed with a non-scalar.
    if ndim == 2 or (owner_is_series and not row_scalar):
        return result

    if owner_is_series:
        squeeze_axis = 0
    elif ndim == 0:
        squeeze_axis = None
    elif (col_scalar and row_scalar) or (
        row_multiindex_full_lookup and col_multiindex_full_lookup
    ):
        # ndim == 1 from here on.  An exact value on both axes: squeeze both.
        squeeze_axis = None
    elif col_scalar or col_multiindex_full_lookup:
        # A single column was selected, so squeeze the column axis.
        squeeze_axis = 1
    else:
        # Otherwise a single row was selected, so squeeze the row axis.
        squeeze_axis = 0

    return result.squeeze(axis=squeeze_axis)
if axis == 0: assert len(col_lookup) == 1 self.df[self.df.columns[col_lookup][0]] = item # This is True when we are assigning to a full row. We want to reuse the setitem # mechanism to operate along only one axis for performance reasons. elif axis == 1: if hasattr(item, "_query_compiler"): if isinstance(item, DataFrame): item = item.squeeze(axis=0) item = item._query_compiler assert len(row_lookup) == 1 new_qc = self.qc.setitem(1, self.qc.index[row_lookup[0]], item) self.df._create_or_update_from_compiler(new_qc, inplace=True) # Assignment to both axes. else: new_qc = self.qc.write_items(row_lookup, col_lookup, item) self.df._create_or_update_from_compiler(new_qc, inplace=True) self.qc = self.df._query_compiler def _determine_setitem_axis(self, row_lookup, col_lookup, row_scalar, col_scalar): """ Determine an axis along which we should do an assignment. Parameters ---------- row_lookup : slice or list Indexer for rows. col_lookup : slice or list Indexer for columns. row_scalar : bool Whether indexer for rows is scalar or not. col_scalar : bool Whether indexer for columns is scalar or not. Returns ------- int or None None if this will be a both axis assignment, number of axis to assign in other cases. 
def _parse_row_and_column_locators(self, tup):
    """
    Split the user's getitem/setitem key into row and column locators.

    loc[a] -> ([a], :), 1D
    loc[[a,b]] -> ([a,b], :),
    loc[a,b] -> ([a], [b]), 0D

    Parameters
    ----------
    tup : tuple
        User input to unpack.

    Returns
    -------
    row_loc : scalar or list
        Row locator(s) as a scalar or List.
    col_loc : scalar or list
        Column locator(s) as a scalar or List.
    ndim : {0, 1, 2}
        Number of dimensions of located dataset.
    """
    if is_tuple(tup):
        row_loc = tup[0]
        # Only a two-element tuple carries a column locator; otherwise the
        # whole column axis is selected.
        col_loc = tup[1] if len(tup) == 2 else slice(None)
        if len(tup) > 2:
            raise IndexingError("Too many indexers")
    else:
        row_loc, col_loc = tup, slice(None)

    # Callable locators are evaluated against the object being indexed.
    if callable(row_loc):
        row_loc = row_loc(self.df)
    if callable(col_loc):
        col_loc = col_loc(self.df)

    ndim = _compute_ndim(row_loc, col_loc)
    return row_loc, col_loc, ndim
def _handle_boolean_masking(self, row_loc, col_loc):
    """
    Apply a boolean row mask first, then index the columns of the masked result.

    Unlike the regular ``loc/iloc.__getitem__`` flow, the Modin Series mask is
    handed to the query compiler as a query compiler, so it is not materialized
    here (whether it is materialized later depends on the selected execution).

    Parameters
    ----------
    row_loc : modin.pandas.Series of bool dtype
        Boolean mask to index rows with.
    col_loc : object
        An indexer along column axis.

    Returns
    -------
    modin.pandas.DataFrame or modin.pandas.Series
        Located dataset.
    """
    is_modin_series = isinstance(row_loc, Series)
    ErrorMessage.catch_bugs_and_request_email(
        failure_condition=not is_modin_series,
        extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
    )

    masked_qc = self.qc.getitem_array(row_loc._query_compiler)
    masked = self.df.__constructor__(query_compiler=masked_qc)

    if not isinstance(masked, Series):
        # Rows are already filtered, so only the column indexer remains to be
        # applied; `slice(None)` keeps every (masked) row.
        indexer = type(self)(masked)
        return indexer[slice(None), col_loc]

    # A Series has no column axis to index further.
    assert col_loc == slice(None)
    return masked
def _helper_for__getitem__(self, key, row_loc, col_loc, ndim):
    """
    Retrieve dataset according to `key`, `row_loc`, and `col_loc`.

    Parameters
    ----------
    key : callable, scalar, or tuple
        The original key passed to ``loc``. It is only consulted at the very end:
        when it is a ``pandas.Index`` and all columns are selected, it becomes
        the index of the result.
    row_loc : callable, scalar, or slice
        Row locator(s) as a scalar or List.
    col_loc : callable, scalar, or slice
        Column locator(s) as a scalar or List.
    ndim : int
        The number of dimensions of the returned object.

    Returns
    -------
    modin.pandas.DataFrame or modin.pandas.Series
        Located dataset.
    """
    row_scalar = is_scalar(row_loc)
    col_scalar = is_scalar(col_loc)

    # The thought process here is that we should check to see that we have a full key lookup
    # for a MultiIndex DataFrame. If that's the case, then we should not drop any levels
    # since our resulting intermediate dataframe will have dropped these for us already.
    # Thus, we need to make sure we don't try to drop these levels again. The logic here is
    # kind of hacked together. Ideally, we should handle this properly in the lower-level
    # implementations, but this will have to be engineered properly later.
    row_multiindex_full_lookup = self._multiindex_possibly_contains_key(
        axis=0, key=row_loc
    )
    col_multiindex_full_lookup = self._multiindex_possibly_contains_key(
        axis=1, key=col_loc
    )
    levels_already_dropped = (
        row_multiindex_full_lookup or col_multiindex_full_lookup
    )

    # A boolean Modin Series mask is handled by the dedicated masking path.
    if isinstance(row_loc, Series) and is_boolean_array(row_loc):
        return self._handle_boolean_masking(row_loc, col_loc)

    qc_view = self.qc.take_2d_labels(row_loc, col_loc)
    result = self._get_pandas_object_from_qc_view(
        qc_view,
        row_multiindex_full_lookup,
        col_multiindex_full_lookup,
        row_scalar,
        col_scalar,
        ndim,
    )

    # Record the object this Series was located from.
    if isinstance(result, Series):
        result._parent = self.df
        result._parent_axis = 0

    # Normalize scalar locators to one-element lists so that they can be
    # compared level by level below; non-scalar locators are used as-is.
    col_loc_as_list = [col_loc] if col_scalar else col_loc
    row_loc_as_list = [row_loc] if row_scalar else row_loc
    # Pandas drops the levels that are in the `loc`, so we have to as well.
    if (
        isinstance(result, (Series, DataFrame))
        and result._query_compiler.has_multiindex()
        and not levels_already_dropped
    ):
        # For a Series result, first try to match the column locator against
        # the leading index levels; otherwise fall back to the row locator.
        if (
            isinstance(result, Series)
            and not isinstance(col_loc_as_list, slice)
            and all(
                col_loc_as_list[i] in result.index.levels[i]
                for i in range(len(col_loc_as_list))
            )
        ):
            result.index = result.index.droplevel(list(range(len(col_loc_as_list))))
        elif not isinstance(row_loc_as_list, slice) and all(
            not isinstance(row_loc_as_list[i], slice)
            and row_loc_as_list[i] in result.index.levels[i]
            for i in range(len(row_loc_as_list))
        ):
            result.index = result.index.droplevel(list(range(len(row_loc_as_list))))

    # Same level dropping for MultiIndex columns of a DataFrame result.
    if (
        isinstance(result, DataFrame)
        and not isinstance(col_loc_as_list, slice)
        and not levels_already_dropped
        and result._query_compiler.has_multiindex(axis=1)
        and all(
            col_loc_as_list[i] in result.columns.levels[i]
            for i in range(len(col_loc_as_list))
        )
    ):
        result.columns = result.columns.droplevel(list(range(len(col_loc_as_list))))
    # This is done for cases where the index passed in has other state, like a
    # frequency in the case of DateTimeIndex.
    if (
        row_loc is not None
        and isinstance(col_loc, slice)
        and col_loc == slice(None)
        and isinstance(key, pandas.Index)
    ):
        result.index = key
    return result
def _setitem_with_new_columns(self, row_loc, col_loc, item):
    """
    Assign `item` at `row_loc`/`col_loc` when some of the columns do not exist yet.

    Missing column labels are appended to the column axis (filled with NaN)
    before the value is written through the existing-location path.

    Parameters
    ----------
    row_loc : scalar, slice, list, array or tuple
        Row locator.
    col_loc : list, array or tuple
        Columns locator.
    item : modin.pandas.DataFrame, modin.pandas.Series or scalar
        Value that should be assigned to located dataset.
    """
    if is_list_like(item) and not isinstance(item, (DataFrame, Series)):
        item = np.array(item)
        # A 1-D value can only feed a single column; otherwise its last
        # dimension must line up with the number of column labels.
        if len(item.shape) == 1:
            length_mismatch = len(col_loc) != 1
        else:
            length_mismatch = item.shape[-1] != len(col_loc)
        if length_mismatch:
            raise ValueError(
                "Must have equal len keys and value when setting with an iterable"
            )

    already_present = np.isin(col_loc, self.qc.columns.values)
    if not all(already_present):
        # In this case we have some new cols and some old ones
        extended_columns = self.qc.columns
        for position, present in enumerate(already_present):
            if present:
                continue
            extended_columns = extended_columns.insert(
                len(extended_columns), col_loc[position]
            )
        self.qc = self.qc.reindex(labels=extended_columns, axis=1, fill_value=np.nan)
        self.df._update_inplace(new_query_compiler=self.qc)

    self._set_item_existing_loc(row_loc, np.array(col_loc), item)
    self.qc = self.df._query_compiler
""" if ( isinstance(row_loc, Series) and is_boolean_array(row_loc) and is_scalar(item) ): new_qc = self.df._query_compiler.setitem_bool( row_loc._query_compiler, col_loc, item ) self.df._update_inplace(new_qc) self.qc = self.df._query_compiler return row_lookup, col_lookup = self.qc.get_positions_from_labels(row_loc, col_loc) if isinstance(item, np.ndarray) and is_boolean_array(row_loc): # fix for 'test_loc_series'; np.log(Series) returns nd.array instead # of Series as it was before (`Series.__array_wrap__` is removed) # otherwise incompatible shapes are obtained item = item.take(row_lookup) self._setitem_positional( row_lookup, col_lookup, item, axis=self._determine_setitem_axis( row_lookup, col_lookup, is_scalar(row_loc), is_scalar(col_loc) ), ) def _check_missing_loc(self, row_loc, col_loc): """ Help `__setitem__` compute whether an axis needs appending. Parameters ---------- row_loc : scalar, slice, list, array or tuple Row locator. col_loc : scalar, slice, list, array or tuple Columns locator. Returns ------- int or None : 0 if new row, 1 if new column, None if neither. """ if is_scalar(row_loc): return 0 if row_loc not in self.qc.index else None elif isinstance(row_loc, list): missing_labels = self._compute_enlarge_labels( pandas.Index(row_loc), self.qc.index ) if len(missing_labels) > 1: # We cast to list to copy pandas' error: # In pandas, we get: KeyError: [a, b,...] not in index # If we don't convert to list we get: KeyError: [a b ...] not in index raise KeyError("{} not in index".format(list(missing_labels))) if ( not (is_list_like(row_loc) or isinstance(row_loc, slice)) and row_loc not in self.qc.index ): return 0 if ( isinstance(col_loc, list) and len(pandas.Index(col_loc).difference(self.qc.columns)) >= 1 ): return 1 if is_scalar(col_loc) and col_loc not in self.qc.columns: return 1 return None def _compute_enlarge_labels(self, locator, base_index): """ Help to _enlarge_axis, compute common labels and extra labels. 
Parameters ---------- locator : pandas.Index Index from locator. base_index : pandas.Index Current index. Returns ------- nan_labels : pandas.Index The labels that need to be added. """ # base_index_type can be pd.Index or pd.DatetimeIndex # depending on user input and pandas behavior # See issue #2264 base_as_index = pandas.Index(list(base_index)) locator_as_index = pandas.Index(list(locator)) if locator_as_index.inferred_type == "boolean": if len(locator_as_index) != len(base_as_index): raise ValueError( f"Item wrong length {len(locator_as_index)} instead of {len(base_as_index)}!" ) common_labels = base_as_index[locator_as_index] nan_labels = pandas.Index([]) else: common_labels = locator_as_index.intersection(base_as_index) nan_labels = locator_as_index.difference(base_as_index) if len(common_labels) == 0: raise KeyError( "None of [{labels}] are in the [{base_index_name}]".format( labels=list(locator_as_index), base_index_name=base_as_index ) ) return nan_labels class _iLocIndexer(_LocationIndexerBase): """ An indexer for modin_df.iloc[] functionality. Parameters ---------- modin_df : Union[DataFrame, Series] DataFrame to operate on. """ _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) def __getitem__(self, key): """ Retrieve dataset according to `key`. Parameters ---------- key : callable or tuple The global row numbers to retrieve data from. Returns ------- DataFrame or Series Located dataset. 
def __setitem__(self, key, item):
    """
    Assign `item` value to dataset located by `key`.

    Parameters
    ----------
    key : callable or tuple
        The global row numbers to assign data to.
    item : modin.pandas.DataFrame, modin.pandas.Series or scalar
        Value that should be assigned to located dataset.

    See Also
    --------
    pandas.DataFrame.iloc
    """
    if not self.df.empty:
        row_loc, col_loc, _ = self._parse_row_and_column_locators(key)
        scalar_flags = (is_scalar(row_loc), is_scalar(col_loc))
        # Positional indexers must be integer-like or boolean arrays.
        for locator in (row_loc, col_loc):
            self._check_dtypes(locator)
        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        assignment_axis = self._determine_setitem_axis(
            row_lookup, col_lookup, *scalar_flags
        )
        self._setitem_positional(
            row_lookup, col_lookup, item, axis=assignment_axis
        )
        return

    # Empty frame: perform the assignment through the pandas fallback and
    # adopt the query compiler of the result.
    def _iloc(df):
        df.iloc[key] = item
        return df

    fallback_result = self.df._default_to_pandas(_iloc)
    self.df._update_inplace(new_query_compiler=fallback_result._query_compiler)
    self.qc = self.df._query_compiler
""" lookups = [] for axis, axis_loc in enumerate((row_loc, col_loc)): if is_scalar(axis_loc): axis_loc = np.array([axis_loc]) if isinstance(axis_loc, slice): axis_lookup = ( axis_loc if axis_loc == slice(None) else pandas.RangeIndex( *axis_loc.indices(len(self.qc.get_axis(axis))) ) ) elif is_range_like(axis_loc): axis_lookup = pandas.RangeIndex( axis_loc.start, axis_loc.stop, axis_loc.step ) elif is_boolean_array(axis_loc): axis_lookup = boolean_mask_to_numeric(axis_loc) else: if isinstance(axis_loc, pandas.Index): axis_loc = axis_loc.values elif is_list_like(axis_loc) and not isinstance(axis_loc, np.ndarray): # `Index.__getitem__` works much faster with numpy arrays than with python lists, # so although we lose some time here on converting to numpy, `Index.__getitem__` # speedup covers the loss that we gain here. axis_loc = np.array(axis_loc, dtype=np.int64) # Relatively fast check allows us to not trigger `self.qc.get_axis()` computation # if there're no negative indices and so they don't not depend on the axis length. if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any(): axis_lookup = axis_loc else: axis_lookup = pandas.RangeIndex(len(self.qc.get_axis(axis)))[ axis_loc ] if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup): axis_lookup = axis_lookup.values lookups.append(axis_lookup) return lookups def _check_dtypes(self, locator): """ Check that `locator` is an integer scalar, integer slice, integer list or array of booleans. Parameters ---------- locator : scalar, list, slice or array Object to check. Raises ------ ValueError If check fails. 
""" is_int = is_integer(locator) is_int_slice = is_integer_slice(locator) is_int_arr = is_integer_array(locator) is_bool_arr = is_boolean_array(locator) if not any([is_int, is_int_slice, is_int_arr, is_bool_arr]): raise ValueError(_ILOC_INT_ONLY_ERROR) ================================================ FILE: modin/pandas/io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Implement I/O public API as pandas does. Almost all docstrings for public and magic methods should be inherited from pandas for better maintability. Manually add documentation for methods which are not presented in pandas. 
""" from __future__ import annotations import csv import inspect import pathlib import pickle from typing import ( IO, TYPE_CHECKING, Any, AnyStr, Callable, Dict, Hashable, Iterable, Iterator, List, Literal, Optional, Pattern, Sequence, Union, ) import numpy as np import pandas from pandas._libs.lib import NoDefault, no_default from pandas._typing import ( CompressionOptions, ConvertersArg, CSVEngine, DtypeArg, DtypeBackend, FilePath, IndexLabel, IntStrT, ParseDatesArg, ReadBuffer, ReadCsvBuffer, StorageOptions, XMLParsers, ) from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults from modin.config import ModinNumpy from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( ProtocolDataframe, ) from modin.core.storage_formats.pandas.query_compiler_caster import ( wrap_free_function_in_argument_caster, ) from modin.logging import ClassLogger, enable_logging from modin.utils import ( SupportsPrivateToNumPy, SupportsPublicToNumPy, SupportsPublicToPandas, _inherit_docstrings, _maybe_warn_on_default, classproperty, expanduser_path_arg, ) # below logic is to handle circular imports without errors if TYPE_CHECKING: from .dataframe import DataFrame from .series import Series class ModinObjects: """Lazily import Modin classes and provide an access to them.""" _dataframe = None @classproperty def DataFrame(cls): """Get ``modin.pandas.DataFrame`` class.""" if cls._dataframe is None: from .dataframe import DataFrame cls._dataframe = DataFrame return cls._dataframe def _read(**kwargs): """ Read csv file from local disk. Parameters ---------- **kwargs : dict Keyword arguments in pandas.read_csv. 
@_inherit_docstrings(pandas.read_xml, apilink="pandas.read_xml")
@enable_logging
@wrap_free_function_in_argument_caster("read_xml")
@expanduser_path_arg("path_or_buffer")
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
) -> DataFrame:
    # NOTE: no docstring on purpose, it is inherited from ``pandas.read_xml``.
    #
    # Snapshot the arguments before any other local name is bound, and copy
    # them into an independent dict. Forwarding the frame's locals mapping
    # directly is fragile: on interpreters where frame locals are a live view
    # (PEP 667, Python 3.13+), helper variables bound later in this function
    # would leak into the ``pandas.read_xml`` call as unexpected keywords.
    kwargs = dict(locals())
    # This reader always falls back to the pandas implementation.
    _maybe_warn_on_default("read_xml")
    return ModinObjects.DataFrame(pandas.read_xml(**kwargs))
Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | NoDefault = no_default, index_col: IndexLabel | Literal[False] | None = None, usecols=None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, converters=None, true_values=None, false_values=None, skipinitialspace: bool = False, skiprows=None, skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling na_values=None, keep_default_na: bool = True, na_filter: bool = True, verbose: bool = no_default, skip_blank_lines: bool = True, # Datetime Handling parse_dates=None, infer_datetime_format: bool = no_default, keep_date_col: bool = no_default, date_parser=no_default, date_format=None, dayfirst: bool = False, cache_dates: bool = True, # Iteration iterator: bool = False, chunksize: int | None = None, # Quoting, Compression, and File Format compression: CompressionOptions = "infer", thousands: str | None = None, decimal: str = ".", lineterminator: str | None = None, quotechar: str = '"', quoting: int = csv.QUOTE_MINIMAL, doublequote: bool = True, escapechar: str | None = None, comment: str | None = None, encoding: str | None = None, encoding_errors: str | None = "strict", dialect: str | csv.Dialect | None = None, # Error Handling on_bad_lines="error", # Internal delim_whitespace: bool = no_default, low_memory=_c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> DataFrame | TextFileReader: # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { val.name for val in inspect.signature(pandas.read_csv).parameters.values() } _, _, _, f_locals = inspect.getargvalues(inspect.currentframe()) kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature} return _read(**kwargs) 
@_inherit_docstrings(pandas.read_table, apilink="pandas.read_table")
@enable_logging
@wrap_free_function_in_argument_caster("read_table")
@expanduser_path_arg("filepath_or_buffer")
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | NoDefault = no_default,
    delimiter: str | None | NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | NoDefault = no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = no_default,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format: bool = no_default,
    keep_date_col: bool = no_default,
    date_parser=no_default,
    date_format: str = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines="error",
    # Internal
    delim_whitespace: bool = no_default,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
) -> DataFrame | TextFileReader:
    # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args
    # Parameter names accepted by the installed `pandas.read_table`.
    _pd_read_table_signature = {
        val.name for val in inspect.signature(pandas.read_table).parameters.values()
    }
    _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
    # A separator left at `no_default` (or passed as `False`) becomes a tab
    # before the arguments are handed to the shared `_read` helper.
    if f_locals.get("sep", sep) is False or f_locals.get("sep", sep) is no_default:
        f_locals["sep"] = "\t"
    # Keep only the locals that are parameters of `pandas.read_table`.
    kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_table_signature}
    return _read(**kwargs)


@_inherit_docstrings(pandas.read_parquet, apilink="pandas.read_parquet")
@enable_logging
@wrap_free_function_in_argument_caster("read_parquet")
@expanduser_path_arg("path")
def read_parquet(
    path,
    engine: str = "auto",
    columns: list[str] | None = None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = no_default,
    dtype_backend=no_default,
    filesystem=None,
    filters=None,
    **kwargs,
) -> DataFrame:
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Reject an explicit `dtype_backend` together with the fastparquet engine
    # before any reading is attempted.
    if engine == "fastparquet" and dtype_backend is not no_default:
        raise ValueError(
            "The 'dtype_backend' argument is not supported for the fastparquet engine"
        )

    # Every named argument is forwarded explicitly; extra keywords pass through.
    return ModinObjects.DataFrame(
        query_compiler=FactoryDispatcher.read_parquet(
            path=path,
            engine=engine,
            columns=columns,
            storage_options=storage_options,
            use_nullable_dtypes=use_nullable_dtypes,
            dtype_backend=dtype_backend,
            filesystem=filesystem,
            filters=filters,
            **kwargs,
        )
    )


@_inherit_docstrings(pandas.read_json, apilink="pandas.read_json")
@enable_logging
@wrap_free_function_in_argument_caster("read_json")
@expanduser_path_arg("path_or_buf")
def read_json(
    path_or_buf,
    *,
    orient: str | None = None,
    typ: Literal["frame", "series"] = "frame",
    dtype: DtypeArg | None = None,
    convert_axes=None,
    convert_dates: bool | list[str] = True,
    keep_default_dates: bool = True,
    precise_float: bool = False,
    date_unit: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    lines: bool = False,
    chunksize: int | None = None,
    compression: CompressionOptions = "infer",
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
    engine="ujson",
) -> DataFrame | Series | pandas.io.json._json.JsonReader:
    # Snapshot the call arguments first: the import below binds a new local
    # name, which would otherwise show up in the snapshot as well.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # NOTE(review): the annotation also lists Series and JsonReader, but this
    # line always wraps the dispatcher result in a DataFrame — confirm.
    return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.read_json(**kwargs))


@_inherit_docstrings(pandas.read_gbq, apilink="pandas.read_gbq")
@enable_logging
@wrap_free_function_in_argument_caster("read_gbq")
def read_gbq(
    query: str,
    project_id: str | None = None,
    index_col: str | None = None,
    col_order: list[str] | None = None,
    reauth: bool = False,
    auth_local_webserver: bool = True,
    dialect: str | None = None,
    location: str | None = None,
    configuration: dict[str, Any] | None = None,
    credentials=None,
    use_bqstorage_api: bool | None = None,
    max_results: int | None = None,
    progress_bar_type: str | None = None,
) -> DataFrame:
    # Snapshot the call arguments before anything else is bound locally.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    # Merge a nested "kwargs" entry into the top level when one is present.
    # NOTE(review): this signature declares no `**kwargs`, so the pop
    # presumably falls back to the empty dict — confirm.
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.read_gbq(**kwargs))


@_inherit_docstrings(pandas.read_html, apilink="pandas.read_html")
@enable_logging
@wrap_free_function_in_argument_caster("read_html")
@expanduser_path_arg("io")
def read_html(
    io,
    *,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
    storage_options: StorageOptions = None,
) -> list[DataFrame]:  # noqa: PR01, RT01, D200
    """
    Read HTML tables into a ``DataFrame`` object.
    """
    # Snapshot the call arguments before the local import adds another name.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # The dispatcher returns an iterable of query compilers; each one is
    # wrapped in its own Modin DataFrame.
    qcs = FactoryDispatcher.read_html(**kwargs)
    return [ModinObjects.DataFrame(query_compiler=qc) for qc in qcs]


@_inherit_docstrings(pandas.read_clipboard, apilink="pandas.read_clipboard")
@enable_logging
@wrap_free_function_in_argument_caster("read_clipboard")
def read_clipboard(
    sep=r"\s+",
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
    **kwargs,
) -> DataFrame:  # pragma: no cover  # noqa: PR01, RT01, D200
    """
    Read text from clipboard and pass to read_csv.
    """
    # Rebinds `kwargs` to the snapshot of all call arguments; the caller's
    # extra keywords are stored in it under the "kwargs" key.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    # Flatten those extra keywords into the top level.
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return ModinObjects.DataFrame(
        query_compiler=FactoryDispatcher.read_clipboard(**kwargs)
    )


@_inherit_docstrings(pandas.read_excel, apilink="pandas.read_excel")
@enable_logging
@wrap_free_function_in_argument_caster("read_excel")
@expanduser_path_arg("io")
def read_excel(
    io,
    sheet_name: str | int | list[IntStrT] | None = 0,
    *,
    header: int | Sequence[int] | None = 0,
    names: list[str] | None = None,
    index_col: int | Sequence[int] | None = None,
    usecols: (
        int | str | Sequence[int] | Sequence[str] | Callable[[str], bool] | None
    ) = None,
    dtype: DtypeArg | None = None,
    engine: Literal[("xlrd", "openpyxl", "odf", "pyxlsb")] | None = None,
    converters: dict[str, Callable] | dict[int, Callable] | None = None,
    true_values: Iterable[Hashable] | None = None,
    false_values: Iterable[Hashable] | None = None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
    nrows: int | None = None,
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    parse_dates: list | dict | bool = False,
    date_parser: Union[Callable, NoDefault] = no_default,
    date_format=None,
    thousands: str | None = None,
    decimal: str = ".",
    comment: str | None = None,
    skipfooter: int = 0,
    storage_options: StorageOptions = None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
    engine_kwargs: Optional[dict] = None,
) -> DataFrame | dict[IntStrT, DataFrame]:
    # Snapshot the call arguments before the local import adds another name.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    intermediate = FactoryDispatcher.read_excel(**kwargs)
    if isinstance(intermediate, dict):
        # A dict result is rebuilt as the same dict type, with every value
        # wrapped in a Modin DataFrame under its original key.
        parsed = type(intermediate)()
        for key in intermediate.keys():
            parsed[key] = ModinObjects.DataFrame(query_compiler=intermediate.get(key))
        return parsed
    else:
        # Any other result is treated as a single query compiler.
        return ModinObjects.DataFrame(query_compiler=intermediate)


@_inherit_docstrings(pandas.read_hdf, apilink="pandas.read_hdf")
@enable_logging
@wrap_free_function_in_argument_caster("read_hdf")
@expanduser_path_arg("path_or_buf")
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):  # noqa: PR01, RT01, D200
    """
    Read data from the store into DataFrame.
""" _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) kwargs.update(kwargs.pop("kwargs", {})) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.read_hdf(**kwargs)) @_inherit_docstrings(pandas.read_feather, apilink="pandas.read_feather") @enable_logging @wrap_free_function_in_argument_caster("read_feather") @expanduser_path_arg("path") def read_feather( path, columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> DataFrame: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.read_feather(**kwargs) ) @_inherit_docstrings(pandas.read_stata) @enable_logging @wrap_free_function_in_argument_caster("read_stata") @expanduser_path_arg("filepath_or_buffer") def read_stata( filepath_or_buffer, *, convert_dates: bool = True, convert_categoricals: bool = True, index_col: str | None = None, convert_missing: bool = False, preserve_dtypes: bool = True, columns: Sequence[str] | None = None, order_categoricals: bool = True, chunksize: int | None = None, iterator: bool = False, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> DataFrame | pandas.io.stata.StataReader: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.read_stata(**kwargs)) @_inherit_docstrings(pandas.read_sas, apilink="pandas.read_sas") @enable_logging @wrap_free_function_in_argument_caster("read_sas") @expanduser_path_arg("filepath_or_buffer") def read_sas( filepath_or_buffer, *, format: str | None = None, index: Hashable | None = None, 
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
) -> DataFrame | pandas.io.sas.sasreader.ReaderBase:  # noqa: PR01, RT01, D200
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Arguments are forwarded by name, so no frame snapshot is needed here.
    return ModinObjects.DataFrame(
        query_compiler=FactoryDispatcher.read_sas(
            filepath_or_buffer=filepath_or_buffer,
            format=format,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
            iterator=iterator,
            compression=compression,
        )
    )


@_inherit_docstrings(pandas.read_pickle, apilink="pandas.read_pickle")
@enable_logging
@wrap_free_function_in_argument_caster("read_pickle")
@expanduser_path_arg("filepath_or_buffer")
def read_pickle(
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame | Series:
    # Snapshot the call arguments before the local import adds another name.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return ModinObjects.DataFrame(
        query_compiler=FactoryDispatcher.read_pickle(**kwargs)
    )


@_inherit_docstrings(pandas.read_sql, apilink="pandas.read_sql")
@enable_logging
@wrap_free_function_in_argument_caster("read_sql")
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
    dtype=None,
) -> DataFrame | Iterator[DataFrame]:  # noqa: PR01, RT01, D200
    """
    Read SQL query or database table into a DataFrame.
""" _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if kwargs.get("chunksize") is not None: _maybe_warn_on_default("Parameters provided [chunksize]") df_gen = pandas.read_sql(**kwargs) return ( ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) for df in df_gen ) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs)) @_inherit_docstrings(pandas.read_fwf, apilink="pandas.read_fwf") @enable_logging @wrap_free_function_in_argument_caster("read_fwf") @expanduser_path_arg("filepath_or_buffer") def read_fwf( filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], *, colspecs="infer", widths=None, infer_nrows=100, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, iterator: bool = False, chunksize: Optional[int] = None, **kwds, ) -> DataFrame | TextFileReader: # noqa: PR01, RT01, D200 """ Read a table of fixed-width formatted lines into DataFrame. 
""" from pandas.io.parsers.base_parser import parser_defaults from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) kwargs.update(kwargs.pop("kwds", {})) target_kwargs = parser_defaults.copy() target_kwargs.update(kwargs) pd_obj = FactoryDispatcher.read_fwf(**target_kwargs) # When `read_fwf` returns a TextFileReader object for iterating through if isinstance(pd_obj, TextFileReader): reader = pd_obj.read pd_obj.read = lambda *args, **kwargs: ModinObjects.DataFrame( query_compiler=reader(*args, **kwargs) ) return pd_obj return ModinObjects.DataFrame(query_compiler=pd_obj) @_inherit_docstrings(pandas.read_sql_table, apilink="pandas.read_sql_table") @enable_logging @wrap_free_function_in_argument_caster("read_sql_table") def read_sql_table( table_name, con, schema=None, index_col=None, coerce_float=True, parse_dates=None, columns=None, chunksize=None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> DataFrame | Iterator[DataFrame]: # noqa: PR01, RT01, D200 """ Read SQL database table into a DataFrame. 
""" _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.read_sql_table(**kwargs) ) @_inherit_docstrings(pandas.read_sql_query, apilink="pandas.read_sql_query") @enable_logging @wrap_free_function_in_argument_caster("read_sql_query") def read_sql_query( sql, con, index_col: str | list[str] | None = None, coerce_float: bool = True, params: list[str] | dict[str, str] | None = None, parse_dates: list[str] | dict[str, str] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> DataFrame | Iterator[DataFrame]: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.read_sql_query(**kwargs) ) @_inherit_docstrings(pandas.to_pickle) @enable_logging @wrap_free_function_in_argument_caster("to_pickle") @expanduser_path_arg("filepath_or_buffer") def to_pickle( obj: Any, filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if isinstance(obj, ModinObjects.DataFrame): obj = obj._query_compiler return FactoryDispatcher.to_pickle( obj, filepath_or_buffer=filepath_or_buffer, compression=compression, protocol=protocol, storage_options=storage_options, ) @_inherit_docstrings(pandas.read_spss, apilink="pandas.read_spss") @enable_logging @wrap_free_function_in_argument_caster("read_spss") @expanduser_path_arg("path") def read_spss( path: Union[str, pathlib.Path], usecols: Optional[Sequence[str]] = None, convert_categoricals: bool = True, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, ) -> DataFrame: 
# noqa: PR01, RT01, D200 """ Load an SPSS file from the file path, returning a DataFrame. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.read_spss( path=path, usecols=usecols, convert_categoricals=convert_categoricals, dtype_backend=dtype_backend, ) ) @_inherit_docstrings(pandas.json_normalize, apilink="pandas.json_normalize") @enable_logging @wrap_free_function_in_argument_caster("json_normalize") def json_normalize( data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, meta_prefix: Optional[str] = None, record_prefix: Optional[str] = None, errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Normalize semi-structured JSON data into a flat table. """ _maybe_warn_on_default("json_normalize") return ModinObjects.DataFrame( pandas.json_normalize( data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level ) ) @_inherit_docstrings(pandas.read_orc, apilink="pandas.read_orc") @enable_logging @wrap_free_function_in_argument_caster("read_orc") @expanduser_path_arg("path") def read_orc( path, columns: Optional[List[str]] = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, filesystem=None, **kwargs, ) -> DataFrame: # noqa: PR01, RT01, D200 """ Load an ORC object from the file path, returning a DataFrame. """ _maybe_warn_on_default("read_orc") return ModinObjects.DataFrame( pandas.read_orc( path, columns=columns, dtype_backend=dtype_backend, filesystem=filesystem, **kwargs, ) ) @_inherit_docstrings(pandas.HDFStore) class HDFStore(ClassLogger, pandas.HDFStore): # noqa: PR01, D200 """ Dict-like IO interface for storing pandas objects in PyTables. 
""" _return_modin_dataframe = True def __getattribute__(self, item): default_behaviors = ["__init__", "__class__"] method = super(HDFStore, self).__getattribute__(item) if item not in default_behaviors: if callable(method): def return_handler(*args, **kwargs): """ Replace the default behavior of methods with inplace kwarg. Returns ------- A Modin DataFrame in place of a pandas DataFrame, or the same return type as pandas.HDFStore. Notes ----- This function will replace all of the arguments passed to methods of HDFStore with the pandas equivalent. It will convert Modin DataFrame to pandas DataFrame, etc. Currently, pytables does not accept Modin DataFrame objects, so we must convert to pandas. """ # We don't want to constantly be giving this error message for # internal methods. if item[0] != "_": _maybe_warn_on_default("`{}`".format(item)) args = [ ( to_pandas(arg) if isinstance(arg, ModinObjects.DataFrame) else arg ) for arg in args ] kwargs = { k: to_pandas(v) if isinstance(v, ModinObjects.DataFrame) else v for k, v in kwargs.items() } obj = super(HDFStore, self).__getattribute__(item)(*args, **kwargs) if self._return_modin_dataframe and isinstance( obj, pandas.DataFrame ): return ModinObjects.DataFrame(obj) return obj # We replace the method with `return_handler` for inplace operations method = return_handler return method @_inherit_docstrings(pandas.ExcelFile) class ExcelFile(ClassLogger, pandas.ExcelFile): # noqa: PR01, D200 """ Class for parsing tabular excel sheets into DataFrame objects. 
""" _behave_like_pandas = False def _set_pandas_mode(self): # noqa # disable Modin behavior to be able to pass object to `pandas.read_excel` # otherwise, Modin objects may be passed to the pandas context, resulting # in undefined behavior self._behave_like_pandas = True def __getattribute__(self, item): if item in ["_set_pandas_mode", "_behave_like_pandas"]: return object.__getattribute__(self, item) default_behaviors = ["__init__", "__class__"] method = super(ExcelFile, self).__getattribute__(item) if not self._behave_like_pandas and item not in default_behaviors: if callable(method): def return_handler(*args, **kwargs): """ Replace the default behavior of methods with inplace kwarg. Returns ------- A Modin DataFrame in place of a pandas DataFrame, or the same return type as pandas.ExcelFile. Notes ----- This function will replace all of the arguments passed to methods of ExcelFile with the pandas equivalent. It will convert Modin DataFrame to pandas DataFrame, etc. """ # We don't want to constantly be giving this error message for # internal methods. if item[0] != "_": _maybe_warn_on_default("`{}`".format(item)) args = [ ( to_pandas(arg) if isinstance(arg, ModinObjects.DataFrame) else arg ) for arg in args ] kwargs = { k: to_pandas(v) if isinstance(v, ModinObjects.DataFrame) else v for k, v in kwargs.items() } obj = super(ExcelFile, self).__getattribute__(item)(*args, **kwargs) if isinstance(obj, pandas.DataFrame): return ModinObjects.DataFrame(obj) return obj # We replace the method with `return_handler` for inplace operations method = return_handler return method @wrap_free_function_in_argument_caster("from_non_pandas") def from_non_pandas(df, index, columns, dtype) -> DataFrame | None: """ Convert a non-pandas DataFrame into Modin DataFrame. Parameters ---------- df : object Non-pandas DataFrame. index : object Index for non-pandas DataFrame. columns : object Columns for non-pandas DataFrame. dtype : type Data type to force. 
Returns ------- modin.pandas.DataFrame Converted DataFrame. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher new_qc = FactoryDispatcher.from_non_pandas(df, index, columns, dtype) if new_qc is not None: return ModinObjects.DataFrame(query_compiler=new_qc) return new_qc @wrap_free_function_in_argument_caster("from_pandas") def from_pandas(df) -> DataFrame: """ Convert a pandas DataFrame to a Modin DataFrame. Parameters ---------- df : pandas.DataFrame The pandas DataFrame to convert. Returns ------- modin.pandas.DataFrame A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) @wrap_free_function_in_argument_caster("from_arrow") def from_arrow(at) -> DataFrame: """ Convert an Arrow Table to a Modin DataFrame. Parameters ---------- at : Arrow Table The Arrow Table to convert from. Returns ------- DataFrame A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_arrow(at)) @wrap_free_function_in_argument_caster("from_dataframe") def from_dataframe(df: ProtocolDataframe) -> DataFrame: """ Convert a DataFrame implementing the dataframe interchange protocol to a Modin DataFrame. See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html. Parameters ---------- df : ProtocolDataframe An object supporting the dataframe interchange protocol. Returns ------- DataFrame A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.from_interchange_dataframe(df) ) @wrap_free_function_in_argument_caster("from_ray") def from_ray(ray_obj) -> DataFrame: """ Convert a Ray Dataset into Modin DataFrame. 
Parameters ---------- ray_obj : ray.data.Dataset The Ray Dataset to convert from. Returns ------- DataFrame A new Modin DataFrame object. Notes ----- Ray Dataset can only be converted to Modin DataFrame if Modin uses a Ray engine. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_ray(ray_obj)) @wrap_free_function_in_argument_caster("from_dask") def from_dask(dask_obj) -> DataFrame: """ Convert a Dask DataFrame to a Modin DataFrame. Parameters ---------- dask_obj : dask.dataframe.DataFrame The Dask DataFrame to convert from. Returns ------- DataFrame A new Modin DataFrame object. Notes ----- Dask DataFrame can only be converted to Modin DataFrame if Modin uses a Dask engine. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dask(dask_obj)) @wrap_free_function_in_argument_caster("from_map") def from_map(func, iterable, *args, **kwargs) -> DataFrame: """ Create a Modin DataFrame from map function applied to an iterable object. This method will construct a Modin DataFrame split by row partitions. The number of row partitions matches the number of elements in the iterable object. Parameters ---------- func : callable Function to map across the iterable object. iterable : Iterable An iterable object. *args : tuple Positional arguments to pass in `func`. **kwargs : dict Keyword arguments to pass in `func`. Returns ------- DataFrame A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return ModinObjects.DataFrame( query_compiler=FactoryDispatcher.from_map(func, iterable, *args, *kwargs) ) @wrap_free_function_in_argument_caster("to_pandas") def to_pandas(modin_obj: SupportsPublicToPandas) -> DataFrame | Series: """ Convert a Modin DataFrame/Series to a pandas DataFrame/Series. 
Parameters ---------- modin_obj : modin.DataFrame, modin.Series The Modin DataFrame/Series to convert. Returns ------- pandas.DataFrame or pandas.Series Converted object with type depending on input. """ return modin_obj._to_pandas() @wrap_free_function_in_argument_caster("to_numpy") def to_numpy( modin_obj: Union[SupportsPrivateToNumPy, SupportsPublicToNumPy], ) -> np.ndarray: """ Convert a Modin object to a NumPy array. Parameters ---------- modin_obj : modin.DataFrame, modin.Series, modin.numpy.array The Modin distributed object to convert. Returns ------- numpy.array Converted object with type depending on input. """ if isinstance(modin_obj, SupportsPrivateToNumPy): return modin_obj._to_numpy() array = modin_obj.to_numpy() if ModinNumpy.get(): array = array._to_numpy() return array @wrap_free_function_in_argument_caster("to_ray") def to_ray(modin_obj): """ Convert a Modin DataFrame/Series to a Ray Dataset. Parameters ---------- modin_obj : modin.pandas.DataFrame, modin.pandas.Series The DataFrame/Series to convert. Returns ------- ray.data.Dataset Converted object with type depending on input. Notes ----- Modin DataFrame/Series can only be converted to a Ray Dataset if Modin uses a Ray engine. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return FactoryDispatcher.to_ray(modin_obj) @wrap_free_function_in_argument_caster("to_dask") def to_dask(modin_obj): """ Convert a Modin DataFrame/Series to a Dask DataFrame/Series. Parameters ---------- modin_obj : modin.pandas.DataFrame, modin.pandas.Series The Modin DataFrame/Series to convert. Returns ------- dask.dataframe.DataFrame or dask.dataframe.Series Converted object with type depending on input. Notes ----- Modin DataFrame/Series can only be converted to a Dask DataFrame/Series if Modin uses a Dask engine. 
""" from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher return FactoryDispatcher.to_dask(modin_obj) __all__ = [ "ExcelFile", "HDFStore", "json_normalize", "read_clipboard", "read_csv", "read_excel", "read_feather", "read_fwf", "read_gbq", "read_hdf", "read_html", "read_json", "read_orc", "read_parquet", "read_pickle", "read_sas", "read_spss", "read_sql", "read_sql_query", "read_sql_table", "read_stata", "read_table", "read_xml", "from_non_pandas", "from_pandas", "from_arrow", "from_dataframe", "to_pickle", "to_pandas", "to_numpy", ] ================================================ FILE: modin/pandas/iterator.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Place to define the Modin iterator.""" from __future__ import annotations from collections.abc import Iterator from typing import TYPE_CHECKING if TYPE_CHECKING: from modin.pandas import DataFrame class PartitionIterator(Iterator): """ Iterator on partitioned data. Parameters ---------- df : modin.pandas.DataFrame The dataframe to iterate over. axis : {0, 1} Axis to iterate over. func : callable The function to get inner iterables from each partition. 
""" df: DataFrame def __init__(self, df: DataFrame, axis, func): self.df = df self.axis = axis self.index_iter = ( zip( iter(slice(None) for _ in range(len(self.df.columns))), range(len(self.df.columns)), ) if axis else zip( range(len(self.df.index)), iter(slice(None) for _ in range(len(self.df.index))), ) ) self.func = func def __iter__(self): """ Implement iterator interface. Returns ------- PartitionIterator Iterator object. """ return self def __next__(self): """ Implement iterator interface. Returns ------- PartitionIterator Incremented iterator object. """ key = next(self.index_iter) df = self.df.iloc[key] return self.func(df) ================================================ FILE: modin/pandas/plotting.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Implement pandas plotting API.""" from pandas import plotting as pdplot from modin.logging import ClassLogger from modin.pandas.io import to_pandas from modin.utils import instancer from .dataframe import DataFrame @instancer class Plotting(ClassLogger): """Wrapper of pandas plotting module.""" def __dir__(self): """ Enable tab completion of plotting library. Returns ------- list List of attributes in `self`. 
""" return dir(pdplot) def __getattribute__(self, item): """ Convert any Modin DataFrames in parameters to pandas so that they can be plotted normally. Parameters ---------- item : str Attribute to look for. Returns ------- object If attribute is found in pandas.plotting, and it is a callable, a wrapper function is returned which converts its arguments to pandas and calls a function pandas.plotting.`item` on these arguments. If attribute is found in pandas.plotting but it is not a callable, returns it. Otherwise function tries to look for an attribute in `self`. """ if hasattr(pdplot, item): func = getattr(pdplot, item) if callable(func): def wrap_func(*args, **kwargs): """Convert Modin DataFrames to pandas then call the function.""" args = tuple( arg if not isinstance(arg, DataFrame) else to_pandas(arg) for arg in args ) kwargs = { kwd: val if not isinstance(val, DataFrame) else to_pandas(val) for kwd, val in kwargs.items() } return func(*args, **kwargs) return wrap_func else: return func else: return object.__getattribute__(self, item) ================================================ FILE: modin/pandas/resample.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Implement Resampler public API.""" from __future__ import annotations from typing import TYPE_CHECKING, Optional, Union import numpy as np import pandas import pandas.core.resample from pandas._libs import lib from pandas.core.dtypes.common import is_list_like from modin.logging import ClassLogger from modin.pandas.utils import cast_function_modin2pandas from modin.utils import _inherit_docstrings if TYPE_CHECKING: from modin.core.storage_formats import BaseQueryCompiler from modin.pandas import DataFrame, Series @_inherit_docstrings(pandas.core.resample.Resampler) class Resampler(ClassLogger): _dataframe: Union[DataFrame, Series] _query_compiler: BaseQueryCompiler def __init__( self, dataframe: Union[DataFrame, Series], rule, axis=0, closed=None, label=None, convention="start", kind=None, on=None, level=None, origin="start_day", offset=None, group_keys=lib.no_default, ): self._dataframe = dataframe self._query_compiler = dataframe._query_compiler self.axis = self._dataframe._get_axis_number(axis) self.resample_kwargs = { "rule": rule, "axis": axis, "closed": closed, "label": label, "convention": convention, "kind": kind, "on": on, "level": level, "origin": origin, "offset": offset, "group_keys": group_keys, } self.__groups = self._get_groups() def _get_groups(self): """ Compute the resampled groups. Returns ------- PandasGroupby Groups as specified by resampling arguments. 
""" df = self._dataframe if self.axis == 0 else self._dataframe.T convention = self.resample_kwargs["convention"] groups = df.groupby( pandas.Grouper( key=self.resample_kwargs["on"], freq=self.resample_kwargs["rule"], closed=self.resample_kwargs["closed"], label=self.resample_kwargs["label"], convention=convention if convention is not lib.no_default else "start", level=self.resample_kwargs["level"], origin=self.resample_kwargs["origin"], offset=self.resample_kwargs["offset"], ), group_keys=self.resample_kwargs["group_keys"], ) return groups def __getitem__(self, key): """ Get ``Resampler`` based on `key` columns of original dataframe. Parameters ---------- key : str or list String or list of selections. Returns ------- modin.pandas.BasePandasDataset New ``Resampler`` based on `key` columns subset of the original dataframe. """ def _get_new_resampler(key): subset = self._dataframe[key] resampler = type(self)(subset, **self.resample_kwargs) return resampler from .series import Series if isinstance( key, (list, tuple, Series, pandas.Series, pandas.Index, np.ndarray) ): if len(self._dataframe.columns.intersection(key)) != len(set(key)): missed_keys = list(set(key).difference(self._dataframe.columns)) raise KeyError(f"Columns not found: {str(sorted(missed_keys))[1:-1]}") return _get_new_resampler(list(key)) if key not in self._dataframe: raise KeyError(f"Column not found: {key}") return _get_new_resampler(key) @property def groups(self): return self._query_compiler.default_to_pandas( lambda df: pandas.DataFrame.resample(df, **self.resample_kwargs).groups ) @property def indices(self): return self._query_compiler.default_to_pandas( lambda df: pandas.DataFrame.resample(df, **self.resample_kwargs).indices ) def get_group(self, name, obj=None): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_get_group( self.resample_kwargs, name, obj ) ) def apply(self, func, *args, **kwargs): func = cast_function_modin2pandas(func) from .dataframe 
def aggregate(self, func, *args, **kwargs):
    # No docstring on purpose: the enclosing class is decorated with
    # `_inherit_docstrings`.
    from .dataframe import DataFrame

    source_is_frame = isinstance(self._dataframe, DataFrame)
    # DataFrame and Series inputs go through different query compiler calls.
    if source_is_frame:
        resample_agg = self._query_compiler.resample_agg_df
    else:
        resample_agg = self._query_compiler.resample_agg_ser

    result = DataFrame(
        query_compiler=resample_agg(self.resample_kwargs, func, *args, **kwargs)
    )

    # List-like `func` or a DataFrame input: the frame is returned unchanged.
    if is_list_like(func) or source_is_frame:
        return result
    # Otherwise reduce the dimensionality of the result: a single row is
    # taken by position, anything else is squeezed.
    if len(result.index) == 1:
        return result.iloc[0]
    return result.squeeze()
method="linear", *, axis=0, limit=None, inplace=False, limit_direction: Optional[str] = None, limit_area=None, downcast=lib.no_default, **kwargs, ): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_interpolate( self.resample_kwargs, method, axis=axis, limit=limit, inplace=inplace, limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, **kwargs, ) ) def count(self): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_count(self.resample_kwargs) ) def nunique(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_nunique( self.resample_kwargs, *args, **kwargs ) ) def first(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_first( self.resample_kwargs, *args, **kwargs, ) ) def last(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_last( self.resample_kwargs, *args, **kwargs, ) ) def max(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_max( self.resample_kwargs, *args, **kwargs, ) ) def mean(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_mean( self.resample_kwargs, *args, **kwargs, ) ) def median(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_median( self.resample_kwargs, *args, **kwargs, ) ) def min(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_min( self.resample_kwargs, *args, **kwargs, ) ) def ohlc(self, *args, **kwargs): from .dataframe import DataFrame if isinstance(self._dataframe, DataFrame): return DataFrame( query_compiler=self._query_compiler.resample_ohlc_df( self.resample_kwargs, *args, **kwargs, ) ) else: return DataFrame( 
def prod(self, min_count=0, *args, **kwargs):
    # Extra positional arguments are forwarded first; `min_count` always
    # travels as a keyword argument.
    compiled = self._query_compiler.resample_prod(
        self.resample_kwargs, *args, min_count=min_count, **kwargs
    )
    return self._dataframe.__constructor__(query_compiler=compiled)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Module houses `Series` class, that is distributed version of `pandas.Series`.""" from __future__ import annotations import os import warnings from typing import IO, TYPE_CHECKING, Any, Hashable, Iterable, Optional, Union import numpy as np import pandas from pandas._libs import lib from pandas._typing import ( ArrayLike, Axis, DtypeObj, IndexKeyFunc, Scalar, Sequence, StorageOptions, ) from pandas.api.types import is_integer from pandas.core.arrays import ExtensionArray from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.series import _coerce_method from pandas.io.formats.info import SeriesInfo from pandas.util._decorators import doc from pandas.util._validators import validate_bool_kwarg from modin.config import PersistentPickle from modin.core.storage_formats.pandas.query_compiler_caster import ( EXTENSION_DICT_TYPE, EXTENSION_NO_LOOKUP, ) from modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, import_optional_dependency, sentinel, ) from .accessor import CachedAccessor, SparseAccessor from .base import _ATTRS_NO_LOOKUP, BasePandasDataset from .iterator import PartitionIterator from .series_utils import ( CategoryMethods, DatetimeProperties, ListAccessor, StringMethods, StructAccessor, ) from .utils import ( GET_BACKEND_DOC, SET_BACKEND_DOC, _doc_binary_op, cast_function_modin2pandas, is_scalar, ) if TYPE_CHECKING: import numpy.typing as npt from 
typing_extensions import Self from modin.core.storage_formats import BaseQueryCompiler from .dataframe import DataFrame @_inherit_docstrings( pandas.Series, excluded=[pandas.Series.__init__], apilink="pandas.Series" ) class Series(BasePandasDataset): """ Modin distributed representation of `pandas.Series`. Internally, the data can be divided into partitions in order to parallelize computations and utilize the user's hardware as much as possible. Inherit common for DataFrames and Series functionality from the `BasePandasDataset` class. Parameters ---------- data : modin.pandas.Series, array-like, Iterable, dict, or scalar value, optional Contains data stored in Series. If data is a dict, argument order is maintained. index : array-like or Index (1d), optional Values must be hashable and have the same length as `data`. dtype : str, np.dtype, or pandas.ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. name : str, optional The name to give to the Series. copy : bool, default: False Copy input data. fastpath : bool, default: False `pandas` internal parameter. query_compiler : BaseQueryCompiler, optional A query compiler object to create the Series from. """ _pandas_class = pandas.Series __array_priority__ = pandas.Series.__array_priority__ _extensions: EXTENSION_DICT_TYPE = EXTENSION_DICT_TYPE(dict) def __init__( self, data=None, index=None, dtype=None, name=None, copy=None, fastpath=lib.no_default, query_compiler: BaseQueryCompiler = None, ) -> None: from modin.numpy import array # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. self._siblings = [] if isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: if any(i not in data.index for i in index): raise NotImplementedError( "Passing non-existent columns or index values to constructor " + "not yet implemented." 
def _set_name(self, name: Hashable) -> None:
    """
    Store `name` as the single column label of the query compiler.

    Parameters
    ----------
    name : hashable
        Name value to set. ``None`` is stored as ``MODIN_UNNAMED_SERIES_LABEL``.
    """
    label = MODIN_UNNAMED_SERIES_LABEL if name is None else name
    if isinstance(label, tuple):
        # A tuple name becomes a one-entry MultiIndex.
        new_columns = pandas.MultiIndex.from_tuples(tuples=[label])
    else:
        new_columns = [label]
    self._query_compiler.columns = new_columns
# `&` is a logical AND, i.e. a conjunction (the `|` counterpart is documented
# as "disjunction"), so the generated docstring must not call it a "union".
@_doc_binary_op(operation="conjunction", bin_op="and", right="other")
def __and__(self, other) -> Series:
    # Plain lists, NumPy arrays and pandas Series are handled by pandas itself.
    if isinstance(other, (list, np.ndarray, pandas.Series)):
        return self._default_to_pandas(pandas.Series.__and__, other)
    new_self, new_other = self._prepare_inter_op(other)
    return super(Series, new_self).__and__(new_other)

@_doc_binary_op(operation="conjunction", bin_op="and", right="other")
def __rand__(self, other) -> Series:
    # Plain lists, NumPy arrays and pandas Series are handled by pandas itself.
    if isinstance(other, (list, np.ndarray, pandas.Series)):
        return self._default_to_pandas(pandas.Series.__rand__, other)
    new_self, new_other = self._prepare_inter_op(other)
    return super(Series, new_self).__rand__(new_other)
def __delitem__(self, key: Hashable) -> None:
    """
    Remove the entry labelled `key` from this Series in place.

    Parameters
    ----------
    key : hashable
        Key to delete.

    Raises
    ------
    KeyError
        If `key` is not among ``self.keys()``.
    """
    if key in self.keys():
        self.drop(labels=key, inplace=True)
    else:
        raise KeyError(key)
@disable_logging
def __getattr__(self, key: Hashable) -> Any:
    """
    Return the item labelled `key` when regular attribute lookup has failed.

    Parameters
    ----------
    key : hashable
        Key to get.

    Returns
    -------
    Any

    Raises
    ------
    AttributeError
        If `key` is excluded from lookup or is not a label of the index.

    Notes
    -----
    Python only calls this method after ``__getattribute__`` raised an
    ``AttributeError``, so it acts as a fallback that resolves `key` against
    the index labels.
    """
    is_index_label = (
        key not in _ATTRS_NO_LOOKUP and key in self._query_compiler.index
    )
    if not is_index_label:
        raise AttributeError(f"'Series' object has no attribute '{key}'")
    return self[key]
def __repr__(self) -> str:
    """
    Build the printable form of this Series.

    Returns
    -------
    str
    """
    max_rows = pandas.get_option("display.max_rows") or 60
    max_cols = pandas.get_option("display.max_columns") or 20
    preview = self._build_repr_df(max_rows, max_cols)
    # A non-empty frame preview is narrowed down to its first column.
    if isinstance(preview, pandas.DataFrame) and not preview.empty:
        preview = preview.iloc[:, 0]
    preview_text = repr(preview)

    # Footer pieces; each one is either empty or ends with ", ".
    freq_part = ""
    if isinstance(self.index, pandas.DatetimeIndex):
        freq_part = "Freq: {}, ".format(self.index.freqstr)
    name_part = "" if self.name is None else "Name: {}, ".format(str(self.name))
    length_part = "Length: {}, ".format(len(self)) if len(self) > max_rows else ""
    if preview.empty:
        dtype_value = str(self.dtype) + ")"
    else:
        # Reuse the dtype text already rendered in the preview.
        dtype_value = preview_text.rsplit("dtype: ", 1)[-1]
    dtype_part = "dtype: {}".format(dtype_value)

    if len(self) == 0:
        return "Series([], {}{}{}".format(freq_part, name_part, dtype_part)

    # For a named categorical preview two trailing lines are dropped instead
    # of one before the footer is appended.
    is_named_categorical = (
        isinstance(preview, pandas.Series)
        and preview.name is not None
        and isinstance(preview.dtype, pandas.CategoricalDtype)
    )
    body = preview_text.rsplit("\n", 2 if is_named_categorical else 1)[0]
    return body + "\n{}{}{}{}".format(
        freq_part, name_part, length_part, dtype_part
    )
@property
def values(self):  # noqa: RT01, D200
    """
    Return the Series data as an ndarray or ndarray-like, depending on the dtype.
    """
    import modin.pandas as pd

    dtype = self.dtype
    is_categorical = isinstance(dtype, pd.CategoricalDtype)
    is_extension = isinstance(dtype, pandas.core.dtypes.dtypes.ExtensionDtype)
    # Extension dtypes other than categorical are delegated to pandas.
    if is_extension and not is_categorical:
        return self._default_to_pandas("values")

    data = self.to_numpy()
    if not is_categorical:
        return data

    from modin.config import ModinNumpy

    # With ModinNumpy enabled the array is converted through `_to_numpy()`
    # before the Categorical is built from it.
    if ModinNumpy.get():
        data = data._to_numpy()
    return pd.Categorical(data, dtype=dtype)
def add_prefix(
    self, prefix, axis=None
) -> Union[DataFrame, Series]:  # noqa: PR01, RT01, D200
    """
    Prepend the string `prefix` to the labels.
    """
    # A missing axis means axis 0; anything else is normalized to its number.
    axis_number = self._get_axis_number(axis) if axis is not None else 0
    prefixed = self._query_compiler.add_prefix(prefix, axis=axis_number)
    return self.__constructor__(query_compiler=prefixed)
""" def error_raiser(msg, exception): """Convert passed exception to the same type as pandas do and raise it.""" # HACK: to concord with pandas error types by replacing all of the # TypeErrors to the AssertionErrors exception = exception if exception is not TypeError else AssertionError raise exception(msg) self._validate_function(func, on_invalid=error_raiser) return super(Series, self).aggregate(func, axis, *args, **kwargs) agg = aggregate def apply( self, func, convert_dtype=lib.no_default, args=(), by_row="compat", **kwargs ) -> Union[DataFrame, Series]: # noqa: PR01, RT01, D200 """ Invoke function on values of Series. """ if by_row != "compat": # TODO: add test return self._default_to_pandas( pandas.Series.apply, func=func, convert_dtype=convert_dtype, args=args, by_row=by_row, **kwargs, ) if convert_dtype is lib.no_default: convert_dtype = True else: warnings.warn( "the convert_dtype parameter is deprecated and will be removed in a " + "future version. Do ``ser.astype(object).apply()`` " + "instead if you want ``convert_dtype=False``.", FutureWarning, ) func = cast_function_modin2pandas(func) self._validate_function(func) # apply and aggregate have slightly different behaviors, so we have to use # each one separately to determine the correct return type. In the case of # `agg`, the axis is set, but it is not required for the computation, so we use # it to determine which function to run. if kwargs.pop("axis", None) is not None: apply_func = "agg" else: apply_func = "apply" # This is the simplest way to determine the return type, but there are checks # in pandas that verify that some results are created. This is a challenge for # empty DataFrames, but fortunately they only happen when the `func` type is # a list or a dictionary, which means that the return type won't change from # type(self), so we catch that error and use `type(self).__name__` for the return # type. # We create a "dummy" `Series` to do the error checking and determining # the return type. 
try: return_type = type( getattr( pandas.Series(self[:1].values, index=self.index[:1]), apply_func )(func, *args, **kwargs) ).__name__ except Exception: return_type = type(self).__name__ if ( isinstance(func, str) or is_list_like(func) or return_type not in ["DataFrame", "Series"] ): # use the explicit non-Compat parent to avoid infinite recursion result = super(Series, self).apply( func, axis=0, raw=False, result_type=None, args=args, **kwargs, ) else: # handle ufuncs and lambdas if kwargs or args and not isinstance(func, np.ufunc): def f(x): return func(x, *args, **kwargs) else: f = func with np.errstate(all="ignore"): if isinstance(f, np.ufunc): return f(self) # The return_type is only a DataFrame when we have a function # return a Series object. This is a very particular case that # has to be handled by the underlying pandas.Series apply # function and not our default map call. if return_type == "DataFrame": result = self._query_compiler.apply_on_series(f) else: result = self.map(f)._query_compiler if return_type == "DataFrame": from .dataframe import DataFrame result = DataFrame(query_compiler=result) elif return_type == "Series": result = self.__constructor__(query_compiler=result) if result.name == self.index[0]: result.name = None elif isinstance(result, type(self._query_compiler)): # sometimes result can be not a query_compiler, but scalar (for example # for sum or count functions) return result.to_pandas().squeeze() return result def transform( self, func, axis=0, *args, **kwargs ) -> Series: # noqa: PR01, RT01, D200 """ Call ``func`` on self producing a `BasePandasDataset` with the same axis shape as self. """ if isinstance(func, list): # drop nonunique functions to align with pandas behavior instead of getting # "pandas.errors.SpecificationError: Function names must be unique..." 
# Example: # >>> pandas.Series([0., 1., 4.]).transform(["sqrt", "sqrt"]) # sqrt # 0 0.0 # 1 1.0 # 2 2.0 unique_func = [func[0]] for one_func in func[1:]: if one_func not in unique_func: unique_func.append(one_func) func = unique_func return super(Series, self).transform(func, axis, *args, **kwargs) def argmax( self, axis=None, skipna=True, *args, **kwargs ) -> int: # noqa: PR01, RT01, D200 """ Return int position of the largest value in the Series. """ result = self.reset_index(drop=True).idxmax( axis=axis, skipna=skipna, *args, **kwargs ) if np.isnan(result) or result is pandas.NA: result = -1 return result def argmin( self, axis=None, skipna=True, *args, **kwargs ) -> int: # noqa: PR01, RT01, D200 """ Return int position of the smallest value in the Series. """ result = self.reset_index(drop=True).idxmin( axis=axis, skipna=skipna, *args, **kwargs ) if np.isnan(result) or result is pandas.NA: result = -1 return result def argsort( self, axis=0, kind="quicksort", order=None, stable=None ) -> Series: # noqa: PR01, RT01, D200 """ Return the integer indices that would sort the Series values. """ return self.__constructor__( query_compiler=self._query_compiler.argsort( # 'stable' parameter has no effect in Pandas and is only accepted # for compatibility with NumPy, so we're not passing it forward on purpose axis=axis, kind=kind, order=order, ) ) def autocorr(self, lag=1) -> float: # noqa: PR01, RT01, D200 """ Compute the lag-N autocorrelation. """ return self.corr(self.shift(lag)) def between( self, left, right, inclusive: str = "both" ) -> Series: # noqa: PR01, RT01, D200 """ Return boolean Series equivalent to left <= series <= right. """ # 'pandas.Series.between()' only uses public Series' API, # so passing a Modin Series there is safe return pandas.Series.between(self, left, right, inclusive) def combine(self, other, func, fill_value=None) -> Series: # noqa: PR01, RT01, D200 """ Combine the Series with a Series or scalar according to `func`. 
""" return super(Series, self).combine( other, lambda s1, s2: s1.combine(s2, func, fill_value=fill_value) ) def compare( self, other: Series, align_axis: Union[str, int] = 1, keep_shape: bool = False, keep_equal: bool = False, result_names: tuple = ("self", "other"), ) -> Union[DataFrame, Series]: # noqa: PR01, RT01, D200 """ Compare to another Series and show the differences. """ if not isinstance(other, Series): raise TypeError(f"Cannot compare Series to {type(other)}") result = self.to_frame().compare( other.to_frame(), align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, result_names=result_names, ) if align_axis == "columns" or align_axis == 1: # Pandas.DataFrame.Compare returns a dataframe with a multidimensional index object as the # columns so we have to change column object back. result.columns = pandas.Index(["self", "other"]) else: result = result.squeeze().rename(None) return result def corr( self, other, method="pearson", min_periods=None ) -> float: # noqa: PR01, RT01, D200 """ Compute correlation with `other` Series, excluding missing values. 
""" if method == "pearson": this, other = self.align(other, join="inner", copy=False) this = self.__constructor__(this) other = self.__constructor__(other) if len(this) == 0: return np.nan if len(this) != len(other): raise ValueError("Operands must have same size") if min_periods is None: min_periods = 1 valid = this.notna() & other.notna() if not valid.all(): this = this[valid] other = other[valid] if len(this) < min_periods: return np.nan this = this.astype(dtype="float64") other = other.astype(dtype="float64") this -= this.mean() other -= other.mean() other = other.__constructor__(query_compiler=other._query_compiler.conj()) result = this * other / (len(this) - 1) result = np.array([result.sum()]) stddev_this = ((this * this) / (len(this) - 1)).sum() stddev_other = ((other * other) / (len(other) - 1)).sum() stddev_this = np.array([np.sqrt(stddev_this)]) stddev_other = np.array([np.sqrt(stddev_other)]) result /= stddev_this * stddev_other np.clip(result.real, -1, 1, out=result.real) if np.iscomplexobj(result): np.clip(result.imag, -1, 1, out=result.imag) return result[0] return self._query_compiler.series_corr( other=other, method=method, min_periods=min_periods ) def count(self) -> int: # noqa: PR01, RT01, D200 """ Return number of non-NA/null observations in the Series. """ return super(Series, self).count() def cov( self, other, min_periods=None, ddof: Optional[int] = 1 ) -> float: # noqa: PR01, RT01, D200 """ Compute covariance with Series, excluding missing values. 
""" this, other = self.align(other, join="inner", copy=False) this = self.__constructor__(this) other = self.__constructor__(other) if len(this) == 0: return np.nan if len(this) != len(other): raise ValueError("Operands must have same size") if min_periods is None: min_periods = 1 valid = this.notna() & other.notna() if not valid.all(): this = this[valid] other = other[valid] if len(this) < min_periods: return np.nan this = this.astype(dtype="float64") other = other.astype(dtype="float64") this -= this.mean() other -= other.mean() other = other.__constructor__(query_compiler=other._query_compiler.conj()) result = this * other / (len(this) - ddof) result = result.sum() return result def describe( self, percentiles=None, include=None, exclude=None, ) -> Union[DataFrame, Series]: # noqa: PR01, RT01, D200 """ Generate descriptive statistics. """ # Pandas ignores the `include` and `exclude` for Series for some reason. return super(Series, self).describe( percentiles=percentiles, include=None, exclude=None, ) def diff(self, periods=1) -> Series: # noqa: PR01, RT01, D200 """ First discrete difference of element. """ return super(Series, self).diff(periods=periods, axis=0) def divmod( self, other, level=None, fill_value=None, axis=0 ) -> tuple[Series, Series]: # noqa: PR01, RT01, D200 """ Return Integer division and modulo of series and `other`, element-wise (binary operator `divmod`). """ division, modulo = self._query_compiler.divmod( other=other, level=level, fill_value=fill_value, axis=axis ) return self.__constructor__(query_compiler=division), self.__constructor__( query_compiler=modulo ) def dot(self, other) -> Union[Series, np.ndarray]: # noqa: PR01, RT01, D200 """ Compute the dot product between the Series and the columns of `other`. 
""" if isinstance(other, BasePandasDataset): common = self.index.union(other.index) if len(common) > len(self) or len(common) > len(other): raise ValueError("Matrices are not aligned") qc = other.reindex(index=common)._query_compiler if isinstance(other, Series): return self._reduce_dimension( query_compiler=self._query_compiler.dot( qc, squeeze_self=True, squeeze_other=True ) ) else: return self.__constructor__( query_compiler=self._query_compiler.dot( qc, squeeze_self=True, squeeze_other=False ) ) other = np.asarray(other) if self.shape[0] != other.shape[0]: raise ValueError( "Dot product shape mismatch, {} vs {}".format(self.shape, other.shape) ) if len(other.shape) > 1: return ( self._query_compiler.dot(other, squeeze_self=True).to_numpy().squeeze() ) return self._reduce_dimension( query_compiler=self._query_compiler.dot(other, squeeze_self=True) ) def drop_duplicates( self, *, keep="first", inplace=False, ignore_index=False ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Return Series with duplicate values removed. """ return super(Series, self).drop_duplicates( keep=keep, inplace=inplace, ignore_index=ignore_index ) def dropna( self, *, axis=0, inplace=False, how=None, ignore_index=False ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Return a new Series with missing values removed. """ return super(Series, self).dropna( axis=axis, inplace=inplace, ignore_index=ignore_index ) def duplicated(self, keep="first") -> Series: # noqa: PR01, RT01, D200 """ Indicate duplicate Series values. """ name = self.name result = self.to_frame().duplicated(keep=keep) # DataFrame.duplicated drops the name, so we need to manually restore it if name is not None: result.name = name return result def eq( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return Equal to of series and `other`, element-wise (binary operator `eq`). 
""" new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "eq", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def equals(self, other) -> bool: # noqa: PR01, RT01, D200 """ Test whether two objects contain the same elements. """ if isinstance(other, pandas.Series): # Copy into a Modin Series to simplify logic below other = self.__constructor__(other) if type(self) is not type(other) or not self.index.equals(other.index): return False old_name_self = self.name old_name_other = other.name try: self.name = "temp_name_for_equals_op" other.name = "temp_name_for_equals_op" # this function should return only scalar res = self.__constructor__( query_compiler=self._query_compiler.equals(other._query_compiler) ) finally: self.name = old_name_self other.name = old_name_other return res.all() def explode(self, ignore_index: bool = False) -> Series: # noqa: PR01, RT01, D200 """ Transform each element of a list-like to a row. """ return super(Series, self).explode( MODIN_UNNAMED_SERIES_LABEL if self.name is None else self.name, ignore_index=ignore_index, ) def factorize(self, sort=False, use_na_sentinel=True): # noqa: PR01, RT01, D200 """ Encode the object as an enumerated type or categorical variable. """ return self._default_to_pandas( pandas.Series.factorize, sort=sort, use_na_sentinel=use_na_sentinel, ) def case_when(self, caselist) -> Series: # noqa: PR01, RT01, D200 """ Replace values where the conditions are True. """ modin_type = type(self) caselist = [ tuple( data._query_compiler if isinstance(data, modin_type) else data for data in case_tuple ) for case_tuple in caselist ] return self.__constructor__( query_compiler=self._query_compiler.case_when(caselist=caselist) ) def fillna( self, value=None, *, method=None, axis=None, inplace=False, limit=None, downcast=lib.no_default, ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Fill NaNs inside of a Series object. 
""" if isinstance(value, BasePandasDataset) and not isinstance(value, Series): raise TypeError( '"value" parameter must be a scalar, dict or Series, but ' + f'you passed a "{type(value).__name__}"' ) return super(Series, self).fillna( squeeze_self=True, squeeze_value=isinstance(value, Series), value=value, method=method, axis=axis, inplace=inplace, limit=limit, downcast=downcast, ) def floordiv( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Get Integer division of series and `other`, element-wise (binary operator `floordiv`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).floordiv( new_other, level=level, fill_value=fill_value, axis=axis ) def ge( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return greater than or equal to of series and `other`, element-wise (binary operator `ge`). """ new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "ge", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def groupby( self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, observed=lib.no_default, dropna: bool = True, ): # noqa: PR01, RT01, D200 """ Group Series using a mapper or by a Series of columns. 
""" from .groupby import SeriesGroupBy if not as_index: raise TypeError("as_index=False only valid with DataFrame") # SeriesGroupBy expects a query compiler object if it is available if isinstance(by, Series): by = by._query_compiler elif callable(by): by = by(self.index) elif by is None and level is None: raise TypeError("You have to supply one of 'by' and 'level'") return SeriesGroupBy( self, by, axis, level, as_index, sort, group_keys, idx_name=None, observed=observed, drop=False, dropna=dropna, backend_pinned=self.is_backend_pinned(), ) def gt( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return greater than of series and `other`, element-wise (binary operator `gt`). """ new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "gt", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def hist( self, by=None, ax=None, grid: bool = True, xlabelsize: int | None = None, xrot: float | None = None, ylabelsize: int | None = None, yrot: float | None = None, figsize: tuple[int, int] | None = None, bins: int | Sequence[int] = 10, backend: str | None = None, legend: bool = False, **kwargs, ): # noqa: PR01, RT01, D200 """ Draw histogram of the input series using matplotlib. """ return self._default_to_pandas( pandas.Series.hist, by=by, ax=ax, grid=grid, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, figsize=figsize, bins=bins, backend=backend, legend=legend, **kwargs, ) def idxmax( self, axis=0, skipna=True, *args, **kwargs ) -> Hashable: # noqa: PR01, RT01, D200 """ Return the row label of the maximum value. """ return super(Series, self).idxmax(axis=axis, skipna=skipna, *args, **kwargs) def idxmin( self, axis=0, skipna=True, *args, **kwargs ) -> Hashable: # noqa: PR01, RT01, D200 """ Return the row label of the minimum value. 
""" return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs) def info( self, verbose: bool | None = None, buf: IO[str] | None = None, max_cols: int | None = None, memory_usage: bool | str | None = None, show_counts: bool = True, ) -> None: return SeriesInfo(self, memory_usage).render( buf=buf, max_cols=max_cols, verbose=verbose, show_counts=show_counts, ) def isna(self) -> Series: """ Detect missing values. Returns ------- The result of detecting missing values. """ return super(Series, self).isna() def isnull(self) -> Series: """ Detect missing values. Returns ------- The result of detecting missing values. """ return super(Series, self).isnull() def item(self) -> Scalar: # noqa: RT01, D200 """ Return the first element of the underlying data as a Python scalar. """ return self[0] def items(self) -> Iterable[tuple[Hashable, Any]]: # noqa: D200 """ Lazily iterate over (index, value) tuples. """ def item_builder(s): return s.name, s.squeeze() partition_iterator = PartitionIterator(self.to_frame(), 0, item_builder) for v in partition_iterator: yield v def keys(self) -> pandas.Index: # noqa: RT01, D200 """ Return alias for index. """ return self.index def le( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return less than or equal to of series and `other`, element-wise (binary operator `le`). """ new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "le", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def lt( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return less than of series and `other`, element-wise (binary operator `lt`). 
""" new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "lt", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def map(self, arg, na_action=None) -> Series: # noqa: PR01, RT01, D200 """ Map values of Series according to input correspondence. """ if isinstance(arg, type(self)): # HACK: if we don't cast to pandas, then the execution engine will try to # propagate the distributed Series to workers and most likely would have # some performance problems. # TODO: A better way of doing so could be passing this `arg` as a query compiler # and broadcast accordingly. arg = arg._to_pandas() if not callable(arg) and hasattr(arg, "get"): mapper = arg def arg(s): return mapper.get(s, np.nan) return self.__constructor__( query_compiler=self._query_compiler.map( lambda s: ( arg(s) if pandas.isnull(s) is not True or na_action is None else s ) ) ) def sem( self, axis: Optional[Axis] = None, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Union[float, Series]: # noqa: PR01, RT01, D200 """ Return unbiased standard error of the mean over requested axis. """ return super(Series, self)._stat_operation( "sem", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def std( self, axis: Optional[Axis] = None, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Union[float, Series]: # noqa: PR01, RT01, D200 """ Return sample standard deviation over requested axis. """ return super(Series, self)._stat_operation( "std", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def var( self, axis: Optional[Axis] = None, skipna: bool = True, ddof: int = 1, numeric_only=False, **kwargs, ) -> Union[float, Series]: # noqa: PR01, RT01, D200 """ Return unbiased variance over requested axis. 
""" return super(Series, self)._stat_operation( "var", axis, skipna, numeric_only, ddof=ddof, **kwargs ) def memory_usage(self, index=True, deep=False) -> int: # noqa: PR01, RT01, D200 """ Return the memory usage of the Series. """ return super(Series, self).memory_usage(index=index, deep=deep).sum() def mod( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return Modulo of series and `other`, element-wise (binary operator `mod`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).mod( new_other, level=level, fill_value=fill_value, axis=axis ) def mode(self, dropna=True) -> Series: # noqa: PR01, RT01, D200 """ Return the mode(s) of the Series. """ return super(Series, self).mode(numeric_only=False, dropna=dropna) def mul( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return multiplication of series and `other`, element-wise (binary operator `mul`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).mul( new_other, level=level, fill_value=fill_value, axis=axis ) multiply = mul def rmul( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return multiplication of series and `other`, element-wise (binary operator `mul`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rmul( new_other, level=level, fill_value=fill_value, axis=axis ) def ne( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return not equal to of series and `other`, element-wise (binary operator `ne`). """ new_self, new_other = self._prepare_inter_op(other) return new_self._binary_op( "ne", new_other, level=level, fill_value=fill_value, axis=axis, squeeze_other=isinstance(other, Series), ) def nlargest(self, n=5, keep="first") -> Series: # noqa: PR01, RT01, D200 """ Return the largest `n` elements. 
""" if len(self._query_compiler.columns) == 0: # pandas returns empty series when requested largest/smallest from empty series return self.__constructor__(data=[], dtype=float) return Series( query_compiler=self._query_compiler.nlargest( n=n, columns=self.name, keep=keep ) ) def nsmallest(self, n=5, keep="first") -> Series: # noqa: PR01, RT01, D200 """ Return the smallest `n` elements. """ if len(self._query_compiler.columns) == 0: # pandas returns empty series when requested largest/smallest from empty series return self.__constructor__(data=[], dtype=float) return self.__constructor__( query_compiler=self._query_compiler.nsmallest( n=n, columns=self.name, keep=keep ) ) def shift( self, periods=1, freq=None, axis=0, fill_value=lib.no_default, suffix=None, ) -> Series: # noqa: PR01, RT01, D200 """ Shift index by desired number of periods with an optional time `freq`. """ # pandas 2.1.0 ignores suffix parameter (https://github.com/pandas-dev/pandas/issues/54806) if freq is not None and fill_value is not lib.no_default: raise ValueError( "Cannot pass both 'freq' and 'fill_value' to " + f"{type(self).__name__}.shift" ) if axis == 1: raise ValueError( f"No axis named {axis} for object type {type(self).__name__}" ) return super(type(self), self).shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) def unstack( self, level=-1, fill_value=None, sort=True ) -> DataFrame: # noqa: PR01, RT01, D200 """ Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. """ from .dataframe import DataFrame if not sort: # TODO: it should be easy to add support for sort == False return self._default_to_pandas( pandas.Series.unstack, level=level, fill_value=fill_value, sort=sort ) # We can't unstack a Series object, if we don't have a MultiIndex. 
if len(self.index.names) > 1: result = DataFrame( query_compiler=self._query_compiler.unstack(level, fill_value) ) else: raise ValueError( f"index must be a MultiIndex to unstack, {type(self.index)} was passed" ) return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result @property def plot( self, kind="line", ax=None, figsize=None, use_index=True, title=None, grid=None, legend=False, style=None, logx=False, logy=False, loglog=False, xticks=None, yticks=None, xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, table=False, yerr=None, xerr=None, label=None, secondary_y=False, **kwds, ): # noqa: PR01, RT01, D200 """ Make plot of Series. """ return self._to_pandas().plot def pow( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return exponential power of series and `other`, element-wise (binary operator `pow`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).pow( new_other, level=level, fill_value=fill_value, axis=axis ) @_inherit_docstrings(pandas.Series.prod, apilink="pandas.Series.prod") def prod( self, axis=None, skipna=True, numeric_only=False, min_count=0, **kwargs, ) -> Scalar: validate_bool_kwarg(skipna, "skipna", none_allowed=False) axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True) if min_count > 1: return data._reduce_dimension( data._query_compiler.prod_min_count( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) return data._reduce_dimension( data._query_compiler.prod( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) product = prod def ravel(self, order="C") -> ArrayLike: # noqa: PR01, RT01, D200 """ Return the flattened underlying data as an ndarray. 
""" data = self._query_compiler.to_numpy().flatten(order=order) if isinstance(self.dtype, pandas.CategoricalDtype): data = pandas.Categorical(data, dtype=self.dtype) return data @_inherit_docstrings(pandas.Series.reindex, apilink="pandas.Series.reindex") def reindex( self, index=None, *, axis: Axis = None, method: str = None, copy: Optional[bool] = None, level=None, fill_value=None, limit: int = None, tolerance=None, ) -> Series: # noqa: PR01, RT01, D200 if fill_value is None: fill_value = np.nan return super(Series, self).reindex( index=index, columns=None, method=method, level=level, copy=copy, limit=limit, tolerance=tolerance, fill_value=fill_value, ) def rename_axis( self, mapper=lib.no_default, *, index=lib.no_default, axis=0, copy=True, inplace=False, ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Set the name of the axis for the index or columns. """ return super().rename_axis( mapper=mapper, index=index, axis=axis, copy=copy, inplace=inplace ) def _set_axis_name(self, name, axis=0, inplace=False) -> Union[Series, None]: """ Alter the name of the axis. Parameters ---------- name : str Name for the Series. axis : str or int, default: 0 The axis to set the label. Only 0 is valid for Series. inplace : bool, default: False Whether to modify `self` directly or return a copy. Returns ------- Series or None """ self._get_axis_number(axis) # raises ValueError if not 0 renamed = self if inplace else self.copy() renamed.index = renamed.index.set_names(name) return None if inplace else renamed def rename( self, index=None, *, axis=None, copy=None, inplace=False, level=None, errors="ignore", ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Alter Series index labels or name. 
""" non_mapping = is_scalar(index) or ( is_list_like(index) and not is_dict_like(index) ) if non_mapping: if inplace: self.name = index else: self_cp = self.copy() self_cp.name = index return self_cp else: from .dataframe import DataFrame result = DataFrame(self.copy()).rename(index=index).squeeze(axis=1) result.name = self.name return result def repeat(self, repeats, axis=None) -> Series: # noqa: PR01, RT01, D200 """ Repeat elements of a Series. """ if (isinstance(repeats, int) and repeats == 0) or ( is_list_like(repeats) and len(repeats) == 1 and repeats[0] == 0 ): return self.__constructor__() return self.__constructor__(query_compiler=self._query_compiler.repeat(repeats)) def reset_index( self, level=None, *, drop=False, name=lib.no_default, inplace=False, allow_duplicates=False, ) -> Union[DataFrame, Series, None]: # noqa: PR01, RT01, D200 """ Generate a new Series with the index reset. """ if name is lib.no_default: # For backwards compatibility, keep columns as [0] instead of # [None] when self.name is None name = 0 if self.name is None else self.name if drop and level is None: new_idx = pandas.RangeIndex(len(self)) if inplace: self.index = new_idx else: result = self.copy() result.index = new_idx return result elif not drop and inplace: raise TypeError( "Cannot reset_index inplace on a Series to create a DataFrame" ) else: obj = self.copy() obj.name = name from .dataframe import DataFrame # Here `query_compiler` is passed instead of `obj` to avoid unnecessary `copy()` # inside `DataFrame` constructor return DataFrame(query_compiler=obj._query_compiler).reset_index( level=level, drop=drop, inplace=inplace, col_level=0, col_fill="", allow_duplicates=allow_duplicates, names=None, ) def rdivmod( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return integer division and modulo of series and `other`, element-wise (binary operator `rdivmod`). 
""" division, modulo = self._query_compiler.rdivmod( other=other, level=level, fill_value=fill_value, axis=axis ) return self.__constructor__(query_compiler=division), self.__constructor__( query_compiler=modulo ) def rfloordiv( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return integer division of series and `other`, element-wise (binary operator `rfloordiv`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rfloordiv( new_other, level=level, fill_value=fill_value, axis=axis ) def rmod( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return modulo of series and `other`, element-wise (binary operator `rmod`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rmod( new_other, level=level, fill_value=fill_value, axis=axis ) def rpow( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return exponential power of series and `other`, element-wise (binary operator `rpow`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rpow( new_other, level=level, fill_value=fill_value, axis=axis ) def rsub( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return subtraction of series and `other`, element-wise (binary operator `rsub`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rsub( new_other, level=level, fill_value=fill_value, axis=axis ) def rtruediv( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return floating division of series and `other`, element-wise (binary operator `rtruediv`). 
""" new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).rtruediv( new_other, level=level, fill_value=fill_value, axis=axis ) rdiv = rtruediv def quantile( self, q=0.5, interpolation="linear" ) -> Union[float, Series]: # noqa: PR01, RT01, D200 """ Return value at the given quantile. """ return super(Series, self).quantile( q=q, axis=0, numeric_only=False, interpolation=interpolation, method="single", ) def reorder_levels(self, order) -> Series: # noqa: PR01, RT01, D200 """ Rearrange index levels using input order. """ return super(Series, self).reorder_levels(order) def replace( self, to_replace=None, value=lib.no_default, *, inplace=False, limit=None, regex=False, method: str | lib.NoDefault = lib.no_default, ) -> Series: # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. """ inplace = validate_bool_kwarg(inplace, "inplace") new_query_compiler = self._query_compiler.replace( to_replace=to_replace, value=value, inplace=False, limit=limit, regex=regex, method=method, ) return self._create_or_update_from_compiler(new_query_compiler, inplace) def searchsorted( self, value, side="left", sorter=None ) -> Union[npt.NDArray[np.intp], np.intp]: # noqa: PR01, RT01, D200 """ Find indices where elements should be inserted to maintain order. 
""" searchsorted_qc = self._query_compiler if sorter is not None: # `iloc` method works slowly (https://github.com/modin-project/modin/issues/1903), # so _default_to_pandas is used for now # searchsorted_qc = self.iloc[sorter].reset_index(drop=True)._query_compiler # sorter = None return self._default_to_pandas( pandas.Series.searchsorted, value, side=side, sorter=sorter ) # searchsorted should return item number irrespective of Series index, so # Series.index is always set to pandas.RangeIndex, which can be easily processed # on the query_compiler level if not isinstance(searchsorted_qc.index, pandas.RangeIndex): searchsorted_qc = searchsorted_qc.reset_index(drop=True) result = self.__constructor__( query_compiler=searchsorted_qc.searchsorted( value=value, side=side, sorter=sorter ) ).squeeze() # matching Pandas output if not is_scalar(value) and not is_list_like(result): result = np.array([result]) elif isinstance(result, type(self)): result = result.to_numpy() return result def sort_values( self, *, axis=0, ascending=True, inplace=False, kind="quicksort", na_position="last", ignore_index: bool = False, key: Optional[IndexKeyFunc] = None, ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Sort by the values. """ from .dataframe import DataFrame # When we convert to a DataFrame, the name is automatically converted to 0 if it # is None, so we do this to avoid a KeyError. 
by = self.name if self.name is not None else 0 result = ( DataFrame(self.copy()) .sort_values( by=by, ascending=ascending, inplace=False, kind=kind, na_position=na_position, ignore_index=ignore_index, key=key, ) .squeeze(axis=1) ) result.name = self.name return self._create_or_update_from_compiler( result._query_compiler, inplace=inplace ) cat = CachedAccessor("cat", CategoryMethods) sparse = CachedAccessor("sparse", SparseAccessor) str = CachedAccessor("str", StringMethods) dt = CachedAccessor("dt", DatetimeProperties) list = CachedAccessor("list", ListAccessor) struct = CachedAccessor("struct", StructAccessor) def squeeze(self, axis=None) -> Union[Series, Scalar]: # noqa: PR01, RT01, D200 """ Squeeze 1 dimensional axis objects into scalars. """ if axis is not None: # Validate `axis` pandas.Series._get_axis_number(axis) if len(self) == 1: return self._reduce_dimension(self._query_compiler) else: return self.copy() def sub( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return subtraction of Series and `other`, element-wise (binary operator `sub`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).sub( new_other, level=level, fill_value=fill_value, axis=axis ) subtract = sub def sum( self, axis=None, skipna=True, numeric_only=False, min_count=0, **kwargs, ) -> Scalar: # noqa: PR01, RT01, D200 """ Return the sum of the values. 
""" validate_bool_kwarg(skipna, "skipna", none_allowed=False) axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=False) if min_count > 1: return data._reduce_dimension( data._query_compiler.sum_min_count( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) return data._reduce_dimension( data._query_compiler.sum( axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, ) ) def swaplevel(self, i=-2, j=-1, copy=None) -> Series: # noqa: PR01, RT01, D200 """ Swap levels `i` and `j` in a `MultiIndex`. """ copy = True if copy is None else copy obj = self.copy() if copy else self return super(Series, obj).swaplevel(i, j, axis=0) def take(self, indices, axis=0, **kwargs) -> Series: # noqa: PR01, RT01, D200 """ Return the elements in the given positional indices along an axis. """ return super(Series, self).take(indices, axis=axis, **kwargs) def to_dict(self, into=dict) -> dict: # pragma: no cover # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. """ return self._query_compiler.series_to_dict(into) def to_frame( self, name: Hashable = lib.no_default ) -> DataFrame: # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. 
""" from .dataframe import DataFrame if name is None: name = lib.no_default self_cp = self.copy() if name is not lib.no_default: self_cp.name = name return DataFrame(self_cp) def to_json( self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit="ms", default_handler=None, lines=False, compression="infer", index=None, indent=None, storage_options: StorageOptions = None, mode="w", ) -> str | None: from modin.core.execution.dispatching.factories.dispatcher import ( FactoryDispatcher, ) return FactoryDispatcher.to_json_series( self._query_compiler, path_or_buf, orient=orient, date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, default_handler=default_handler, lines=lines, compression=compression, index=index, indent=indent, storage_options=storage_options, mode=mode, ) def to_list(self) -> list: # noqa: RT01, D200 """ Return a list of the values. """ return self._query_compiler.to_list() def to_numpy( self, dtype=None, copy=False, na_value=lib.no_default, **kwargs ) -> np.ndarray: # noqa: PR01, RT01, D200 """ Return the NumPy ndarray representing the values in this Series or Index. """ from modin.config import ModinNumpy if not ModinNumpy.get(): return ( super(Series, self) .to_numpy( dtype=dtype, copy=copy, na_value=na_value, ) .flatten() ) else: from ..numpy.arr import array return array(self, copy=copy) tolist = to_list # TODO(williamma12): When we implement to_timestamp, have this call the version # in base.py def to_period(self, freq=None, copy=None) -> Series: # noqa: PR01, RT01, D200 """ Cast to PeriodArray/Index at a particular frequency. """ return self._default_to_pandas("to_period", freq=freq, copy=copy) def to_string( self, buf=None, na_rep="NaN", float_format=None, header=True, index=True, length=False, dtype=False, name=False, max_rows=None, min_rows=None, ) -> Union[str, None]: # noqa: PR01, RT01, D200 """ Render a string representation of the Series. 
""" return self._default_to_pandas( pandas.Series.to_string, buf=buf, na_rep=na_rep, float_format=float_format, header=header, index=index, length=length, dtype=dtype, name=name, max_rows=max_rows, ) # TODO(williamma12): When we implement to_timestamp, have this call the version # in base.py def to_timestamp( self, freq=None, how="start", copy=None ) -> Series: # noqa: PR01, RT01, D200 """ Cast to DatetimeIndex of Timestamps, at beginning of period. """ return self._default_to_pandas("to_timestamp", freq=freq, how=how, copy=copy) def transpose(self, *args, **kwargs) -> Series: # noqa: PR01, RT01, D200 """ Return the transpose, which is by definition `self`. """ return self # To enable dynamic backend switching, we must use a `def` so the lookup of `self.transpose` # is performed dynamically, whereas declaring `T = property(transpose)` makes it always use # the originally-defined version without the switching wrapper. @property def T(self) -> Series: return self.transpose() def truediv( self, other, level=None, fill_value=None, axis=0 ) -> Series: # noqa: PR01, RT01, D200 """ Return floating division of series and `other`, element-wise (binary operator `truediv`). """ new_self, new_other = self._prepare_inter_op(other) return super(Series, new_self).truediv( new_other, level=level, fill_value=fill_value, axis=axis ) div = divide = truediv def unique(self) -> ArrayLike: # noqa: RT01, D200 """ Return unique values of Series object. """ # `values` can't be used here because it performs unnecessary conversion, # after which the result type does not match the pandas return ( self.__constructor__(query_compiler=self._query_compiler.unique()) .modin.to_pandas() ._values ) def update(self, other) -> None: # noqa: PR01, D200 """ Modify Series in place using values from passed Series. 
""" if not isinstance(other, Series): other = self.__constructor__(other) query_compiler = self._query_compiler.series_update(other._query_compiler) self._update_inplace(new_query_compiler=query_compiler) def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ) -> Series: # noqa: PR01, RT01, D200 """ Return a Series containing counts of unique values. """ if bins is not None: # Potentially we could implement `cut` function from pandas API, which # bins values into intervals, and then we can just count them as regular values. # TODO #1333: new_self = self.__constructor__(pd.cut(self, bins, include_lowest=True), dtype="interval") return self._default_to_pandas( pandas.Series.value_counts, normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna, ) counted_values = super(Series, self).value_counts( subset=self, normalize=normalize, sort=sort, ascending=ascending, dropna=dropna, ) return counted_values def view(self, dtype=None) -> Series: # noqa: PR01, RT01, D200 """ Create a new view of the Series. """ return self.__constructor__( query_compiler=self._query_compiler.series_view(dtype=dtype) ) def where( self, cond, other=np.nan, *, inplace=False, axis=None, level=None, ) -> Union[Series, None]: # noqa: PR01, RT01, D200 """ Replace values where the condition is False. """ # TODO: probably need to remove this conversion to pandas if isinstance(other, Series): other = to_pandas(other) # TODO: add error checking like for dataframe where, then forward to # same query compiler method return self._default_to_pandas( pandas.Series.where, cond, other=other, inplace=inplace, axis=axis, level=level, ) @property def attrs(self) -> dict: # noqa: RT01, D200 """ Return dictionary of global attributes of this dataset. """ def attrs(df): return df.attrs return self._default_to_pandas(attrs) @property def array(self) -> ExtensionArray: # noqa: RT01, D200 """ Return the ExtensionArray of the data backing this Series or Index. 
""" def array(df): return df.array return self._default_to_pandas(array) @property def axes(self) -> list[pandas.Index]: # noqa: RT01, D200 """ Return a list of the row axis labels. """ return [self.index] @property def dtype(self) -> DtypeObj: # noqa: RT01, D200 """ Return the dtype object of the underlying data. """ return self._query_compiler.dtypes.squeeze() dtypes = dtype @property def empty(self) -> bool: # noqa: RT01, D200 """ Indicate whether Series is empty. """ return len(self) == 0 @property def hasnans(self) -> bool: # noqa: RT01, D200 """ Return True if Series has any nans. """ return self.isna().sum() > 0 @property def is_monotonic_increasing(self) -> bool: # noqa: RT01, D200 """ Return True if values in the Series are monotonic_increasing. """ return self._reduce_dimension(self._query_compiler.is_monotonic_increasing()) @property def is_monotonic_decreasing(self) -> bool: # noqa: RT01, D200 """ Return True if values in the Series are monotonic_decreasing. """ return self._reduce_dimension(self._query_compiler.is_monotonic_decreasing()) @property def is_unique(self) -> bool: # noqa: RT01, D200 """ Return True if values in the Series are unique. """ return self.nunique(dropna=False) == len(self) @property def nbytes(self) -> int: # noqa: RT01, D200 """ Return the number of bytes in the underlying data. """ return self.memory_usage(index=False) @property def ndim(self) -> int: # noqa: RT01, D200 """ Return the number of dimensions of the underlying data, by definition 1. """ return 1 def nunique(self, dropna=True) -> int: # noqa: PR01, RT01, D200 """ Return number of unique elements in the object. """ return super(Series, self).nunique(dropna=dropna) @property def shape(self) -> tuple[int]: # noqa: RT01, D200 """ Return a tuple of the shape of the underlying data. 
""" return (len(self),) def reindex_like( self, other, method=None, copy: Optional[bool] = None, limit=None, tolerance=None, ) -> Series: # docs say "Same as calling .reindex(index=other.index, columns=other.columns,...).": # https://pandas.pydata.org/pandas-docs/version/1.4/reference/api/pandas.Series.reindex_like.html return self.reindex( index=other.index, method=method, copy=copy, limit=limit, tolerance=tolerance, ) def _to_pandas(self) -> pandas.Series: """ Convert Modin Series to pandas Series. Recommended conversion method: `series.modin.to_pandas()`. Returns ------- pandas.Series """ df = self._query_compiler.to_pandas() series = df[df.columns[0]] if self._query_compiler.columns[0] == MODIN_UNNAMED_SERIES_LABEL: series.name = None return series def _to_datetime(self, **kwargs) -> Series: """ Convert `self` to datetime. Parameters ---------- **kwargs : dict Optional arguments to use during query compiler's `to_datetime` invocation. Returns ------- datetime Series of datetime64 dtype. """ return self.__constructor__( query_compiler=self._query_compiler.to_datetime(**kwargs) ) def _to_numeric(self, **kwargs) -> Series: """ Convert `self` to numeric. Parameters ---------- **kwargs : dict Optional arguments to use during query compiler's `to_numeric` invocation. Returns ------- numeric Series of numeric dtype. """ return self.__constructor__( query_compiler=self._query_compiler.to_numeric(**kwargs) ) def _qcut(self, q, **kwargs): # noqa: PR01, RT01, D200 """ Quantile-based discretization function. """ return self._default_to_pandas(pandas.qcut, q, **kwargs) def _reduce_dimension(self, query_compiler) -> Series | Scalar: """ Try to reduce the dimension of data from the `query_compiler`. Parameters ---------- query_compiler : BaseQueryCompiler Query compiler to retrieve the data. Returns ------- pandas.Series or scalar. 
""" return query_compiler.to_pandas().squeeze() def _validate_dtypes_prod_mean( self, axis, numeric_only, ignore_axis=False ) -> Series: """ Validate data dtype for `prod` and `mean` methods. Parameters ---------- axis : {0, 1} Axis to validate over. numeric_only : bool Whether or not to allow only numeric data. If True and non-numeric data is found, exception will be raised. ignore_axis : bool, default: False Whether or not to ignore `axis` parameter. Returns ------- Series Notes ----- Actually returns unmodified `self` object, added for compatibility with Modin DataFrame. """ return self def _validate_dtypes_min_max(self, axis, numeric_only) -> Series: """ Validate data dtype for `min` and `max` methods. Parameters ---------- axis : {0, 1} Axis to validate over. numeric_only : bool Whether or not to allow only numeric data. If True and non-numeric data is found, exception. Returns ------- Series Notes ----- Actually returns unmodified `self` object, added for compatibility with Modin DataFrame. """ return self def _validate_dtypes(self, numeric_only=False) -> None: """ Check that all the dtypes are the same. Parameters ---------- numeric_only : bool, default: False Whether or not to allow only numeric data. If True and non-numeric data is found, exception will be raised. Notes ----- Actually does nothing, added for compatibility with Modin DataFrame. """ pass def _get_numeric_data(self, axis: int) -> Series: """ Grab only numeric data from Series. Parameters ---------- axis : {0, 1} Axis to inspect on having numeric types only. Returns ------- Series Notes ----- `numeric_only` parameter is not supported by Series, so this method does not do anything. The method is added for compatibility with Modin DataFrame. """ return self def _update_inplace(self, new_query_compiler) -> None: """ Update the current Series in-place using `new_query_compiler`. Parameters ---------- new_query_compiler : BaseQueryCompiler QueryCompiler to use to manage the data. 
""" super(Series, self)._update_inplace(new_query_compiler=new_query_compiler) # Propagate changes back to parent so that column in dataframe had the same contents if self._parent is not None: if self._parent_axis == 0: self._parent.loc[self.name] = self else: self._parent[self.name] = self def _create_or_update_from_compiler( self, new_query_compiler, inplace=False ) -> Union[Series, None]: """ Return or update a Series with given `new_query_compiler`. Parameters ---------- new_query_compiler : PandasQueryCompiler QueryCompiler to use to manage the data. inplace : bool, default: False Whether or not to perform update or creation inplace. Returns ------- Series or None None if update was done, Series otherwise. """ assert ( isinstance(new_query_compiler, type(self._query_compiler)) or type(new_query_compiler) in self._query_compiler.__class__.__bases__ ), "Invalid Query Compiler object: {}".format(type(new_query_compiler)) if not inplace and new_query_compiler.is_series_like(): return self.__constructor__(query_compiler=new_query_compiler) elif not inplace: # This can happen with things like `reset_index` where we can add columns. from .dataframe import DataFrame return DataFrame(query_compiler=new_query_compiler) else: self._update_inplace(new_query_compiler=new_query_compiler) def _prepare_inter_op(self, other) -> tuple[Series, Series]: """ Prepare `self` and `other` for further interaction. Parameters ---------- other : Series or scalar value Another object `self` should interact with. Returns ------- Series Prepared `self`. Series Prepared `other`. """ if isinstance(other, Series): names_different = self.name != other.name # NB: if we don't need a rename, do the interaction with shallow # copies so that we preserve obj.index._id. It's fine to work # with shallow copies because we'll discard the copies but keep # the result after the interaction opreation. We can't do a rename # on shallow copies because we'll mutate the original objects. 
new_self = self.copy(deep=names_different) new_other = other.copy(deep=names_different) if names_different: new_self.name = new_other.name = MODIN_UNNAMED_SERIES_LABEL else: new_self = self new_other = other return new_self, new_other def _getitem(self, key) -> Union[Series, Scalar]: """ Get the data specified by `key` for this Series. Parameters ---------- key : Any Column id to retrieve from Series. Returns ------- Series or scalar Retrieved data. """ key = apply_if_callable(key, self) if isinstance(key, Series) and key.dtype == np.bool_: # This ends up being significantly faster than looping through and getting # each item individually. key = key._to_pandas() if is_bool_indexer(key): return self.__constructor__( query_compiler=self._query_compiler.getitem_row_array( pandas.RangeIndex(len(self))[key] ) ) # TODO: More efficiently handle `tuple` case for `Series.__getitem__` if isinstance(key, tuple): return self._default_to_pandas(pandas.Series.__getitem__, key) if not is_list_like(key): reduce_dimension = True key = [key] else: reduce_dimension = False # The check for whether or not `key` is in `keys()` will throw a TypeError # if the object is not hashable. When that happens, we just assume the # key is a list-like of row positions. try: is_indexer = all(k in self.keys() for k in key) except TypeError: is_indexer = False row_positions = self.index.get_indexer_for(key) if is_indexer else key if not all(is_integer(x) for x in row_positions): raise KeyError(key[0] if reduce_dimension else key) result = self._query_compiler.getitem_row_array(row_positions) if reduce_dimension: return self._reduce_dimension(result) return self.__constructor__(query_compiler=result) def _repartition(self) -> Series: """ Repartitioning Series to get ideal partitions inside. Allows to improve performance where the query compiler can't improve yet by doing implicit repartitioning. Returns ------- Series The repartitioned Series. 
""" return super()._repartition(axis=0) # Persistance support methods - BEGIN @classmethod def _inflate_light(cls, query_compiler, name, source_pid) -> Series: """ Re-creates the object from previously-serialized lightweight representation. The method is used for faster but not disk-storable persistence. Parameters ---------- query_compiler : BaseQueryCompiler Query compiler to use for object re-creation. name : str The name to give to the new object. source_pid : int Determines whether a Modin or pandas object needs to be created. Modin objects are created only on the main process. Returns ------- Series New Series based on the `query_compiler`. """ if os.getpid() != source_pid: res = query_compiler.to_pandas() # at the query compiler layer, `to_pandas` always returns a DataFrame, # even if it stores a Series, as a single-column DataFrame if res.columns == [MODIN_UNNAMED_SERIES_LABEL]: res = res.squeeze(axis=1) res.name = None return res # The current logic does not involve creating Modin objects # and manipulation with them in worker processes return cls(query_compiler=query_compiler, name=name) @classmethod def _inflate_full(cls, pandas_series, source_pid) -> Series: """ Re-creates the object from previously-serialized disk-storable representation. Parameters ---------- pandas_series : pandas.Series Data to use for object re-creation. source_pid : int Determines whether a Modin or pandas object needs to be created. Modin objects are created only on the main process. Returns ------- Series New Series based on the `pandas_series`. 
""" if os.getpid() != source_pid: return pandas_series # The current logic does not involve creating Modin objects # and manipulation with them in worker processes return cls(data=pandas_series) def __reduce__(self): self._query_compiler.finalize() pid = os.getpid() if ( PersistentPickle.get() or not self._query_compiler.support_materialization_in_worker_process() ): return self._inflate_full, (self._to_pandas(), pid) return self._inflate_light, (self._query_compiler, self.name, pid) # Persistance support methods - END @doc(SET_BACKEND_DOC, class_name=__qualname__) def set_backend( self, backend: str, inplace: bool = False, *, switch_operation: Optional[str] = None, ) -> Optional[Self]: # A series which is moved, potentially without its parent needs to # have it's parent reset. This is aligned with CoW chained assigment # semantics as well, but it is a little different from existing modin # semantics. This is why we only do this for hybrid and inplace # modification. if ( inplace and self._parent is not None and backend != self._parent.get_backend() ): self._parent = None return super().set_backend( backend=backend, inplace=inplace, switch_operation=switch_operation ) move_to = set_backend @doc(GET_BACKEND_DOC, class_name=__qualname__) @disable_logging def get_backend(self) -> str: return super().get_backend() @disable_logging @_inherit_docstrings(BasePandasDataset._copy_into) def _copy_into(self, other: Series): other._query_compiler = self._query_compiler other._siblings = self._siblings return None ================================================ FILE: modin/pandas/series_utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Implement Series's accessors public API as pandas does. Accessors: `Series.cat`, `Series.str`, `Series.dt` """ from __future__ import annotations import re from functools import cached_property from typing import TYPE_CHECKING import numpy as np import pandas from pandas._libs import lib from modin.logging import ClassLogger from modin.utils import _inherit_docstrings if TYPE_CHECKING: from datetime import tzinfo from pandas._typing import npt from modin.core.storage_formats import BaseQueryCompiler from modin.pandas import Series @_inherit_docstrings(pandas.core.arrays.arrow.ListAccessor) class ListAccessor(ClassLogger): _series: Series _query_compiler: BaseQueryCompiler def __init__(self, data: Series = None): self._series = data self._query_compiler = data._query_compiler @cached_property def _Series(self) -> Series: # noqa: GL08 # to avoid cyclic import from .series import Series return Series def flatten(self): return self._Series(query_compiler=self._query_compiler.list_flatten()) def len(self): return self._Series(query_compiler=self._query_compiler.list_len()) def __getitem__(self, key): return self._Series( query_compiler=self._query_compiler.list__getitem__(key=key) ) @_inherit_docstrings(pandas.core.arrays.arrow.StructAccessor) class StructAccessor(ClassLogger): _series: Series _query_compiler: BaseQueryCompiler def __init__(self, data: Series = None): self._series = data 
self._query_compiler = data._query_compiler @cached_property def _Series(self) -> Series: # noqa: GL08 # to avoid cyclic import from modin.pandas.series import Series return Series @property def dtypes(self): return self._Series(query_compiler=self._query_compiler.struct_dtypes()) def field(self, name_or_index): return self._Series( query_compiler=self._query_compiler.struct_field( name_or_index=name_or_index ) ) def explode(self): from modin.pandas.dataframe import DataFrame return DataFrame(query_compiler=self._query_compiler.struct_explode()) @_inherit_docstrings(pandas.core.arrays.categorical.CategoricalAccessor) class CategoryMethods(ClassLogger): _series: Series _query_compiler: BaseQueryCompiler def __init__(self, data: Series): self._series = data self._query_compiler = data._query_compiler @cached_property def _Series(self) -> Series: # noqa: GL08 # to avoid cyclic import from modin.pandas.series import Series return Series @property def categories(self): return self._series.dtype.categories @categories.setter def categories(self, categories): def set_categories(series, categories): series.cat.categories = categories self._series._default_to_pandas(set_categories, categories=categories) @property def ordered(self): return self._series.dtype.ordered @property def codes(self): return self._Series(query_compiler=self._query_compiler.cat_codes()) def rename_categories(self, new_categories): return self._default_to_pandas( pandas.Series.cat.rename_categories, new_categories ) def reorder_categories(self, new_categories, ordered=None): return self._default_to_pandas( pandas.Series.cat.reorder_categories, new_categories, ordered=ordered, ) def add_categories(self, new_categories): return self._default_to_pandas(pandas.Series.cat.add_categories, new_categories) def remove_categories(self, removals): return self._default_to_pandas(pandas.Series.cat.remove_categories, removals) def remove_unused_categories(self): return 
self._default_to_pandas(pandas.Series.cat.remove_unused_categories) def set_categories(self, new_categories, ordered=None, rename=False): return self._default_to_pandas( pandas.Series.cat.set_categories, new_categories, ordered=ordered, rename=rename, ) def as_ordered(self): return self._default_to_pandas(pandas.Series.cat.as_ordered) def as_unordered(self): return self._default_to_pandas(pandas.Series.cat.as_unordered) def _default_to_pandas(self, op, *args, **kwargs): """ Convert `self` to pandas type and call a pandas cat.`op` on it. Parameters ---------- op : str Name of pandas function. *args : list Additional positional arguments to be passed in `op`. **kwargs : dict Additional keywords arguments to be passed in `op`. Returns ------- object Result of operation. """ return self._series._default_to_pandas( lambda series: op(series.cat, *args, **kwargs) ) @_inherit_docstrings(pandas.core.strings.accessor.StringMethods) class StringMethods(ClassLogger): _series: Series _query_compiler: BaseQueryCompiler def __init__(self, data: Series): # Check if dtypes is objects self._series = data self._query_compiler = data._query_compiler @cached_property def _Series(self) -> Series: # noqa: GL08 # to avoid cyclic import from .series import Series return Series def casefold(self): return self._Series(query_compiler=self._query_compiler.str_casefold()) def cat(self, others=None, sep=None, na_rep=None, join="left"): if isinstance(others, self._Series): others = others._to_pandas() compiler_result = self._query_compiler.str_cat( others=others, sep=sep, na_rep=na_rep, join=join ) # if others is None, result is a string. otherwise, it's a series. 
return ( compiler_result.to_pandas().squeeze() if others is None else self._Series(query_compiler=compiler_result) ) def decode(self, encoding, errors="strict", dtype=None): return self._Series( query_compiler=self._query_compiler.str_decode(encoding, errors, dtype) ) def split(self, pat=None, *, n=-1, expand=False, regex=None): if expand: from .dataframe import DataFrame return DataFrame( query_compiler=self._query_compiler.str_split( pat=pat, n=n, expand=True, regex=regex ) ) else: return self._Series( query_compiler=self._query_compiler.str_split( pat=pat, n=n, expand=expand, regex=regex ) ) def rsplit(self, pat=None, *, n=-1, expand=False): if not pat and pat is not None: raise ValueError("rsplit() requires a non-empty pattern match.") if expand: from .dataframe import DataFrame return DataFrame( query_compiler=self._query_compiler.str_rsplit( pat=pat, n=n, expand=True ) ) else: return self._Series( query_compiler=self._query_compiler.str_rsplit( pat=pat, n=n, expand=expand ) ) def get(self, i): return self._Series(query_compiler=self._query_compiler.str_get(i)) def join(self, sep): if sep is None: raise AttributeError("'NoneType' object has no attribute 'join'") return self._Series(query_compiler=self._query_compiler.str_join(sep)) def get_dummies(self, sep="|"): return self._Series(query_compiler=self._query_compiler.str_get_dummies(sep)) def contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): if pat is None and not case: raise AttributeError("'NoneType' object has no attribute 'upper'") if na is lib.no_default: na = None return self._Series( query_compiler=self._query_compiler.str_contains( pat, case=case, flags=flags, na=na, regex=regex ) ) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") return self._Series( query_compiler=self._query_compiler.str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) ) def 
pad(self, width, side="left", fillchar=" "): if len(fillchar) != 1: raise TypeError("fillchar must be a character, not str") return self._Series( query_compiler=self._query_compiler.str_pad( width, side=side, fillchar=fillchar ) ) def center(self, width, fillchar=" "): if len(fillchar) != 1: raise TypeError("fillchar must be a character, not str") return self._Series( query_compiler=self._query_compiler.str_center(width, fillchar=fillchar) ) def ljust(self, width, fillchar=" "): if len(fillchar) != 1: raise TypeError("fillchar must be a character, not str") return self._Series( query_compiler=self._query_compiler.str_ljust(width, fillchar=fillchar) ) def rjust(self, width, fillchar=" "): if len(fillchar) != 1: raise TypeError("fillchar must be a character, not str") return self._Series( query_compiler=self._query_compiler.str_rjust(width, fillchar=fillchar) ) def zfill(self, width): return self._Series(query_compiler=self._query_compiler.str_zfill(width)) def wrap(self, width, **kwargs): if width <= 0: raise ValueError("invalid width {} (must be > 0)".format(width)) return self._Series( query_compiler=self._query_compiler.str_wrap(width, **kwargs) ) def slice(self, start=None, stop=None, step=None): if step == 0: raise ValueError("slice step cannot be zero") return self._Series( query_compiler=self._query_compiler.str_slice( start=start, stop=stop, step=step ) ) def slice_replace(self, start=None, stop=None, repl=None): return self._Series( query_compiler=self._query_compiler.str_slice_replace( start=start, stop=stop, repl=repl ) ) def count(self, pat, flags=0): if not isinstance(pat, (str, re.Pattern)): raise TypeError("first argument must be string or compiled pattern") return self._Series( query_compiler=self._query_compiler.str_count(pat, flags=flags) ) def startswith(self, pat, na=lib.no_default): if na is lib.no_default: na = None return self._Series( query_compiler=self._query_compiler.str_startswith(pat, na=na) ) def encode(self, encoding, errors="strict"): 
return self._Series( query_compiler=self._query_compiler.str_encode(encoding, errors) ) def endswith(self, pat, na=lib.no_default): if na is lib.no_default: na = None return self._Series( query_compiler=self._query_compiler.str_endswith(pat, na=na) ) def findall(self, pat, flags=0): if not isinstance(pat, (str, re.Pattern)): raise TypeError("first argument must be string or compiled pattern") return self._Series( query_compiler=self._query_compiler.str_findall(pat, flags=flags) ) def fullmatch(self, pat, case=True, flags=0, na=lib.no_default): if not isinstance(pat, (str, re.Pattern)): raise TypeError("first argument must be string or compiled pattern") if na is lib.no_default: na = None return self._Series( query_compiler=self._query_compiler.str_fullmatch( pat, case=case, flags=flags, na=na ) ) def match(self, pat, case=True, flags=0, na=lib.no_default): if not isinstance(pat, (str, re.Pattern)): raise TypeError("first argument must be string or compiled pattern") if na is lib.no_default: na = None return self._Series( query_compiler=self._query_compiler.str_match( pat, case=case, flags=flags, na=na ) ) def extract(self, pat, flags=0, expand=True): query_compiler = self._query_compiler.str_extract( pat, flags=flags, expand=expand ) from .dataframe import DataFrame return ( DataFrame(query_compiler=query_compiler) if expand or re.compile(pat).groups > 1 else self._Series(query_compiler=query_compiler) ) def extractall(self, pat, flags=0): return self._Series( query_compiler=self._query_compiler.str_extractall(pat, flags) ) def len(self): return self._Series(query_compiler=self._query_compiler.str_len()) def strip(self, to_strip=None): return self._Series( query_compiler=self._query_compiler.str_strip(to_strip=to_strip) ) def rstrip(self, to_strip=None): return self._Series( query_compiler=self._query_compiler.str_rstrip(to_strip=to_strip) ) def lstrip(self, to_strip=None): return self._Series( query_compiler=self._query_compiler.str_lstrip(to_strip=to_strip) ) def 
partition(self, sep=" ", expand=True): if sep is not None and len(sep) == 0: raise ValueError("empty separator") from .dataframe import DataFrame return (DataFrame if expand else self._Series)( query_compiler=self._query_compiler.str_partition(sep=sep, expand=expand) ) def removeprefix(self, prefix): return self._Series( query_compiler=self._query_compiler.str_removeprefix(prefix) ) def removesuffix(self, suffix): return self._Series( query_compiler=self._query_compiler.str_removesuffix(suffix) ) def repeat(self, repeats): return self._Series(query_compiler=self._query_compiler.str_repeat(repeats)) def rpartition(self, sep=" ", expand=True): if sep is not None and len(sep) == 0: raise ValueError("empty separator") from .dataframe import DataFrame return (DataFrame if expand else self._Series)( query_compiler=self._query_compiler.str_rpartition(sep=sep, expand=expand) ) def lower(self): return self._Series(query_compiler=self._query_compiler.str_lower()) def upper(self): return self._Series(query_compiler=self._query_compiler.str_upper()) def title(self): return self._Series(query_compiler=self._query_compiler.str_title()) def find(self, sub, start=0, end=None): if not isinstance(sub, str): raise TypeError( "expected a string object, not {0}".format(type(sub).__name__) ) return self._Series( query_compiler=self._query_compiler.str_find(sub, start=start, end=end) ) def rfind(self, sub, start=0, end=None): if not isinstance(sub, str): raise TypeError( "expected a string object, not {0}".format(type(sub).__name__) ) return self._Series( query_compiler=self._query_compiler.str_rfind(sub, start=start, end=end) ) def index(self, sub, start=0, end=None): if not isinstance(sub, str): raise TypeError( "expected a string object, not {0}".format(type(sub).__name__) ) return self._Series( query_compiler=self._query_compiler.str_index(sub, start=start, end=end) ) def rindex(self, sub, start=0, end=None): if not isinstance(sub, str): raise TypeError( "expected a string object, 
not {0}".format(type(sub).__name__) ) return self._Series( query_compiler=self._query_compiler.str_rindex(sub, start=start, end=end) ) def capitalize(self): return self._Series(query_compiler=self._query_compiler.str_capitalize()) def swapcase(self): return self._Series(query_compiler=self._query_compiler.str_swapcase()) def normalize(self, form): return self._Series(query_compiler=self._query_compiler.str_normalize(form)) def translate(self, table): return self._Series(query_compiler=self._query_compiler.str_translate(table)) def isalnum(self): return self._Series(query_compiler=self._query_compiler.str_isalnum()) def isalpha(self): return self._Series(query_compiler=self._query_compiler.str_isalpha()) def isdigit(self): return self._Series(query_compiler=self._query_compiler.str_isdigit()) def isspace(self): return self._Series(query_compiler=self._query_compiler.str_isspace()) def islower(self): return self._Series(query_compiler=self._query_compiler.str_islower()) def isupper(self): return self._Series(query_compiler=self._query_compiler.str_isupper()) def istitle(self): return self._Series(query_compiler=self._query_compiler.str_istitle()) def isnumeric(self): return self._Series(query_compiler=self._query_compiler.str_isnumeric()) def isdecimal(self): return self._Series(query_compiler=self._query_compiler.str_isdecimal()) def __getitem__(self, key): # noqa: GL08 return self._Series(query_compiler=self._query_compiler.str___getitem__(key)) def _default_to_pandas(self, op, *args, **kwargs): """ Convert `self` to pandas type and call a pandas str.`op` on it. Parameters ---------- op : str Name of pandas function. *args : list Additional positional arguments to be passed in `op`. **kwargs : dict Additional keywords arguments to be passed in `op`. Returns ------- object Result of operation. 
""" return self._series._default_to_pandas( lambda series: op(series.str, *args, **kwargs) ) @_inherit_docstrings(pandas.core.indexes.accessors.CombinedDatetimelikeProperties) class DatetimeProperties(ClassLogger): # noqa: GL08 _series: Series _query_compiler: BaseQueryCompiler def __init__(self, data: Series): self._series = data self._query_compiler = data._query_compiler @cached_property def _Series(self) -> Series: # noqa: GL08 # to avoid cyclic import from .series import Series return Series @property def date(self): return self._Series(query_compiler=self._query_compiler.dt_date()) @property def time(self): return self._Series(query_compiler=self._query_compiler.dt_time()) @property def timetz(self): return self._Series(query_compiler=self._query_compiler.dt_timetz()) @property def year(self): return self._Series(query_compiler=self._query_compiler.dt_year()) @property def month(self): return self._Series(query_compiler=self._query_compiler.dt_month()) @property def day(self): return self._Series(query_compiler=self._query_compiler.dt_day()) @property def hour(self): return self._Series(query_compiler=self._query_compiler.dt_hour()) @property def minute(self): return self._Series(query_compiler=self._query_compiler.dt_minute()) @property def second(self): return self._Series(query_compiler=self._query_compiler.dt_second()) @property def microsecond(self): return self._Series(query_compiler=self._query_compiler.dt_microsecond()) @property def nanosecond(self): return self._Series(query_compiler=self._query_compiler.dt_nanosecond()) @property def dayofweek(self): return self._Series(query_compiler=self._query_compiler.dt_dayofweek()) day_of_week = dayofweek @property def weekday(self): return self._Series(query_compiler=self._query_compiler.dt_weekday()) @property def dayofyear(self): return self._Series(query_compiler=self._query_compiler.dt_dayofyear()) day_of_year = dayofyear @property def quarter(self): return 
self._Series(query_compiler=self._query_compiler.dt_quarter()) @property def is_month_start(self): return self._Series(query_compiler=self._query_compiler.dt_is_month_start()) @property def is_month_end(self): return self._Series(query_compiler=self._query_compiler.dt_is_month_end()) @property def is_quarter_start(self): return self._Series(query_compiler=self._query_compiler.dt_is_quarter_start()) @property def is_quarter_end(self): return self._Series(query_compiler=self._query_compiler.dt_is_quarter_end()) @property def is_year_start(self): return self._Series(query_compiler=self._query_compiler.dt_is_year_start()) @property def is_year_end(self): return self._Series(query_compiler=self._query_compiler.dt_is_year_end()) @property def is_leap_year(self): return self._Series(query_compiler=self._query_compiler.dt_is_leap_year()) @property def daysinmonth(self): return self._Series(query_compiler=self._query_compiler.dt_daysinmonth()) @property def days_in_month(self): return self._Series(query_compiler=self._query_compiler.dt_days_in_month()) @property def tz(self) -> "tzinfo | None": dtype = self._series.dtype if isinstance(dtype, np.dtype): return None return dtype.tz @property def freq(self): # noqa: GL08 return self._query_compiler.dt_freq().to_pandas().squeeze() @property def unit(self): # noqa: GL08 # use `iloc[0]` to return scalar return self._Series(query_compiler=self._query_compiler.dt_unit()).iloc[0] def as_unit(self, *args, **kwargs): # noqa: GL08 return self._Series( query_compiler=self._query_compiler.dt_as_unit(*args, **kwargs) ) def to_period(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_to_period(*args, **kwargs) ) def asfreq(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_asfreq(*args, **kwargs) ) def to_pydatetime(self): return self._Series( query_compiler=self._query_compiler.dt_to_pydatetime() ).to_numpy() def tz_localize(self, *args, **kwargs): return self._Series( 
query_compiler=self._query_compiler.dt_tz_localize(*args, **kwargs) ) def tz_convert(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_tz_convert(*args, **kwargs) ) def normalize(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_normalize(*args, **kwargs) ) def strftime(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_strftime(*args, **kwargs) ) def round(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_round(*args, **kwargs) ) def floor(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_floor(*args, **kwargs) ) def ceil(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_ceil(*args, **kwargs) ) def month_name(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_month_name(*args, **kwargs) ) def day_name(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_day_name(*args, **kwargs) ) def total_seconds(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_total_seconds(*args, **kwargs) ) def to_pytimedelta(self) -> "npt.NDArray[np.object_]": res = self._query_compiler.dt_to_pytimedelta() return res.to_numpy()[:, 0] @property def seconds(self): return self._Series(query_compiler=self._query_compiler.dt_seconds()) @property def days(self): return self._Series(query_compiler=self._query_compiler.dt_days()) @property def microseconds(self): return self._Series(query_compiler=self._query_compiler.dt_microseconds()) @property def nanoseconds(self): return self._Series(query_compiler=self._query_compiler.dt_nanoseconds()) @property def components(self): from .dataframe import DataFrame return DataFrame(query_compiler=self._query_compiler.dt_components()) def isocalendar(self): from .dataframe import DataFrame return DataFrame(query_compiler=self._query_compiler.dt_isocalendar()) 
@property def qyear(self): # noqa: GL08 return self._Series(query_compiler=self._query_compiler.dt_qyear()) @property def start_time(self): return self._Series(query_compiler=self._query_compiler.dt_start_time()) @property def end_time(self): return self._Series(query_compiler=self._query_compiler.dt_end_time()) def to_timestamp(self, *args, **kwargs): return self._Series( query_compiler=self._query_compiler.dt_to_timestamp(*args, **kwargs) ) ================================================ FILE: modin/pandas/testing/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Public testing utility functions. 
""" from __future__ import annotations from typing import Literal from pandas._libs import lib from pandas.testing import assert_extension_array_equal from pandas.testing import assert_frame_equal as pd_assert_frame_equal from pandas.testing import assert_index_equal from pandas.testing import assert_series_equal as pd_assert_series_equal from modin.utils import _inherit_docstrings, try_cast_to_pandas @_inherit_docstrings(pd_assert_frame_equal, apilink="pandas.testing.assert_frame_equal") def assert_frame_equal( left, right, check_dtype: bool | Literal["equiv"] = True, check_index_type: bool | Literal["equiv"] = "equiv", check_column_type: bool | Literal["equiv"] = "equiv", check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, rtol: float | lib.NoDefault = lib.no_default, atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: left = try_cast_to_pandas(left) right = try_cast_to_pandas(right) pd_assert_frame_equal( left, right, check_dtype=check_dtype, check_index_type=check_index_type, check_column_type=check_column_type, check_frame_type=check_frame_type, check_names=check_names, by_blocks=by_blocks, check_exact=check_exact, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, check_like=check_like, check_freq=check_freq, check_flags=check_flags, rtol=rtol, atol=atol, obj=obj, ) @_inherit_docstrings( pd_assert_series_equal, apilink="pandas.testing.assert_series_equal" ) def assert_series_equal( left, right, check_dtype: bool | Literal["equiv"] = True, check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = 
True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, rtol: float | lib.NoDefault = lib.no_default, atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, check_like: bool = False, ) -> None: left = try_cast_to_pandas(left) right = try_cast_to_pandas(right) pd_assert_series_equal( left, right, check_dtype=check_dtype, check_index_type=check_index_type, check_series_type=check_series_type, check_names=check_names, check_exact=check_exact, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, check_category_order=check_category_order, check_freq=check_freq, check_flags=check_flags, rtol=rtol, atol=atol, obj=obj, check_index=check_index, check_like=check_like, ) __all__ = [ "assert_extension_array_equal", "assert_frame_equal", "assert_series_equal", "assert_index_equal", ] ================================================ FILE: modin/pandas/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Implement utils for pandas component.""" from __future__ import annotations from typing import Any, Iterator, Optional, Tuple import numpy as np import pandas from pandas._typing import AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel from pandas.util._decorators import doc from modin.utils import hashable _doc_binary_operation = """ Return {operation} of {left} and `{right}` (binary operator `{bin_op}`). Parameters ---------- {right} : {right_type} The second operand to perform computation. Returns ------- {returns} """ SET_DATAFRAME_ATTRIBUTE_WARNING = ( "Modin doesn't allow columns to be created via a new attribute name - see " + "https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access" ) GET_BACKEND_DOC = """ Get the backend for this ``{class_name}``. Returns ------- str The name of the backend. """ SET_BACKEND_DOC = """ Move the data in this ``{class_name}`` from its current backend to the given one. Further operations on this ``{class_name}`` will use the new backend instead of the current one. Parameters ---------- backend : str The name of the backend to set. inplace : bool, default: False Whether to modify this ``{class_name}`` in place. switch_operation : Optional[str], default: None The name of the operation that triggered the set_backend call. Internal argument used for displaying progress bar information. Returns ------- ``{class_name}`` or None If ``inplace`` is False, returns a new instance of the ``{class_name}`` with the given backend. If ``inplace`` is ``True``, returns None. Notes ----- This method will attempt to use the starting and new backend's move_from or move_to methods if the backends implement them. Otherwise, it will 1) convert the data in this ``{class_name}`` to a pandas DataFrame in this Python process 2) load the data from pandas to the new backend. 
Either step may be slow and/or memory-intensive, especially if this ``{class_name}``'s data is large, or one or both of the backends do not store their data locally. """ def cast_function_modin2pandas(func): """ Replace Modin functions with pandas functions if `func` is callable. Parameters ---------- func : object Returns ------- object """ if callable(func) and (module := getattr(func, "__module__", None)) is not None: if module == "modin.pandas.series": func = getattr(pandas.Series, func.__name__) elif module in ("modin.pandas.dataframe", "modin.pandas.base"): # FIXME: when the method is defined in `modin.pandas.base` file, then the # type cannot be determined, in general there may be an error, but at the # moment it is better. func = getattr(pandas.DataFrame, func.__name__) return func def is_scalar(obj): """ Return True if given object is scalar. This method works the same as is_scalar method from pandas but it is optimized for Modin frames. For BasePandasDataset objects pandas version of is_scalar tries to access missing attribute causing index scan. This triggers execution for lazy frames and we avoid it by handling BasePandasDataset objects separately. Parameters ---------- obj : object Object to check. Returns ------- bool True if given object is scalar and False otherwise. """ from pandas.api.types import is_scalar as pandas_is_scalar from .base import BasePandasDataset return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj) def get_pandas_backend(dtypes: pandas.Series) -> str | None: """ Determine the backend based on the `dtypes`. Parameters ---------- dtypes : pandas.Series DataFrame dtypes. Returns ------- str | None Backend name. """ backend = None if any(isinstance(x, pandas.ArrowDtype) for x in dtypes): backend = "pyarrow" return backend def is_full_grab_slice(slc, sequence_len=None): """ Check that the passed slice grabs the whole sequence. Parameters ---------- slc : slice Slice object to check. 
sequence_len : int, optional Length of the sequence to index with the passed `slc`. If not specified the function won't be able to check whether ``slc.stop`` is equal or greater than the sequence length to consider `slc` to be a full-grab, and so, only slices with ``.stop is None`` are considered to be a full-grab. Returns ------- bool """ assert isinstance(slc, slice), "slice object required" return ( slc.start in (None, 0) and slc.step in (None, 1) and ( slc.stop is None or (sequence_len is not None and slc.stop >= sequence_len) ) ) def from_modin_frame_to_mi(df, sortorder=None, names=None): """ Make a pandas.MultiIndex from a DataFrame. Parameters ---------- df : DataFrame DataFrame to be converted to pandas.MultiIndex. sortorder : int, default: None Level of sortedness (must be lexicographically sorted by that level). names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite names with the given sequence. Returns ------- pandas.MultiIndex The pandas.MultiIndex representation of the given DataFrame. """ from .dataframe import DataFrame if isinstance(df, DataFrame): from modin.error_message import ErrorMessage ErrorMessage.default_to_pandas("`MultiIndex.from_frame`") df = df._to_pandas() return _original_pandas_MultiIndex_from_frame(df, sortorder, names) def is_label(obj, label, axis=0): """ Check whether or not 'obj' contain column or index level with name 'label'. Parameters ---------- obj : modin.pandas.DataFrame, modin.pandas.Series or modin.core.storage_formats.base.BaseQueryCompiler Object to check. label : object Label name to check. axis : {0, 1}, default: 0 Axis to search for `label` along. Returns ------- bool True if check is successful, False otherwise. 
""" qc = getattr(obj, "_query_compiler", obj) return hashable(label) and ( label in qc.get_axis(axis ^ 1) or label in qc.get_index_names(axis) ) def check_both_not_none(option1, option2): """ Check that both `option1` and `option2` are not None. Parameters ---------- option1 : Any First object to check if not None. option2 : Any Second object to check if not None. Returns ------- bool True if both option1 and option2 are not None, False otherwise. """ return not (option1 is None or option2 is None) def broadcast_item( obj, row_lookup, col_lookup, item, need_columns_reindex: bool = True, sort_lookups_and_item: bool = True, ): """ Use NumPy to broadcast or reshape item with reindexing. Parameters ---------- obj : DataFrame or Series or query compiler The object containing the necessary information about the axes. row_lookup : slice or scalar The global row index to locate inside of `item`. col_lookup : range, array, list, slice or scalar The global col index to locate inside of `item`. item : DataFrame, Series, or query_compiler Value that should be broadcast to a new shape of `to_shape`. need_columns_reindex : bool, default: True In the case of assigning columns to a dataframe (broadcasting is part of the flow), reindexing is not needed. sort_lookups_and_item : bool, default: True If set, sort the lookups in ascending order and the item to match. This is necessary to ensure writes across multiple partitions are ordered correctly when the lookups are unsorted. Returns ------- (np.ndarray, Optional[Series], array-like, array-like) * np.ndarray - `item` after it was broadcasted to `to_shape`. * Series - item's dtypes. * array-like - sorted version of `row_lookup` (may or may not be the same reference) * array-like - sorted version of `col_lookup` (may or may not be the same reference) Raises ------ ValueError 1) If `row_lookup` or `col_lookup` contains values missing in DataFrame/Series index or columns correspondingly. 
2) If `item` cannot be broadcast from its own shape to `to_shape`. Notes ----- NumPy is memory efficient, there shouldn't be performance issue. """ # It is valid to pass a DataFrame or Series to __setitem__ that is larger than # the target the user is trying to overwrite. from .dataframe import DataFrame from .series import Series new_row_len = ( len(obj.index[row_lookup]) if isinstance(row_lookup, slice) else len(row_lookup) ) new_col_len = ( len(obj.columns[col_lookup]) if isinstance(col_lookup, slice) else len(col_lookup) ) to_shape = new_row_len, new_col_len dtypes = None if isinstance(item, (pandas.Series, pandas.DataFrame, Series, DataFrame)): # convert indices in lookups to names, as pandas reindex expects them to be so axes_to_reindex = {} index_values = obj.index[row_lookup] if not index_values.equals(item.index): axes_to_reindex["index"] = index_values if need_columns_reindex and isinstance(item, (pandas.DataFrame, DataFrame)): column_values = obj.columns[col_lookup] if not column_values.equals(item.columns): axes_to_reindex["columns"] = column_values # New value for columns/index make that reindex add NaN values if axes_to_reindex: item = item.reindex(**axes_to_reindex) dtypes = item.dtypes if not isinstance(dtypes, pandas.Series): dtypes = pandas.Series([dtypes]) try: # Cast to numpy drop information about heterogeneous types (cast to common) # TODO: we shouldn't do that, maybe there should be the if branch item = np.array(item) def sort_index(lookup: Any) -> np.ndarray: """ Return the argsort and sorted version of the lookup index. Values in the lookup are guaranteed by the indexing frontend to be non-negative. The sort operation must be stable to ensure proper behavior for iloc set, which will use the last item encountered if two items share an index. 
""" if isinstance(lookup, slice): # Special case for if a descending slice is passed # Directly calling np.array(slice(...)) does not work lookup = range(lookup.start or 0, lookup.stop or 0, lookup.step or 0) argsort_index = np.argsort(lookup, kind="stable") return argsort_index, np.array(lookup)[argsort_index] def should_avoid_sort(lookup: Any) -> bool: return ( not sort_lookups_and_item or ( isinstance(lookup, (range, pandas.RangeIndex, slice)) and lookup.step is not None and lookup.step > 0 ) or (isinstance(lookup, slice) and lookup == slice(None)) ) # Fast path to avoid sorting for range/RangeIndex, which are already sorted, or the empty slice avoid_row_lookup_sort = should_avoid_sort(row_lookup) avoid_col_lookup_sort = should_avoid_sort(col_lookup) # Sort both the columns and rows if necessary if item.ndim >= 2: if avoid_row_lookup_sort: if not avoid_col_lookup_sort: col_argsort, col_lookup = sort_index(col_lookup) item = item[:, col_argsort] elif avoid_col_lookup_sort: row_argsort, row_lookup = sort_index(row_lookup) item = item[row_argsort, :] else: row_argsort, row_lookup = sort_index(row_lookup) col_argsort, col_lookup = sort_index(col_lookup) # Use np.ix_ to handle broadcasting errors item = item[np.ix_(row_argsort, col_argsort)] elif not avoid_row_lookup_sort: # Item is 1D, so only sort row indexer row_argsort, row_lookup = sort_index(row_lookup) item = item[row_argsort] if dtypes is None: dtypes = pandas.Series([item.dtype] * len(col_lookup)) if np.prod(to_shape) == np.prod(item.shape): return item.reshape(to_shape), dtypes, row_lookup, col_lookup else: return np.broadcast_to(item, to_shape), dtypes, row_lookup, col_lookup except ValueError: from_shape = np.array(item).shape raise ValueError( f"could not broadcast input array from shape {from_shape} into shape " + f"{to_shape}" ) def _walk_aggregation_func( key: IndexLabel, value: AggFuncType, depth: int = 0 ) -> Iterator[Tuple[IndexLabel, AggFuncTypeBase, Optional[str], bool]]: """ Walk over a function 
from a dictionary-specified aggregation. Note: this function is not supposed to be called directly and is used by ``walk_aggregation_dict``. Parameters ---------- key : IndexLabel A key in a dictionary-specified aggregation for the passed `value`. This means an index label to apply the `value` functions against. value : AggFuncType An aggregation function matching the `key`. depth : int, default: 0 Specifies a nesting level for the `value` where ``depth=0`` is when you call the function on a raw dictionary value. Yields ------ (col: IndexLabel, func: AggFuncTypeBase, func_name: Optional[str], col_renaming_required: bool) Yield an aggregation function with its metadata: - `col`: column name to apply the function. - `func`: aggregation function to apply to the column. - `func_name`: custom function name that was specified in the dict. - `col_renaming_required`: whether it's required to rename the `col` into ``(col, func_name)``. """ col_renaming_required = bool(depth) if isinstance(value, (list, tuple)): if depth == 0: for val in value: yield from _walk_aggregation_func(key, val, depth + 1) elif depth == 1: if len(value) != 2: raise ValueError( f"Incorrect rename format. Renamer must consist of exactly two elements, got: {len(value)}." ) func_name, func = value yield key, func, func_name, col_renaming_required else: # pandas doesn't support this as well raise NotImplementedError("Nested renaming is not supported.") else: yield key, value, None, col_renaming_required def walk_aggregation_dict( agg_dict: AggFuncTypeDict, ) -> Iterator[Tuple[IndexLabel, AggFuncTypeBase, Optional[str], bool]]: """ Walk over an aggregation dictionary. Parameters ---------- agg_dict : AggFuncTypeDict Yields ------ (col: IndexLabel, func: AggFuncTypeBase, func_name: Optional[str], col_renaming_required: bool) Yield an aggregation function with its metadata: - `col`: column name to apply the function. - `func`: aggregation function to apply to the column. 
- `func_name`: custom function name that was specified in the dict. - `col_renaming_required`: whether it's required to rename the `col` into ``(col, func_name)``. """ for key, value in agg_dict.items(): yield from _walk_aggregation_func(key, value) def _doc_binary_op(operation, bin_op, left="Series", right="right", returns="Series"): """ Return callable documenting `Series` or `DataFrame` binary operator. Parameters ---------- operation : str Operation name. bin_op : str Binary operation name. left : str, default: 'Series' The left object to document. right : str, default: 'right' The right operand name. returns : str, default: 'Series' Type of returns. Returns ------- callable """ if left == "Series": right_type = "Series or scalar value" elif left == "DataFrame": right_type = "DataFrame, Series or scalar value" elif left == "BasePandasDataset": right_type = "BasePandasDataset or scalar value" else: raise NotImplementedError( f"Only 'BasePandasDataset', `DataFrame` and 'Series' `left` are allowed, actually passed: {left}" ) doc_op = doc( _doc_binary_operation, operation=operation, right=right, right_type=right_type, bin_op=bin_op, returns=returns, left=left, ) return doc_op _original_pandas_MultiIndex_from_frame = pandas.MultiIndex.from_frame pandas.MultiIndex.from_frame = from_modin_frame_to_mi ================================================ FILE: modin/pandas/window.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Implement Window and Rolling public API."""

from __future__ import annotations

from typing import TYPE_CHECKING, Optional, Union

import pandas.core.window.rolling
from pandas.core.dtypes.common import is_list_like

from modin.error_message import ErrorMessage
from modin.logging import ClassLogger
from modin.pandas.utils import cast_function_modin2pandas
from modin.utils import _inherit_docstrings

if TYPE_CHECKING:
    from modin.core.storage_formats import BaseQueryCompiler
    from modin.pandas import DataFrame, Series


@_inherit_docstrings(pandas.core.window.rolling.Window)
class Window(ClassLogger):
    _dataframe: Union[DataFrame, Series]
    _query_compiler: BaseQueryCompiler

    def __init__(
        self,
        dataframe: Union[DataFrame, Series],
        window=None,
        min_periods=None,
        center=False,
        win_type=None,
        on=None,
        axis=0,
        closed=None,
        step=None,
        method="single",
    ):
        self._dataframe = dataframe
        self._query_compiler = dataframe._query_compiler
        # The window configuration is handed as-is to every ``window_*``
        # query compiler call below.
        self.window_kwargs = {
            "window": window,
            "min_periods": min_periods,
            "center": center,
            "win_type": win_type,
            "on": on,
            "axis": axis,
            "closed": closed,
            "step": step,
            "method": method,
        }
        self.axis = axis

    def mean(self, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.window_mean(
                self.axis, self.window_kwargs, *args, **kwargs
            )
        )

    def sum(self, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.window_sum(
                self.axis, self.window_kwargs, *args, **kwargs
            )
        )

    def var(self, ddof=1, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.window_var(
                self.axis, self.window_kwargs, ddof, *args, **kwargs
            )
        )

    def std(self, ddof=1, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.window_std(
                self.axis, self.window_kwargs, ddof, *args, **kwargs
            )
        )


@_inherit_docstrings(
    pandas.core.window.rolling.Rolling,
    excluded=[pandas.core.window.rolling.Rolling.__init__],
)
class Rolling(ClassLogger):
    def __init__(
        self,
        dataframe,
        window=None,
        min_periods=None,
        center=False,
        win_type=None,
        on=None,
        axis=0,
        closed=None,
        step=None,
        method="single",
    ):
        if step is not None:
            raise NotImplementedError("step parameter is not implemented yet.")
        self._dataframe = dataframe
        self._query_compiler = dataframe._query_compiler
        # The rolling configuration is handed as-is to every ``rolling_*``
        # query compiler call (see ``_call_qc_method``).
        self.rolling_kwargs = {
            "window": window,
            "min_periods": min_periods,
            "center": center,
            "win_type": win_type,
            "on": on,
            "axis": axis,
            "closed": closed,
            "step": step,
            "method": method,
        }
        self.axis = axis

    def _call_qc_method(self, method_name, *args, **kwargs):
        """
        Call a query compiler method for the specified rolling aggregation.

        Parameters
        ----------
        method_name : str
            Name of the aggregation.
        *args : tuple
            Positional arguments to pass to the query compiler method.
        **kwargs : dict
            Keyword arguments to pass to the query compiler method.

        Returns
        -------
        BaseQueryCompiler
            QueryCompiler holding the result of the aggregation.
        """
        qc_method = getattr(self._query_compiler, f"rolling_{method_name}")
        return qc_method(self.axis, self.rolling_kwargs, *args, **kwargs)

    def _aggregate(self, method_name, *args, **kwargs):
        """
        Run the specified rolling aggregation.

        Parameters
        ----------
        method_name : str
            Name of the aggregation.
        *args : tuple
            Positional arguments to pass to the aggregation.
        **kwargs : dict
            Keyword arguments to pass to the aggregation.

        Returns
        -------
        DataFrame or Series
            Result of the aggregation.
        """
        qc_result = self._call_qc_method(method_name, *args, **kwargs)
        return self._dataframe.__constructor__(query_compiler=qc_result)

    def count(self):
        return self._aggregate("count")

    def sem(self, *args, **kwargs):
        return self._aggregate("sem", *args, **kwargs)

    def sum(self, *args, **kwargs):
        return self._aggregate("sum", *args, **kwargs)

    def mean(self, *args, **kwargs):
        return self._aggregate("mean", *args, **kwargs)

    def median(self, **kwargs):
        return self._aggregate("median", **kwargs)

    def var(self, ddof=1, *args, **kwargs):
        return self._aggregate("var", ddof, *args, **kwargs)

    def std(self, ddof=1, *args, **kwargs):
        return self._aggregate("std", ddof, *args, **kwargs)

    def min(self, *args, **kwargs):
        return self._aggregate("min", *args, **kwargs)

    def max(self, *args, **kwargs):
        return self._aggregate("max", *args, **kwargs)

    def corr(self, other=None, pairwise=None, *args, **kwargs):
        from .dataframe import DataFrame
        from .series import Series

        # A Modin `other` is materialized to pandas before being passed on.
        if isinstance(other, DataFrame):
            other = other._query_compiler.to_pandas()
        elif isinstance(other, Series):
            other = other._query_compiler.to_pandas().squeeze()

        return self._aggregate("corr", other, pairwise, *args, **kwargs)

    def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs):
        from .dataframe import DataFrame
        from .series import Series

        # A Modin `other` is materialized to pandas before being passed on.
        if isinstance(other, DataFrame):
            other = other._query_compiler.to_pandas()
        elif isinstance(other, Series):
            other = other._query_compiler.to_pandas().squeeze()

        return self._aggregate("cov", other, pairwise, ddof, **kwargs)

    def skew(self, **kwargs):
        return self._aggregate("skew", **kwargs)

    def kurt(self, **kwargs):
        return self._aggregate("kurt", **kwargs)

    def apply(
        self,
        func,
        raw=False,
        engine="cython",
        engine_kwargs=None,
        args=None,
        kwargs=None,
    ):
        func = cast_function_modin2pandas(func)
        return self._aggregate("apply", func, raw, engine, engine_kwargs, args, kwargs)

    def aggregate(
        self,
        func,
        *args,
        **kwargs,
    ):
        from .dataframe import DataFrame

        dataframe = DataFrame(
            query_compiler=self._call_qc_method(
                "aggregate",
                func,
                *args,
                **kwargs,
            )
        )
        if isinstance(self._dataframe, DataFrame):
            return dataframe
        elif is_list_like(func) and dataframe.columns.nlevels > 1:
            dataframe.columns = dataframe.columns.droplevel()
            return dataframe
        else:
            return dataframe.squeeze()

    agg = aggregate

    def quantile(self, q, interpolation="linear", **kwargs):
        return self._aggregate("quantile", q, interpolation, **kwargs)

    def rank(
        self, method="average", ascending=True, pct=False, numeric_only=False, **kwargs
    ):
        return self._aggregate("rank", method, ascending, pct, numeric_only, **kwargs)


@_inherit_docstrings(Rolling)
class RollingGroupby(Rolling):
    def __init__(self, groupby_obj, *args, **kwargs):
        self._as_index = groupby_obj._kwargs.get("as_index", True)
        # The wrapped groupby always runs with ``as_index=True``; the requested
        # value is remembered in ``self._as_index`` and applied in ``_aggregate``.
        self._groupby_obj = (
            groupby_obj if self._as_index else groupby_obj._override(as_index=True)
        )
        super().__init__(self._groupby_obj._df, *args, **kwargs)

    def sem(self, *args, **kwargs):
        # NOTE(review): the warning text talks about 'as_index=False' but it is
        # emitted regardless of `self._as_index` - confirm whether that is intended.
        ErrorMessage.mismatch_with_pandas(
            operation="RollingGroupby.sem() when 'as_index=False'",
            message=(
                "The group columns won't be involved in the aggregation.\n"
                + "See this gh-issue for more information: https://github.com/modin-project/modin/issues/6291"
            ),
        )
        return super().sem(*args, **kwargs)

    def corr(self, other=None, pairwise=None, *args, **kwargs):
        # pandas behavior is that it always assumes that 'as_index=True' for the '.corr()' method
        return super().corr(
            *args, as_index=True, other=other, pairwise=pairwise, **kwargs
        )

    def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs):
        # pandas behavior is that it always assumes that 'as_index=True' for the '.cov()' method
        # `ddof` has to be forwarded explicitly: it is a named parameter here, so it
        # is not part of `kwargs` and would otherwise fall back to the parent default.
        return super().cov(
            as_index=True, other=other, pairwise=pairwise, ddof=ddof, **kwargs
        )

    def _aggregate(self, method_name, *args, as_index=None, **kwargs):
        """
        Run the specified rolling aggregation.

        Parameters
        ----------
        method_name : str
            Name of the aggregation.
        *args : tuple
            Positional arguments to pass to the aggregation.
        as_index : bool, optional
            Whether the result should have the group labels as index levels or as columns.
            If not specified the parameter value will be taken from groupby kwargs.
        **kwargs : dict
            Keyword arguments to pass to the aggregation.

        Returns
        -------
        DataFrame or Series
            Result of the aggregation.
        """
        res = self._groupby_obj._wrap_aggregation(
            qc_method=type(self._query_compiler).groupby_rolling,
            numeric_only=False,
            agg_args=args,
            agg_kwargs=kwargs,
            agg_func=method_name,
            rolling_kwargs=self.rolling_kwargs,
        )

        if as_index is None:
            as_index = self._as_index

        if not as_index:
            # Move the leading group levels out of the index and back into columns.
            res = res.reset_index(
                level=list(range(len(self._groupby_obj._internal_by))),
                drop=False,
            )

        return res

    def _call_qc_method(self, method_name, *args, **kwargs):
        return self._aggregate(method_name, *args, **kwargs)._query_compiler


@_inherit_docstrings(
    pandas.core.window.expanding.Expanding,
    excluded=[pandas.core.window.expanding.Expanding.__init__],
)
class Expanding(ClassLogger):
    def __init__(self, dataframe, min_periods=1, axis=0, method="single"):
        self._dataframe = dataframe
        self._query_compiler = dataframe._query_compiler
        self.expanding_args = [min_periods, axis, method]
        self.axis = axis

    def aggregate(self, func, *args, **kwargs):
        from .dataframe import DataFrame

        dataframe = DataFrame(
            query_compiler=self._query_compiler.expanding_aggregate(
                self.axis, self.expanding_args, func, *args, **kwargs
            )
        )
        if isinstance(self._dataframe, DataFrame):
            return dataframe
        elif is_list_like(func) and dataframe.columns.nlevels > 1:
            # Same guard as ``Rolling.aggregate``: ``droplevel`` raises on a
            # single-level index, so only drop when there is a level to spare.
            dataframe.columns = dataframe.columns.droplevel()
            return dataframe
        else:
            return dataframe.squeeze()

    def sum(self, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.expanding_sum(
                self.axis, self.expanding_args, *args, **kwargs
            )
        )

    def min(self, *args, **kwargs):
        return self._dataframe.__constructor__(
            query_compiler=self._query_compiler.expanding_min(
                self.axis, self.expanding_args, *args, **kwargs
            )
        )

    def max(self, *args, **kwargs):
        return
self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_max( self.axis, self.expanding_args, *args, **kwargs ) ) def mean(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_mean( self.axis, self.expanding_args, *args, **kwargs ) ) def median(self, numeric_only=False, engine=None, engine_kwargs=None, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_median( self.axis, self.expanding_args, numeric_only=numeric_only, engine=engine, engine_kwargs=engine_kwargs, **kwargs, ) ) def var(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_var( self.axis, self.expanding_args, *args, **kwargs ) ) def std(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_std( self.axis, self.expanding_args, *args, **kwargs ) ) def count(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_count( self.axis, self.expanding_args, *args, **kwargs ) ) def cov(self, other=None, pairwise=None, ddof=1, numeric_only=False, **kwargs): from .dataframe import DataFrame from .series import Series return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_cov( self.axis, self.expanding_args, squeeze_self=isinstance(self._dataframe, Series), squeeze_other=isinstance(other, Series), other=( other._query_compiler if isinstance(other, (Series, DataFrame)) else other ), pairwise=pairwise, ddof=ddof, numeric_only=numeric_only, **kwargs, ) ) def corr(self, other=None, pairwise=None, ddof=1, numeric_only=False, **kwargs): from .dataframe import DataFrame from .series import Series return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_corr( self.axis, self.expanding_args, squeeze_self=isinstance(self._dataframe, Series), squeeze_other=isinstance(other, 
Series), other=( other._query_compiler if isinstance(other, (Series, DataFrame)) else other ), pairwise=pairwise, ddof=ddof, numeric_only=numeric_only, **kwargs, ) ) def sem(self, ddof=1, numeric_only=False, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_sem( self.axis, self.expanding_args, ddof=ddof, numeric_only=numeric_only, *args, **kwargs, ) ) def skew(self, numeric_only=False, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_skew( self.axis, self.expanding_args, numeric_only=numeric_only, **kwargs ) ) def kurt(self, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_kurt( self.axis, self.expanding_args, **kwargs ) ) def quantile(self, q, interpolation="linear", **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_quantile( self.axis, self.expanding_args, q, interpolation, **kwargs ) ) def rank( self, method="average", ascending=True, pct=False, numeric_only=False, **kwargs ): return self._dataframe.__constructor__( query_compiler=self._query_compiler.expanding_rank( self.axis, self.expanding_args, method, ascending, pct, numeric_only, **kwargs, ) ) ================================================ FILE: modin/polars/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
class BasePolarsDataset:
    """
    Polars-style API shared by datasets that wrap a Modin query compiler.

    Methods that return a new dataset call ``self.__constructor__`` with a
    ``_query_compiler`` keyword, so subclasses must accept that keyword.
    Some methods also rely on attributes this class does not define
    (``height``, ``columns``, ``to_pandas``, ``min``/``max``/``mean``).
    """

    _query_compiler: BaseQueryCompiler

    @property
    def __constructor__(self):
        """
        DataFrame constructor.

        Returns:
            Constructor of the DataFrame
        """
        return type(self)

    def _binary_op(self, qc_method_name: str, other) -> "BasePolarsDataset":
        """
        Apply a binary query compiler method to this dataset and ``other``.

        Args:
            qc_method_name: Name of the query compiler method to call.
            other: Right operand; a ``BasePolarsDataset`` is replaced by its
                query compiler, anything else is passed through unchanged.

        Returns:
            New dataset wrapping the resulting query compiler.
        """
        operand = (
            other._query_compiler if isinstance(other, BasePolarsDataset) else other
        )
        qc_method = getattr(self._query_compiler, qc_method_name)
        return self.__constructor__(_query_compiler=qc_method(operand))

    def __eq__(self, other) -> "BasePolarsDataset":
        return self._binary_op("eq", other)

    def __ne__(self, other) -> "BasePolarsDataset":
        return self._binary_op("ne", other)

    def __add__(self, other) -> "BasePolarsDataset":
        return self._binary_op("add", other)

    def __sub__(self, other) -> "BasePolarsDataset":
        return self._binary_op("sub", other)

    def __mul__(self, other) -> "BasePolarsDataset":
        return self._binary_op("mul", other)

    def __truediv__(self, other) -> "BasePolarsDataset":
        return self._binary_op("truediv", other)

    def __floordiv__(self, other) -> "BasePolarsDataset":
        return self._binary_op("floordiv", other)

    def __mod__(self, other) -> "BasePolarsDataset":
        return self._binary_op("mod", other)

    def __pow__(self, other) -> "BasePolarsDataset":
        return self._binary_op("pow", other)

    def __and__(self, other) -> "BasePolarsDataset":
        return self._binary_op("__and__", other)

    def __or__(self, other) -> "BasePolarsDataset":
        return self._binary_op("__or__", other)

    def __xor__(self, other) -> "BasePolarsDataset":
        return self._binary_op("__xor__", other)

    def __lt__(self, other) -> "BasePolarsDataset":
        return self._binary_op("lt", other)

    def __le__(self, other) -> "BasePolarsDataset":
        return self._binary_op("le", other)

    def __gt__(self, other) -> "BasePolarsDataset":
        return self._binary_op("gt", other)

    def __ge__(self, other) -> "BasePolarsDataset":
        return self._binary_op("ge", other)

    def __invert__(self) -> "BasePolarsDataset":
        return self.__constructor__(_query_compiler=self._query_compiler.invert())

    def __neg__(self) -> "BasePolarsDataset":
        return self.__constructor__(_query_compiler=self._query_compiler.negative())

    def __abs__(self) -> "BasePolarsDataset":
        return self.__constructor__(_query_compiler=self._query_compiler.abs())

    def is_duplicated(self):
        """
        Determine whether each row is a duplicate in the DataFrame.

        Returns:
            DataFrame with True for each duplicate row, and False for unique rows.
        """
        return self.__constructor__(
            _query_compiler=self._query_compiler.duplicated(keep=False)
        )

    def is_empty(self) -> bool:
        """
        Determine whether the DataFrame is empty.

        Returns:
            True if the DataFrame is empty, False otherwise
        """
        return self.height == 0

    def is_unique(self):
        """
        Determine whether each row is unique in the DataFrame.

        Returns:
            DataFrame with True for each unique row, and False for duplicate rows.
        """
        return self.__constructor__(
            _query_compiler=self._query_compiler.duplicated(keep=False).invert()
        )

    def n_chunks(self, strategy: str = "first") -> int | list[int]:
        raise NotImplementedError("not yet")

    def to_arrow(self):
        """
        Convert the DataFrame to Arrow format.

        Returns:
            Arrow representation of the DataFrame.
        """
        return polars.from_pandas(self._query_compiler.to_pandas()).to_arrow()

    def to_jax(self, device=None):
        """
        Convert the DataFrame to JAX format.

        Args:
            device: The device to use.

        Returns:
            JAX representation of the DataFrame.
        """
        return polars.from_pandas(self._query_compiler.to_pandas()).to_jax(
            device=device
        )

    def to_numpy(
        self,
        *,
        writable: bool = False,
        allow_copy: bool = True,
        use_pyarrow: bool | None = None,
        zero_copy_only: bool | None = None,
    ) -> "np.ndarray":
        """
        Convert the DataFrame to a NumPy representation.

        Args:
            writable: Whether the NumPy array should be writable.
            allow_copy: Whether to allow copying the data.
            use_pyarrow: Whether to use PyArrow for conversion.
            zero_copy_only: Whether to use zero-copy conversion only.

        Returns:
            NumPy representation of the DataFrame.
        """
        return polars.from_pandas(self._query_compiler.to_pandas()).to_numpy(
            writable=writable,
            allow_copy=allow_copy,
            use_pyarrow=use_pyarrow,
            zero_copy_only=zero_copy_only,
        )

    def to_torch(self):
        """
        Convert the DataFrame to PyTorch format.

        Returns:
            PyTorch representation of the DataFrame.
        """
        return polars.from_pandas(self._query_compiler.to_pandas()).to_torch()

    def bottom_k(
        self,
        k: int,
        *,
        by,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool | Sequence[bool] | None = None,
        maintain_order: bool | None = None,
    ) -> "BasePolarsDataset":
        raise NotImplementedError("not yet")

    def cast(self, dtypes, *, strict: bool = True) -> "BasePolarsDataset":
        """
        Cast the DataFrame to the given dtypes.

        Args:
            dtypes: Dtypes to cast the DataFrame to.
            strict: Whether to enforce strict casting (currently ignored).

        Returns:
            DataFrame with the new dtypes.
        """
        # TODO: support strict
        return self.__constructor__(_query_compiler=self._query_compiler.astype(dtypes))

    def clone(self) -> "BasePolarsDataset":
        """
        Clone the DataFrame.

        Returns:
            Cloned DataFrame.
        """
        # Built from the query compiler directly so this does not depend on a
        # subclass providing a public ``copy`` method.
        return self.__constructor__(_query_compiler=self._query_compiler.copy())

    def drop_nulls(self, subset=None):
        """
        Drop the rows with null values.

        Args:
            subset: Columns to consider for null values.

        Returns:
            DataFrame with the rows with null values dropped.
        """
        return self.__constructor__(
            _query_compiler=self._query_compiler.dropna(subset=subset, how="any")
        )

    def explode(self, columns: str, *more_columns: str) -> "BasePolarsDataset":
        """
        Explode the given columns to long format.

        Args:
            columns: Columns to explode.
            more_columns: Additional columns to explode.

        Returns:
            DataFrame with the columns exploded.
        """
        if more_columns:
            columns = [columns, *more_columns]
        return self.__constructor__(
            _query_compiler=self._query_compiler.explode(columns)
        )

    def extend(self, other: "BasePolarsDataset") -> "BasePolarsDataset":
        """
        Extend the DataFrame with another DataFrame.

        Args:
            other: DataFrame to extend with.

        Returns:
            Extended DataFrame for convenience. DataFrame is modified in place.
        """
        self._query_compiler = self._query_compiler.concat(
            axis=0, other=other._query_compiler
        )
        return self

    def fill_nan(self, value):
        """
        Fill NaN values with the given value.

        Args:
            value: Value to fill NaN values with.

        Returns:
            DataFrame with NaN values filled.
        """
        # TODO: Handle null values differently than nan.
        return self.__constructor__(_query_compiler=self._query_compiler.fillna(value))

    def fill_null(
        self,
        value: Any | None = None,
        strategy: str | None = None,
        limit: int | None = None,
        *,
        matches_supertype: bool = True,
    ) -> "BasePolarsDataset":
        """
        Fill null values with the given value or strategy.

        Args:
            value: Value to fill null values with.
            strategy: Strategy to fill null values with. One of "forward",
                "backward", "min", "max", "mean", "zero" or "one"; ``None``
                means a plain fill with ``value``.
            limit: Maximum number of null values to fill.
            matches_supertype: Whether the value matches the supertype
                (currently ignored).

        Returns:
            DataFrame with null values filled.

        Raises:
            ValueError: If ``strategy`` is not recognised, or if neither
                ``value`` nor ``strategy`` is given.
        """
        if strategy is None:
            # Plain value fill; without a value there is nothing to fill with.
            if value is None:
                raise ValueError("must specify either a fill `value` or `strategy`")
            method = None
        elif strategy == "forward":
            method = "ffill"
        elif strategy == "backward":
            method = "bfill"
        elif strategy in ("min", "max", "mean"):
            # Fill with the per-column aggregate of the same name.
            value = getattr(self, strategy)()._query_compiler
            method = None
        elif strategy == "zero":
            value, method = 0, None
        elif strategy == "one":
            value, method = 1, None
        else:
            raise ValueError(f"Unknown strategy: {strategy}")
        return self.__constructor__(
            _query_compiler=self._query_compiler.fillna(
                value=value, method=method, limit=limit
            )
        )

    def filter(self, *predicates, **constraints: Any) -> "BasePolarsDataset":
        """
        Keep the rows for which every predicate holds.

        Args:
            predicates: Boolean masks (datasets); they are combined with ``&``.
            constraints: Named constraints (not supported).

        Returns:
            DataFrame with the rows selected by the combined mask.

        Raises:
            NotImplementedError: If named constraints are given.
            ValueError: If no predicate is given.
        """
        if constraints:
            raise NotImplementedError("Named constraints are not supported")
        if not predicates:
            raise ValueError("filter() requires at least one predicate")
        # Fold the remaining predicates into the first one; the tuple itself
        # must stay intact while iterating.
        combined = predicates[0]
        for predicate in predicates[1:]:
            combined = combined & predicate
        return self.__constructor__(
            _query_compiler=self._query_compiler.getitem_array(
                combined._query_compiler
            )
        )

    def gather_every(self, n: int, offset: int = 0) -> "BasePolarsDataset":
        """
        Gather every nth row of the DataFrame.

        Args:
            n: Step between gathered rows.
            offset: Offset to start gathering from.

        Returns:
            DataFrame with every nth row gathered.
        """
        return self.__constructor__(
            _query_compiler=self._query_compiler.getitem_row_array(
                slice(offset, None, n)
            )
        )

    def head(self, n: int = 5) -> "BasePolarsDataset":
        """
        Get the first n rows of the DataFrame.

        Args:
            n: Number of rows to get.

        Returns:
            DataFrame with the first n rows.
        """
        return self.__constructor__(
            _query_compiler=self._query_compiler.getitem_row_array(slice(0, n))
        )

    def limit(self, n: int = 10) -> "BasePolarsDataset":
        """
        Limit the DataFrame to the first n rows.

        Args:
            n: Number of rows to limit to.

        Returns:
            DataFrame with the first n rows.
        """
        return self.head(n)

    def interpolate(self) -> "BasePolarsDataset":
        """
        Interpolate the values of the DataFrame.

        Returns:
            DataFrame with the interpolated values.
        """
        return self.__constructor__(_query_compiler=self._query_compiler.interpolate())

    def sample(
        self,
        n: int | "Series" | None = None,
        *,
        fraction: float | "Series" | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> "BasePolarsDataset":
        """
        Sample the DataFrame.

        Args:
            n: Number of rows to sample.
            fraction: Fraction of rows to sample.
            with_replacement: Whether to sample with replacement.
            shuffle: Whether to shuffle the rows (currently not forwarded).
            seed: Seed for the random number generator.

        Returns:
            Sampled DataFrame.
        """
        sampled = self.to_pandas().sample(
            n=n, frac=fraction, replace=with_replacement, random_state=seed
        )
        return self.__constructor__(_query_compiler=sampled._query_compiler)

    def shift(self, n: int = 1, *, fill_value=None) -> "DataFrame":
        raise NotImplementedError("not yet")

    def shrink_to_fit(self) -> "DataFrame":
        """
        Shrink the DataFrame to fit in memory.

        Returns:
            A copy of the DataFrame.
        """
        # Built from the query compiler directly so this does not depend on a
        # subclass providing a public ``copy`` method.
        return self.__constructor__(_query_compiler=self._query_compiler.copy())

    def slice(self, offset: int, length: int) -> "DataFrame":
        """
        Slice the DataFrame.

        Args:
            offset: Offset to start the slice from; negative values count from
                the end.
            length: Length of the slice.

        Returns:
            Sliced DataFrame.
        """
        stop = offset + length
        if offset < 0 <= stop:
            # The window starts near the end and runs to (or past) the last
            # row; a non-negative stop would wrongly select nothing.
            stop = None
        return self.__constructor__(
            _query_compiler=self._query_compiler.getitem_row_array(
                slice(offset, stop)
            )
        )

    def sort(
        self,
        by,
        *more_by,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool | Sequence[bool] | None = None,
        multithreaded: bool = True,
        maintain_order: bool = False,
    ) -> "DataFrame":
        """
        Sort the DataFrame.

        Args:
            by: Column to sort by.
            more_by: Additional columns to sort by.
            descending: Whether to sort in descending order.
            nulls_last: Whether to sort null values last.
            multithreaded: Whether to use multiple threads (currently ignored).
            maintain_order: Whether to maintain the order of the DataFrame
                (currently ignored).

        Returns:
            Sorted DataFrame.
        """
        # TODO: support expressions in by
        if more_by:
            by = [by, *more_by]
        return self.__constructor__(
            _query_compiler=self._query_compiler.sort_rows_by_column_values(
                by=by,
                reverse=descending,
                nulls_first=None if nulls_last is None else not nulls_last,
            )
        )

    def tail(self, n: int = 5) -> "DataFrame":
        """
        Get the last n rows of the DataFrame.

        Args:
            n: Number of rows to get.

        Returns:
            DataFrame with the last n rows.
        """
        # ``slice(-0, None)`` would select every row, so zero is special-cased.
        rows = slice(0, 0) if n == 0 else slice(-n, None)
        return self.__constructor__(
            _query_compiler=self._query_compiler.getitem_row_array(rows)
        )

    def to_dummies(
        self,
        columns: str | Sequence[str] | None = None,
        *,
        separator: str = "_",
        drop_first: bool = False,
    ) -> "DataFrame":
        """
        Convert the columns to dummy variables.

        Args:
            columns: Columns to convert to dummy variables; all columns when
                not given.
            separator: Separator between the column name and the value in the
                names of the dummy columns.
            drop_first: Whether to drop the first dummy variable of each
                encoded column.

        Returns:
            DataFrame with the columns converted to dummy variables.
        """
        if columns is None:
            columns = self.columns
        elif isinstance(columns, str):
            columns = [columns]
        # NOTE(review): relies on the query compiler forwarding extra keywords
        # to ``pandas.get_dummies`` - confirm against the query compiler API.
        return self.__constructor__(
            _query_compiler=self._query_compiler.get_dummies(
                columns, prefix_sep=separator, drop_first=drop_first
            )
        )

    def top_k(
        self,
        k: int,
        *,
        by,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool | Sequence[bool] | None = None,
        maintain_order: bool | None = None,
    ) -> "DataFrame":
        raise NotImplementedError("not yet")

    def unique(self, subset=None, *, keep="any", maintain_order: bool = False):
        """
        Get the unique values of the DataFrame.

        Args:
            subset: Columns to consider for unique values.
            keep: Strategy to keep unique values; "none" and "last" are not
                supported yet.
            maintain_order: Whether to maintain the order of the unique values
                (currently ignored).

        Returns:
            DataFrame with the unique values.

        Raises:
            NotImplementedError: If ``keep`` is "none" or "last".
        """
        if keep in ("none", "last"):
            # TODO: support keep="none"
            raise NotImplementedError("not yet")
        return self.__constructor__(
            _query_compiler=self._query_compiler.unique(subset=subset)
        )

    def equals(self, other: "BasePolarsDataset", *, null_equal: bool = True) -> bool:
        """
        Determine whether the DataFrame is equal to another DataFrame.

        Args:
            other: DataFrame to compare with.
            null_equal: Whether null values compare equal. When False, the
                result is False as soon as either side holds a null.

        Returns:
            True if the DataFrames are equal, False otherwise.
        """
        return (
            isinstance(other, type(self))
            and self._query_compiler.equals(other._query_compiler)
            and (
                null_equal
                or (
                    not self.to_pandas().isna().any(axis=None)
                    and not other.to_pandas().isna().any(axis=None)
                )
            )
        )

    @property
    def plot(self):
        return polars.from_pandas(self._query_compiler.to_pandas()).plot

    def count(self):
        """
        Get the number of non-null values in each column.

        Returns:
            DataFrame with the counts.
        """
        return self.__constructor__(_query_compiler=self._query_compiler.count(axis=0))
Args: data: Data to be converted to DataFrame. schema: Schema of the data. schema_overrides: Schema overrides. strict: Whether to enforce strict schema. orient: Orientation of the data. infer_schema_length: Length of the data to infer schema. nan_to_null: Whether to convert NaNs to nulls. _query_compiler: Query compiler to use. """ if _query_compiler is None: if isinstance(data, (ModinPandasDataFrame, ModinPandasSeries)): self._query_compiler: BaseQueryCompiler = data._query_compiler.copy() else: self._query_compiler: BaseQueryCompiler = from_pandas( polars.DataFrame( data=data, schema=schema, schema_overrides=schema_overrides, strict=strict, orient=orient, infer_schema_length=infer_schema_length, nan_to_null=nan_to_null, ).to_pandas() )._query_compiler else: self._query_compiler: BaseQueryCompiler = _query_compiler def __getitem__(self, item): """ Get item from DataFrame. Args: item: Column to get. Returns: Series or DataFrame with the column. """ if is_list_like(item): missing = [i for i in item if i not in self.columns] if len(missing) > 0: raise polars.exceptions.ColumnNotFoundError(missing[0]) return self.__constructor__( _query_compiler=self._query_compiler.getitem_array(item) ) else: if item not in self.columns: raise polars.exceptions.ColumnNotFoundError(item) from .series import Series return Series(_query_compiler=self._query_compiler.getitem_array([item])) def _copy(self): return self.__constructor__(_query_compiler=self._query_compiler.copy()) def _to_polars(self) -> polars.DataFrame: """ Convert the DataFrame to Polars format. Returns: Polars representation of the DataFrame. """ return polars.from_pandas(self._query_compiler.to_pandas()) def _get_columns(self): """ Get columns of the DataFrame. Returns: List of columns. """ return list(self._query_compiler.columns) def _set_columns(self, new_columns): """ Set columns of the DataFrame. Args: new_columns: New columns to set. 
""" new_query_compiler = self._query_compiler.copy() new_query_compiler.columns = new_columns self._query_compiler = new_query_compiler columns = property(_get_columns, _set_columns) _sorted_columns_cache = None def _get_sorted_columns(self): if self._sorted_columns_cache is None: self._sorted_columns_cache = [False] * len(self.columns) return self._sorted_columns_cache def _set_sorted_columns(self, value): self._sorted_columns_cache = value _sorted_columns = property(_get_sorted_columns, _set_sorted_columns) @property def dtypes(self): """ Get dtypes of the DataFrame. Returns: List of dtypes. """ return polars.from_pandas( pandas.DataFrame(columns=self.columns).astype(self._query_compiler.dtypes) ).dtypes @property def flags(self): """ Get flags of the DataFrame. Returns: List of flags. """ # TODO: Add flags support return [] @property def height(self): """ Get height of the DataFrame. Returns: Number of rows in the DataFrame. """ return len(self._query_compiler.index) @property def schema(self): """ Get schema of the DataFrame. Returns: OrderedDict of column names and dtypes. """ return OrderedDict(zip(self.columns, self.dtypes, strict=True)) @property def shape(self): """ Get shape of the DataFrame. Returns: Tuple of (height, width """ return self.height, self.width @property def width(self): """ Get width of the DataFrame. Returns: Number of columns in the DataFrame. """ return len(self.columns) def __repr__(self): """ Get string representation of the DataFrame. Returns: String representation of the DataFrame. """ return repr(polars.from_pandas(self._query_compiler.to_pandas())) def max(self, axis=None): """ Get the maximum value in each column. Args: axis: Axis to get the maximum value on. Returns: DataFrame with the maximum values. """ if axis is None or axis == 0: return self.__constructor__( _query_compiler=self._query_compiler.max(axis=0) ) else: return self.max_horizontal() def max_horizontal(self): """ Get the maximum value in each row. 
Returns: DataFrame with the maximum values. """ return self.__constructor__(_query_compiler=self._query_compiler.max(axis=1)) def _convert_non_numeric_to_null(self): """ Convert non-numeric columns to null. Returns: DataFrame with non-numeric columns converted to null. """ non_numeric_cols = [ c for c, t in zip(self.columns, self.dtypes, strict=True) if not t.is_numeric() ] if len(non_numeric_cols) > 0: return self.__constructor__( _query_compiler=self._query_compiler.write_items( slice(None), [self.columns.index(c) for c in non_numeric_cols], pandas.NA, need_columns_reindex=False, ).astype({c: self._query_compiler.dtypes[c] for c in non_numeric_cols}) ) return self._copy() def mean(self, *, axis=None, null_strategy="ignore"): """ Get the mean of each column. Args: axis: Axis to get the mean on. null_strategy: Strategy to handle null values. Returns: DataFrame with the mean of each column or row. """ # TODO: this converts non numeric columns to numeric obj = self._convert_non_numeric_to_null() if axis is None or axis == 0: return self.__constructor__( _query_compiler=obj._query_compiler.mean( axis=0, skipna=True if null_strategy == "ignore" else False, ) ) else: return obj.mean_horizontal( ignore_nulls=True if null_strategy == "ignore" else False ) def median(self) -> "DataFrame": """ Get the median of each column. Returns: DataFrame with the median of each column. """ return self.__constructor__( _query_compiler=self._convert_non_numeric_to_null()._query_compiler.median( 0 ) ) def mean_horizontal(self, *, ignore_nulls: bool = True): """ Get the mean of each row. Args: ignore_nulls: Whether to ignore null values. Returns: DataFrame with the mean of each row. """ obj = self._convert_non_numeric_to_null() return self.__constructor__( _query_compiler=obj._query_compiler.mean(axis=1, skipna=ignore_nulls) ) def min(self, axis=None): """ Get the minimum value in each column. Args: axis: Axis to get the minimum value on. 
Returns: DataFrame with the minimum values of each row or column. """ if axis is None or axis == 0: return self.__constructor__( _query_compiler=self._query_compiler.min(axis=0) ) else: return self.max_horizontal() def min_horizontal(self): """ Get the minimum value in each row. Returns: DataFrame with the minimum values of each row. """ return self.__constructor__(_query_compiler=self._query_compiler.min(axis=1)) def product(self): """ Get the product of each column. Returns: DataFrame with the product of each column. """ obj = self._convert_non_numeric_to_null() return self.__constructor__(_query_compiler=obj._query_compiler.prod(axis=0)) def quantile(self, quantile: float, interpolation="nearest"): """ Get the quantile of each column. Args: quantile: Quantile to get. interpolation: Interpolation method. Returns: DataFrame with the quantile of each column. """ obj = self._convert_non_numeric_to_null() # TODO: interpolation support return self.__constructor__( _query_compiler=obj._query_compiler.quantile_for_single_value(quantile) ) def std(self, ddof: int = 1): """ Get the standard deviation of each column. Args: ddof: Delta degrees of freedom. Returns: DataFrame with the standard deviation of each column """ obj = self._convert_non_numeric_to_null() return self.__constructor__(_query_compiler=obj._query_compiler.std(ddof=ddof)) def sum(self, axis: int | None = None, null_strategy="ignore"): """ Get the sum of each column. Args: axis: Axis to get the sum on. null_strategy: Strategy to handle null values. Returns: DataFrame with the sum of each column or row. """ obj = self._convert_non_numeric_to_null() if axis is None or axis == 0: return self.__constructor__( _query_compiler=obj._query_compiler.sum( axis=0, skipna=True if null_strategy == "ignore" else False, ) ) else: return obj.sum_horizontal( ignore_nulls=True if null_strategy == "ignore" else False ) def sum_horizontal(self, *, ignore_nulls: bool = True): """ Get the sum of each row. 
Args: ignore_nulls: Whether to ignore null values. Returns: DataFrame with the sum of each row. """ # TODO: if there are strings in the row, polars will append numeric values # this behavior may not be intended so doing this instead (for now) obj = self._convert_non_numeric_to_null() return self.__constructor__( _query_compiler=obj._query_compiler.sum(axis=1, skipna=ignore_nulls) ) def var(self, ddof: int = 1): """ Get the variance of each column. Args: ddof: Delta degrees of freedom. Returns: DataFrame with the variance of each column. """ obj = self._convert_non_numeric_to_null() return self.__constructor__(_query_compiler=obj._query_compiler.var(ddof=ddof)) def approx_n_unique(self): """ Get the approximate number of unique values in each column. Returns: DataFrame with the approximate number of unique values in each column. """ return self.__constructor__(_query_compiler=self._query_compiler.nunique()) def describe(self, percentiles: Sequence[float] | float = (0.25, 0.5, 0.75)): """ Get the descriptive statistics of each column. Args: percentiles: Percentiles to get. Returns: DataFrame with the descriptive statistics of each column. """ return self.__constructor__( self.__constructor__( _query_compiler=self._query_compiler.describe( percentiles=np.array(percentiles) ).astype( { k: str for k, v in zip(self.columns, self.dtypes, strict=True) if v == polars.String } ) ) .to_pandas() .loc[ [ "count", # "null_count", TODO: support null_count in describe "mean", "std", "min", "25%", "50%", "75%", "max", ] ] .reset_index() .rename({"index": "statistic"}) ) def estimated_size(self, unit="b"): """ Get the estimated amount of memory used by the DataFrame. Args: unit: Unit of the memory size. Returns: DataFrame with the extimated memory usage. 
""" return self.__constructor__(_query_compiler=self._query_compiler.memory_usage()) def glimpse( self, *, max_items_per_column: int = 10, max_colname_length: int = 50, return_as_string: bool = False, ) -> str | None: raise NotImplementedError("not yet") def n_unique(self, subset=None) -> int: """ Get the number of unique values in each column. Args: subset: Columns to get the number of unique values for. Returns: Number of unique values in each column. """ if subset is not None: raise NotImplementedError("not yet") return ( self.is_unique()._query_compiler.sum(axis=0).to_pandas().squeeze(axis=None) ) def null_count(self) -> "DataFrame": """ Get the number of null values in each column. Returns: DataFrame with the number of null values in each column. """ return self.__constructor__( _query_compiler=self._query_compiler.isna().sum(axis=0) ) def to_pandas(self): """ Convert the DataFrame to Pandas format. Returns: modin.pandas representation of the DataFrame. """ return ModinPandasDataFrame(query_compiler=self._query_compiler.copy()) def group_by( self, *by, maintain_order: bool = False, **named_by, ) -> "GroupBy": """ Group the DataFrame by the given columns. Args: by: Columns to group by. maintain_order: Whether to maintain the order of the groups. named_by: Named columns to group by. Returns: GroupBy object. """ from modin.polars.groupby import GroupBy return GroupBy(self, *by, maintain_order=maintain_order, **named_by) def drop(self, *columns, strict: bool = True) -> "DataFrame": """ Drop the given columns. Args: columns: Columns to drop. strict: Whether to raise an error if a column is not found. Returns: DataFrame with the columns dropped. 
""" if strict: for c in columns: if c not in self.columns: raise KeyError(c) columns = list(columns) if not isinstance(columns[0], list) else columns[0] return self.__constructor__(_query_compiler=self._query_compiler.drop(columns)) def drop_in_place(self, name: str) -> "DataFrame": """ Drop the given column in place and return the dropped column. Args: name: Column to drop. Returns: The column that was dropped from the DataFrame. """ col_to_return = self[name] self._query_compiler = self._query_compiler.drop([name]) return col_to_return def get_column(self, name: str) -> "Series": """ Get the column by name. Args: name: Name of the column to get. Returns: Series with the column. """ return self[name] def get_column_index(self, name: str) -> int: """ Find the index of the column by name. Args: name: Name of the column to find. Returns: Index of the column. """ return self.columns.index(name) def get_columns(self) -> list["Series"]: """ Get the columns of the DataFrame. Returns: List of Series with the columns. """ return [self[name] for name in self.columns] def group_by_dynamic( self, index_column, *, every, period, offset, truncate, include_boundaries, closed, label, group_by, start_by, check_sorted, ): raise NotImplementedError("not yet") def hstack(self, columns, *, inplace: bool = False) -> "DataFrame": """ Stack the given columns horizontally. Args: columns: Columns to stack. inplace: Whether to stack the columns in place. Returns: DataFrame with the columns stacked horizontally. """ if isinstance(columns, DataFrame): columns = columns.get_columns() result_query_compiler = self._query_compiler.concat( axis=1, other=[c._query_compiler for c in columns] ) if inplace: self._query_compiler = result_query_compiler return self return self.__constructor__(_query_compiler=result_query_compiler) def insert_column(self, index: int, column: "Series") -> "DataFrame": """ Insert the given column at the given index. Args: index: Index to insert the column at. 
column: Column to insert. name: Name of the column to insert. Returns: DataFrame with the column inserted. """ return self.__constructor__( self._query_compiler.insert(index, column.name, column._query_compiler) ) def item(self, row: int | None = None, column: str | int | None = None) -> Any: """ Get the value at the given row and column. Args: row: Row to get the value from. column: Column to get the value from. Returns: Value at the given row and column. """ if row is None: row = 0 if column is None: column = 0 if isinstance(column, str): column = self.columns.index(column) return ( self._query_compiler.take_2d_labels(row, column) .to_pandas() .squeeze(axis=None) ) def iter_columns(self) -> Iterator["Series"]: """ Iterate over the columns of the DataFrame. Returns: Iterator over the columns. """ return iter(self.get_columns()) def iter_rows( self, *, named: bool = False, buffer_size: int = 512, ) -> Iterator[tuple[Any]] | Iterator[dict[str, Any]]: """ Iterate over the rows of the DataFrame. Returns: Iterator over the rows. """ raise NotImplementedError("not yet") def iter_slices( self, n_rows: int = 10000, ) -> Iterator["DataFrame"]: """ Iterate over the slices of the DataFrame. Args: n_rows: Number of rows in each slice. Returns: Iterator over the slices. """ raise NotImplementedError("not yet") def join( self, other: "DataFrame", on: str | list[str] | None = None, how: str = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, suffix: str = "_right", validate="m:m", join_nulls: bool = False, coalesce: bool | None = None, ) -> "DataFrame": """ Join the DataFrame with another DataFrame. Args: other: DataFrame to join with. on: Column to join on. how: How to join the DataFrames. Returns: Joined DataFrame. 
""" if how == "full": how = "outer" elif how == "cross": raise NotImplementedError("not yet") elif how == "semi": how = "right" elif how == "anti": raise NotImplementedError("not yet") return self.__constructor__( _query_compiler=self._query_compiler.merge( other._query_compiler, on=on, how=how, suffixes=("", suffix), left_on=left_on, right_on=right_on, ) ) def join_asof( self, other: "DataFrame", *, left_on: str | None = None, right_on: str | None = None, on: str | None = None, by_left: str | Sequence[str] | None = None, by_right: str | Sequence[str] | None = None, by: str | Sequence[str] | None = None, strategy: str = "backward", suffix: str = "_right", tolerance: str, ) -> "DataFrame": """ Join the DataFrame with another DataFrame using asof logic. Args: other: DataFrame to join with. left_on: Column to join on in the left DataFrame. right_on: Column to join on in the right DataFrame. on: Column to join on in both DataFrames. by_left: Columns to join on in the left DataFrame. by_right: Columns to join on in the right DataFrame. by: Columns to join on in both DataFrames. strategy: Strategy to use for the join. suffix: Suffix to add to the columns. tolerance: Tolerance for the join. Returns: Joined DataFrame. """ if on is not None and left_on is None and right_on is None: left_on = right_on = on if by is not None and by_left is None and by_right is None: by_left = by_right = by return self.__constructor__( _query_compiler=self._query_compiler.merge_asof( other._query_compiler, left_on=left_on, right_on=right_on, left_by=by_left, right_by=by_right, direction=strategy, suffixes=("", suffix), tolerance=tolerance, ) ) def melt( self, id_vars=None, value_vars=None, variable_name: str | None = None, value_name: str | None = None, ) -> "DataFrame": """ Melt the DataFrame. Args: id_vars: Columns to keep. value_vars: Columns to melt. variable_name: Name of the variable column. value_name: Name of the value column. Returns: Melted DataFrame. 
""" return self.__constructor__( _query_compiler=self._query_compiler.melt( id_vars=id_vars, value_vars=value_vars, var_name=variable_name, value_name=value_name, ) ) def merge_sorted(self, other: "DataFrame", on: str | list[str]) -> "DataFrame": # TODO: support natural join + sort raise NotImplementedError("not yet") def partition_by( self, by, *more_by, maintain_order: bool = True, include_key: bool = True, as_dict: bool = False, ) -> list["DataFrame"] | dict[Any, "DataFrame"]: """ Partition the DataFrame by the given columns. Args: by: Columns to partition by. more_by: Additional columns to partition by. maintain_order: Whether to maintain the order of the partitions. include_key: Whether to include the partition key. as_dict: Whether to return the partitions as a dictionary. Returns: List of DataFrames or dictionary of DataFrames. """ if isinstance(by, str): by = [by, *more_by] elif isinstance(by, list): by = [*by, *more_by] if as_dict: return { k: self.__constructor__(v) for k, v in self.to_pandas() .groupby(by, as_index=not include_key) .groups } else: return [ self.__constructor__(g) for g in self.to_pandas().groupby(by, as_index=not include_key) ] def pipe(self, function, *args, **kwargs) -> Any: return function(self, *args, **kwargs) def pivot( self, *, values, index, columns, aggregate_function=None, maintain_order: bool = True, sort_columns: bool = False, separator: str = "_", ) -> "DataFrame": """ Pivot the DataFrame. Args: values: Values to pivot. index: Index columns. columns: Columns to pivot. aggregate_function: Function to aggregate the values. maintain_order: Whether to maintain the order of the pivot. sort_columns: Whether to sort the columns. separator: Separator for the columns. Returns: Pivoted DataFrame. 
""" # TODO: handle maintain_order, sort_columns, separator return self.__constructor__( _query_compiler=self._query_compiler.pivot( values=values, index=index, columns=columns, agg=aggregate_function, ) ) def rechunk(self) -> "DataFrame": """ Rechunk the DataFrame into the given number of partitions. Returns: Rechunked DataFrame. """ return self._copy() def rename(self, mapping: dict[str, str] | callable) -> "DataFrame": """ Rename the columns of the DataFrame. Args: mapping: Mapping of old names to new names. Returns: DataFrame with the columns renamed. """ if callable(mapping): mapping = {c: mapping(c) for c in self.columns} # TODO: add a query compiler method for `rename` new_columns = {c: mapping.get(c, c) for c in self.columns} new_obj = self._copy() new_obj.columns = new_columns return new_obj def replace_column(self, index: int, column: "Series") -> "DataFrame": """ Replace the column at the given index with the new column. Args: index: Index of the column to replace. column: New column to replace with. Returns: DataFrame with the column replaced. """ self._query_compiler = self._query_compiler.drop([self.columns[index]]).insert( index, column.name, column._query_compiler, ) return self def reverse(self) -> "DataFrame": """ Reverse the DataFrame. Returns: Reversed DataFrame. """ return self.__constructor__( _query_compiler=self._query_compiler.getitem_row_array( slice(None, None, -1) ) ) def rolling(self, index_column, *, period, offset, closed, group_by, check_sorted): raise NotImplementedError("not yet") def row( self, index: int | None = None, *, by_predicate=None, named: bool = False ) -> tuple[Any] | dict[str, Any]: """ Get the row at the given index. Args: index: Index of the row to get. by_predicate: Predicate to get the row by. named: Whether to return the row as a dictionary. Returns: Row at the given index. 
""" if index is not None: if named: return dict(self.to_pandas().iloc[index]) else: return tuple(self.to_pandas().iloc[index]) else: # TODO: support expressions raise NotImplementedError("not yet") def rows(self, *, named: bool = False) -> list[tuple[Any]] | list[dict[str, Any]]: raise NotImplementedError("not yet") def rows_by_key( self, key: Any, *, named: bool = False, include_key: bool = False, unique: bool = False, ) -> dict[Any, Iterable[Any]]: raise NotImplementedError("not yet") def select(self, *exprs, **named_exprs) -> "DataFrame": # TODO: support expressions raise NotImplementedError("not yet") def select_seq(self, *exprs, **named_exprs) -> "DataFrame": # TODO: support expressions raise NotImplementedError("not yet") def set_sorted( self, column: str | Iterable[str], *more_columns: str, descending: bool = False ) -> "DataFrame": """ Set the columns to be sorted. Args: column: Column to sort by. more_columns: Additional columns to sort by. descending: Whether to sort in descending order. Returns: DataFrame with the columns sorted. """ if len(more_columns) > 0: if isinstance(column, Iterable): column = [*column, *more_columns] else: column = [column, *more_columns] if isinstance(column, str): column = [column] new_sorted_columns = [c in column for c in self.columns] obj = self._copy() obj._sorted_columns = new_sorted_columns return obj def sql(self, query: str, *, table_name: str = "self") -> "DataFrame": raise NotImplementedError("not yet") def to_series(self, index: int = 0) -> "Series": """ Convert the DataFrame at index provided to a Series. Args: index: Index of the column to convert to a Series. Returns: Series representation of the DataFrame at index provided. """ return self[self.columns[index]] def transpose( self, *, include_header: bool = False, header_name: str = "column", column_names: str | Sequence[str] | None = None, ) -> "DataFrame": """ Transpose the DataFrame. Args: include_header: Whether to include a header. 
header_name: Name of the header. column_names: Names of the columns. Returns: Transposed DataFrame. """ result = self.__constructor__(_query_compiler=self._query_compiler.transpose()) if column_names is not None: result.columns = column_names elif include_header: result.columns = [f"{header_name}_{i}" for i in range(result.width)] return result def unnest(self, columns, *more_columns) -> "DataFrame": """ Unnest the given columns. Args: columns: Columns to unnest. more_columns: Additional columns to unnest. Returns: DataFrame with the columns unnested. """ raise NotImplementedError("not yet") def unstack( self, step: int, how: str = "vertical", columns=None, fill_values: list[Any] | None = None, ): """ Unstack the DataFrame. Args: step: Step to unstack by. how: How to unstack the DataFrame. columns: Columns to unstack. fill_values: Values to fill the unstacked DataFrame with. Returns: Unstacked DataFrame. """ raise NotImplementedError("not yet") def update( self, other: "DataFrame", on: str | Sequence[str] | None = None, how: Literal["left", "inner", "full"] = "left", *, left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, include_nulls: bool = False, ) -> "DataFrame": """ Update the DataFrame with another DataFrame. Args: other: DataFrame to update with. on: Column to update on. how: How to update the DataFrame. Returns: Updated DataFrame. """ raise NotImplementedError("not yet") def upsample( self, time_column: str, *, every: str, offset: str | None = None, group_by: str | Sequence[str] | None = None, maintain_order: bool = False, ) -> "DataFrame": raise NotImplementedError("not yet") def vstack(self, other: "DataFrame", *, in_place: bool = False) -> "DataFrame": """ Stack the given DataFrame vertically. Args: other: DataFrame to stack. in_place: Whether to stack the DataFrames in place. Returns: Stacked DataFrame. 
""" if in_place: self._query_compiler = self._query_compiler.concat( axis=0, other=other._query_compiler ) return self else: return self.__constructor__( _query_compiler=self._query_compiler.concat( axis=0, other=other._query_compiler ) ) def with_columns(self, *exprs, **named_exprs) -> "DataFrame": # TODO: support expressions raise NotImplementedError("not yet") def with_columns_seq(self, *exprs, **named_exprs) -> "DataFrame": # TODO: support expressions raise NotImplementedError("not yet") def with_row_index(self, name: str = "index", offset: int = 0) -> "DataFrame": """ Add a row index to the DataFrame. Args: name: Name of the row index. offset: Offset for the row index. Returns: DataFrame with the row index added. """ if offset != 0: obj = self._copy() obj.index = obj.index + offset result = self.__constructor__( _query_compiler=self._query_compiler.reset_index(drop=False) ) result.columns = [name, *self.columns] return result with_row_count = with_row_index def map_rows( self, function: callable, return_dtype=None, *, inference_size: int = 256 ) -> "DataFrame": """ Apply the given function to the DataFrame. Args: function: Function to apply. return_dtype: Return type of the function. inference_size: Size of the inference. Returns: DataFrame with the function applied. """ return self.__constructor__( _query_compiler=self._query_compiler.apply(function, axis=1) ) def corr(self, **kwargs: Any) -> "DataFrame": """ Compute the correlation of the DataFrame. Returns: DataFrame with the correlation. """ return self.__constructor__(_query_compiler=self._query_compiler.corr(**kwargs)) def lazy(self) -> "LazyFrame": """ Convert the DataFrame to a lazy DataFrame. Returns: Lazy DataFrame. """ raise NotImplementedError("not yet") @classmethod def deserialize(cls, source) -> "DataFrame": """ Deserialize the DataFrame. Args: source: Source to deserialize. Returns: Deserialized DataFrame. 
""" return cls(polars.DataFrame.deserialize(source)) def serialize(self, file=None) -> str | None: """ Serialize the DataFrame. Args: file: File to serialize to. Returns: Serialized DataFrame. """ return polars.from_pandas(self._query_compiler.to_pandas()).serialize(file) @property def style(self): """ Create a Great Table for styling. Returns: GreatTable object. """ return self._to_polars().style def to_dict( self, *, as_series: bool = True ) -> dict[str, "Series"] | dict[str, list[Any]]: """ Convert the DataFrame to a dictionary representation. Args: as_series: Whether to convert the columns to Series. Returns: Dictionary representation of the DataFrame. """ if as_series: return {name: self[name] for name in self.columns} else: return polars.from_pandas(self._query_compiler.to_pandas()).to_dict( as_series=as_series ) def to_dicts(self) -> list[dict[str, Any]]: """ Convert the DataFrame to a list of dictionaries. Returns: List of dictionaries. """ return self._to_polars().to_dicts() def to_init_repr(self, n: int = 1000) -> str: """ Get the string representation of the DataFrame for initialization. Returns: String representation of the DataFrame for initialization. """ return self._to_polars().to_init_repr(n) def to_struct(self, name: str = "") -> "Series": """ Convert the DataFrame to a struct. Args: name: Name of the struct. Returns: Series representation of the DataFrame as a struct. """ raise NotImplementedError("not yet") def unpivot( self, on, *, index, variable_name: str | None = None, value_name: str | None = None, ) -> "DataFrame": """ Unpivot a DataFrame from wide to long format. Args: on: Columns to unpivot. index: Columns to keep. variable_name: Name of the variable column. value_name: Name of the value column. Returns: Unpivoted DataFrame. 
""" return self.__constructor__( _query_compiler=self._query_compiler.melt( on=on, index=index, var_name=variable_name, value_name=value_name, ) ) write_avro = write_clipboard = write_csv = write_database = write_delta = ( write_excel ) = write_ipc = write_ipc_stream = write_json = write_ndjson = write_parquet = ( write_parquet_partitioned ) = lambda *args, **kwargs: (_ for _ in ()).throw(NotImplementedError("not yet")) def clear(self, n: int = 0) -> "DataFrame": """ Create an empty (n=0) or null filled (n>0) DataFrame. Args: n: Number of rows to create. Returns: Empty or null filled DataFrame. """ return self.__constructor__(polars.DataFrame(schema=self.schema).clear(n=n)) def collect_schema(self) -> dict[str, str]: """ Collect the schema of the DataFrame. Returns: Dictionary of the schema. """ return self.schema def fold(self, operation: callable) -> "Series": """ Fold the DataFrame. Args: operation: Operation to fold the DataFrame with. Returns: Series with the folded DataFrame. """ raise NotImplementedError("not yet") def hash_rows( self, seed: int = 0, seed_1: int | None = None, seed_2: int | None = None, seed_3: int | None = None, ) -> "Series": raise NotImplementedError("not yet") ================================================ FILE: modin/polars/groupby.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Implement GroupBy public API as pandas does.""" from typing import TYPE_CHECKING if TYPE_CHECKING: from modin.polars import DataFrame class GroupBy: def __init__( self, df: "DataFrame", *by, maintain_order: bool = False, **named_by, ) -> None: self.df = df if len(by) == 1: self.by = by[0] else: if all(isinstance(b, str) and b in self.df.columns for b in by): self.by = self.df[list(by)]._query_compiler elif all(isinstance(b, type(self._df._query_compiler)) for b in by): self.by = by else: raise NotImplementedError("not yet") self.named_by = named_by self.maintain_order = maintain_order def agg(self, *aggs, **named_aggs): raise NotImplementedError("not yet") def all(self): raise NotImplementedError("not yet") def map_groups(self, function) -> "DataFrame": raise NotImplementedError("not yet") apply = map_groups def count(self): return self.len(name="count") def first(self) -> "DataFrame": return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_first( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, ), agg_args=(), agg_kwargs={}, drop=False, ).reset_index(drop=False) ) def head(self, n: int = 5): return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_head( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs=dict(n=n), drop=False, ) ) def last(self) -> "DataFrame": return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_last( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, 
), agg_args=(), agg_kwargs={}, drop=False, ).reset_index(drop=False) ) def len(self, name: str | None = None) -> "DataFrame": if name is None: name = "len" result = self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_size( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs={}, drop=False, ) ) result._query_compiler.columns = [ c if c != "size" else name for c in result.columns ] return result def max(self) -> "DataFrame": return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_max( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs={}, drop=False, ) ) def mean(self) -> "DataFrame": # TODO: Non numeric columns are dropped, but in Polars they are converted to null return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_mean( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, ), agg_args=(), agg_kwargs=dict(numeric_only=True), drop=False, ).reset_index(drop=False) ) def median(self) -> "DataFrame": # TODO: Non numeric columns are dropped, but in Polars they are converted to null return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_median( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, ), agg_args=(), agg_kwargs=dict(numeric_only=True), drop=False, ).reset_index(drop=False) ) def min(self) -> "DataFrame": return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_min( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs={}, drop=False, ) ) def n_unique(self) -> "DataFrame": return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_nunique( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs={}, drop=False, ) ) def quantile(self, 
quantile: float, interpolation="nearest") -> "DataFrame": # TODO: Non numeric columns are dropped, but in Polars they are converted to null # TODO: interpolation types not yet supported return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_quantile( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, ), agg_args=(), agg_kwargs=dict(numeric_only=True, q=quantile), drop=False, ).reset_index(drop=False) ) def sum(self) -> "DataFrame": # TODO: Non numeric columns are dropped, but in Polars they are converted to null return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_sum( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=True, ), agg_args=(), agg_kwargs=dict(numeric_only=True), drop=False, ).reset_index(drop=False) ) def tail(self, n: int = 5): return self.df.__constructor__( _query_compiler=self.df._query_compiler.groupby_tail( self.by, axis=0, groupby_kwargs=dict( sort=not self.maintain_order, as_index=False, ), agg_args=(), agg_kwargs=dict(n=n), drop=False, ) ) ================================================ FILE: modin/polars/lazyframe.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
from modin.polars.base import BasePolarsDataset


class LazyFrame(BasePolarsDataset):
    """
    Stub for the lazy frame implementation.

    The class body is empty: it subclasses ``BasePolarsDataset`` and adds
    nothing of its own yet.
    """

    pass


================================================
FILE: modin/polars/series.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Module houses `Series` class, that is distributed version of `polars.Series`.""" from __future__ import annotations from typing import TYPE_CHECKING, Any, Sequence import numpy as np import pandas import polars from polars._utils.various import no_default from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage from modin.pandas import Series as ModinPandasSeries from modin.pandas.io import from_pandas from modin.polars.base import BasePolarsDataset if TYPE_CHECKING: from numpy.typing import ArrayLike from polars import PolarsDataType from modin.polars import DataFrame class Series(BasePolarsDataset): def __init__( self, name: str | "ArrayLike" | None = None, values: "ArrayLike" | None = None, dtype: "PolarsDataType | None" = None, *, strict: "bool" = True, nan_to_null: "bool" = False, dtype_if_empty: "PolarsDataType" = polars.Null, _query_compiler: BaseQueryCompiler | None = None, ) -> None: if _query_compiler is None: if isinstance(values, ModinPandasSeries): self._query_compiler = values._query_compiler.copy() else: self._query_compiler: BaseQueryCompiler = from_pandas( polars.Series( name=name, values=values, dtype=dtype, strict=strict, nan_to_null=nan_to_null, dtype_if_empty=dtype_if_empty, ) .to_pandas() .to_frame() )._query_compiler else: self._query_compiler: BaseQueryCompiler = _query_compiler def __repr__(self): return repr( polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)) ) _sorted = False _descending = None def to_pandas(self) -> ModinPandasSeries: return ModinPandasSeries(query_compiler=self._query_compiler) def arg_max(self) -> int: """ Get the index of the maximum value. Returns: Index of the maximum value. """ return self.to_pandas().argmax() def arg_min(self) -> int: """ Get the index of the minimum value. Returns: Index of the minimum value. """ return self.to_pandas().argmin() def implode(self) -> "Series": """ Aggregate values into a list. 
Returns: Imploded Series. """ raise NotImplementedError("not yet") def max(self) -> Any: """ Get the maximum value. Returns: Maximum value. """ return self.to_pandas().max() def min(self) -> Any: """ Get the minimum value. Returns: Minimum value. """ return self.to_pandas().min() def mean(self) -> Any: """ Get the mean value. Returns: Mean value. """ return self.to_pandas().mean() def median(self) -> Any: """ Get the median value. Returns: Median value. """ return self.to_pandas().median() def mode(self) -> Any: """ Get the mode value. Returns: Mode value. """ return self.to_pandas().mode() def nan_max(self) -> Any: """ Get the maximum value, ignoring NaN values. Returns: Maximum value. """ return self.to_pandas().max(skipna=True) def nan_min(self) -> Any: """ Get the minimum value, ignoring NaN values. Returns: Minimum value. """ return self.to_pandas().min(skipna=True) def product(self) -> Any: """ Get the product of all values. Returns: Product of all values. """ return self.to_pandas().product() def quantile(self, quantile: float, interpolation: str = "nearest") -> float | None: """ Get the quantile value. Args: quantile: Quantile to calculate. interpolation: Interpolation method. Returns: Quantile value. """ return self.to_pandas().quantile(quantile, interpolation=interpolation) def std(self, ddof: int = 1) -> float: """ Get the standard deviation. Args: ddof: Delta Degrees of Freedom. Returns: Standard deviation. """ return self.to_pandas().std(ddof=ddof) def sum(self) -> Any: """ Get the sum of all values. Returns: Sum of all values. """ return self.to_pandas().sum() def var(self, ddof: int = 1) -> float: """ Get the variance. Args: ddof: Delta Degrees of Freedom. Returns: Variance. """ return self.to_pandas().var(ddof=ddof) @property def arr(self) -> polars.series.array.ArrayNameSpace: """ Get the underlying array. Returns: Underlying array. 
""" return polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)).arr @property def dtype(self) -> polars.datatypes.DataType: """ Get the data type. Returns: Data type. """ return polars.from_pandas( pandas.Series().astype(self._query_compiler.dtypes.iloc[0]) ).dtype @property def name(self) -> str: """ Get the name. Returns: Name. """ return self._query_compiler.columns[0] @property def shape(self) -> tuple[int]: """ Get the shape. Returns: Shape. """ return (len(self._query_compiler.index),) flags = [] @property def bin(self): raise NotImplementedError("not yet") def all(self) -> bool: """ Check if all values are True. Returns: True if all values are True, False otherwise. """ return self.to_pandas().all() def any(self) -> bool: """ Check if any value is True. Returns: True if any value is True, False otherwise. """ return self.to_pandas().any() def not_(self) -> "Series": """ Negate the values. Returns: Negated Series. """ return self.__constructor__(_query_compiler=self._query_compiler.invert()) @property def cat(self): raise NotImplementedError("not yet") def abs(self) -> "Series": """ Get the absolute values. Returns: Absolute values Series. """ return self.__constructor__(_query_compiler=self._query_compiler.abs()) def arccos(self) -> "Series": """ Get the arc cosine values. Returns: Arc cosine values Series. """ raise NotImplementedError("not yet") def arccosh(self) -> "Series": """ Get the hyperbolic arc cosine values. Returns: Hyperbolic arc cosine values Series. """ raise NotImplementedError("not yet") def arcsin(self) -> "Series": """ Get the arc sine values. Returns: Arc sine values Series. """ raise NotImplementedError("not yet") def arcsinh(self) -> "Series": """ Get the hyperbolic arc sine values. Returns: Hyperbolic arc sine values Series. """ raise NotImplementedError("not yet") def arctan(self) -> "Series": """ Get the arc tangent values. Returns: Arc tangent values Series. 
""" raise NotImplementedError("not yet") def arctanh(self) -> "Series": """ Get the hyperbolic arc tangent values. Returns: Hyperbolic arc tangent values Series. """ raise NotImplementedError("not yet") def arg_true(self) -> "Series": """ Get the index of the first True value. Returns: Index of the first True value. """ return self.__constructor__( _query_compiler=self._query_compiler.reset_index(drop=False) .getitem_array(self._query_compiler) .getitem_column_array(0, numeric=True) ).rename(self.name) def arg_unique(self) -> "Series": """ Get the index of the first unique value. Returns: Index of the first unique value. """ raise NotImplementedError("not yet") def cbrt(self) -> "Series": """ Get the cube root values. Returns: Cube root values Series. """ raise NotImplementedError("not yet") def cos(self) -> "Series": """ Get the cosine values. Returns: Cosine values Series. """ raise NotImplementedError("not yet") def cosh(self) -> "Series": """ Get the hyperbolic cosine values. Returns: Hyperbolic cosine values Series. """ raise NotImplementedError("not yet") def cot(self) -> "Series": """ Get the cotangent values. Returns: Cotangent values Series. """ raise NotImplementedError("not yet") def cum_count(self) -> "Series": """ Get the cumulative count values. Returns: Cumulative count values Series. """ return self.__constructor__( _query_compiler=self._query_compiler.isna().cumsum() ) def cum_max(self) -> "Series": """ Get the cumulative maximum values. Returns: Cumulative maximum values Series. """ return self.__constructor__(_query_compiler=self._query_compiler.cummax()) def cum_min(self) -> "Series": """ Get the cumulative minimum values. Returns: Cumulative minimum values Series. """ return self.__constructor__(_query_compiler=self._query_compiler.cummin()) def cum_prod(self) -> "Series": """ Get the cumulative product values. Returns: Cumulative product values Series. 
""" return self.__constructor__(_query_compiler=self._query_compiler.cumprod()) def cum_sum(self) -> "Series": """ Get the cumulative sum values. Returns: Cumulative sum values Series. """ return self.__constructor__(_query_compiler=self._query_compiler.cumsum()) def cumulative_eval( self, expr, min_periods: int = 1, *, parallel: bool = False ) -> "Series": """ Get the cumulative evaluation values. Args: expr: Expression to evaluate. min_periods: Minimum number of periods. Returns: Cumulative evaluation values Series. """ raise NotImplementedError("not yet") def diff(self, n: int = 1, null_behavior: str = "ignore") -> "Series": """ Calculate the first discrete difference between shifted items. Args: n: Number of periods to shift. null_behavior: Null behavior. Returns: Difference values Series. """ raise NotImplementedError("not yet") def dot(self, other) -> int | float | None: """ Calculate the dot product. Args: other: Other Series. Returns: Dot product. """ if isinstance(other, Series): other = other.to_pandas() return self.to_pandas().dot(other) def entropy( self, base: float = 2.718281828459045, *, normalize: bool = False ) -> float: """ Calculate the entropy. Args: base: Logarithm base. normalize: Normalize the entropy. Returns: Entropy. """ raise NotImplementedError("not yet") def ewm_mean( self, com: int | None = None, span: int | None = None, half_life: int | None = None, alpha: float | None = None, *, adjust: bool = True, min_periods: int = 1, ignore_nulls: bool | None = None, ) -> "Series": """ Calculate the exponential weighted mean. Args: com: Center of mass. span: Span. Returns: Exponential weighted mean Series. """ return self.__constructor__( self.to_pandas() .ewm( com=com, span=span, halflife=half_life, alpha=alpha, adjust=adjust, min_periods=min_periods, ignore_na=ignore_nulls, ) .mean() ) def ewm_mean_by(self, by, *, half_life: int | None = None) -> "Series": """ Calculate the exponential weighted mean by group. Args: by: Grouping Series. 
Returns: Exponential weighted mean Series. """ raise NotImplementedError("not yet") def ewm_std( self, com: int | None = None, span: int | None = None, half_life: int | None = None, alpha: float | None = None, *, adjust: bool = True, min_periods: int = 1, ignore_nulls: bool | None = None, ) -> "Series": """ Calculate the exponential weighted standard deviation. Args: com: Center of mass. span: Span. Returns: Exponential weighted standard deviation Series. """ return self.__constructor__( self.to_pandas() .ewm( com=com, span=span, halflife=half_life, alpha=alpha, adjust=adjust, min_periods=min_periods, ignore_na=ignore_nulls, ) .std() ) def ewm_var( self, com: int | None = None, span: int | None = None, half_life: int | None = None, alpha: float | None = None, *, adjust: bool = True, min_periods: int = 1, ignore_nulls: bool | None = None, ) -> "Series": """ Calculate the exponential weighted variance. Args: com: Center of mass. span: Span. Returns: Exponential weighted variance Series. """ return self.__constructor__( self.to_pandas() .ewm( com=com, span=span, halflife=half_life, alpha=alpha, adjust=adjust, min_periods=min_periods, ignore_na=ignore_nulls, ) .var() ) def exp(self) -> "Series": """ Calculate the exponential values. Returns: Exponential values Series. """ return self.__constructor__(self.to_pandas().exp()) def hash( self, seed: int = 0, seed_1: int | None = None, seed_2: int | None = None, seed_3: int | None = None, ) -> "Series": """ Calculate the hash values. Args: seed: Seed. seed_1: Seed 1. seed_2: Seed 2. seed_3: Seed 3. Returns: Hash values Series. """ raise NotImplementedError("not yet") def hist( self, bins: list[float] | None = None, *, bin_count: int | None = None, include_category: bool = True, include_breakpoint: bool = True, ) -> "Series": """ Calculate the histogram. Args: bins: Bins. bin_count: Bin count. Returns: Histogram Series. 
""" raise NotImplementedError("not yet") def is_between(self, lower_bound, upper_bound, closed: str = "both") -> "Series": """ Check if values are between the bounds. Args: lower_bound: Lower bound. upper_bound: Upper bound. closed: Closed bounds. Returns: Boolean Series. """ raise NotImplementedError("not yet") def kurtosis(self, *, fisher: bool = True, bias: bool = True) -> float | None: """ Calculate the kurtosis. Args: fisher: Fisher method. bias: Bias method. Returns: Kurtosis. """ return self.to_pandas().kurtosis(fisher=fisher, bias=bias) def log(self, base: float = 2.718281828459045) -> "Series": """ Calculate the logarithm values. Args: base: Logarithm base. Returns: Logarithm values Series. """ raise NotImplementedError("not yet") def log10(self) -> "Series": """ Calculate the base 10 logarithm values. Returns: Base 10 logarithm values Series. """ return self.log(10) def log1p(self) -> "Series": """ Calculate the natural logarithm of 1 plus the values. Returns: Natural logarithm of 1 plus the values Series. """ raise NotImplementedError("not yet") def replace( self, mapping: dict[Any, Any], *, default: Any = None, return_dtype=None, ) -> "Series": """ Map values to other values. Args: mapping: Mapping. Returns: Mapped Series. """ return self.__constructor__( self.to_pandas().apply(lambda x: mapping.get(x, default)) ) def pct_change(self, n: int = 1) -> "Series": """ Calculate the percentage change. Args: n: Number of periods to shift. Returns: Percentage change Series. """ return self.__constructor__(self.to_pandas().pct_change(n)) def peak_max(self) -> "Series": """ Get the peak maximum values. Returns: Peak maximum values Series. """ return self.__eq__(self.max()) def peak_min(self) -> "Series": """ Get the peak minimum values. Returns: Peak minimum values Series. """ return self.__eq__(self.min()) def rank( self, method: str = "average", *, descending: bool = False, seed: int | None = None, ) -> "Series": """ Calculate the rank. 
Args: method: Rank method. Returns: Rank Series. """ # TODO: support seed if method not in ["average", "min", "max", "first", "dense"]: raise ValueError(f"method {method} not supported") return self.__constructor__( self.to_pandas().rank(method=method, ascending=not descending) ) def rolling_map( self, function: callable, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .apply(function) ) def rolling_max( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling maximum function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .max() ) def rolling_mean( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling mean function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .mean() ) def rolling_median( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling median function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. 
""" if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .median() ) def rolling_min( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling minimum function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .min() ) def rolling_quantile( self, window_size: int, quantile: float, interpolation: str = "nearest", weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling quantile function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .quantile(quantile, interpolation=interpolation) ) def rolling_skew(self, window_size: int, *, bias: bool = False) -> "Series": """ Apply a rolling skewness function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ return self.__constructor__(self.to_pandas().rolling(window=window_size).skew()) def rolling_std( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ddof: int = 1, ) -> "Series": """ Apply a rolling standard deviation function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. 
""" if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .std(ddof=ddof) ) def rolling_sum( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ) -> "Series": """ Apply a rolling sum function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .sum() ) def rolling_var( self, window_size: int, weights: list[float] | None = None, min_periods: int = 1, *, center: bool = False, ddof: int = 1, ) -> "Series": """ Apply a rolling variance function. Args: function: Function to apply. window_size: Window size. Returns: Applied Series. """ if weights is not None: raise NotImplementedError("not yet") return self.__constructor__( self.to_pandas() .rolling(window=window_size, min_periods=min_periods, center=center) .var(ddof=ddof) ) def search_sorted(self, element, side: str = "any") -> int | "Series": """ Search for the element in the sorted Series. Args: element: Element to search. side: Side to search. Returns: Index of the element. """ if side == "any": side = "left" return self.__constructor__(self.to_pandas().searchsorted(element, side=side)) def sign(self) -> "Series": """ Get the sign values. Returns: Sign values Series. """ return self.__lt__(0).__mul__(-1).__add__(self.__gt__(0)) def sin(self) -> "Series": """ Get the sine values. Returns: Sine values Series. """ raise NotImplementedError("not yet") def sinh(self) -> "Series": """ Get the hyperbolic sine values. Returns: Hyperbolic sine values Series. """ raise NotImplementedError("not yet") def skew(self, *, bias: bool = True) -> float: """ Calculate the skewness. Args: bias: Bias method. Returns: Skewness. 
""" return self.to_pandas().skew() def sqrt(self) -> "Series": """ Get the square root values. Returns: Square root values Series. """ return self.__constructor__(self.to_pandas().sqrt()) def tan(self) -> "Series": """ Get the tangent values. Returns: Tangent values Series. """ raise NotImplementedError("not yet") def tanh(self) -> "Series": """ Get the hyperbolic tangent values. Returns: Hyperbolic tangent values Series. """ raise NotImplementedError("not yet") def chunk_lengths(self) -> list[int]: """ Get the chunk lengths. Returns: Chunk lengths. """ raise NotImplementedError("not yet") def describe( self, percentiles: Sequence[float] | float | None = (0.25, 0.5, 0.75), interpolation: str = "nearest", ): """ Generate descriptive statistics. Args: percentiles: Percentiles to calculate. Returns: Descriptive statistics. """ return self.to_pandas().describe(percentiles=percentiles) def estimated_size(self) -> int: """ Get the estimated size. Returns: Estimated size. """ return self.to_pandas().memory_usage(index=False) def has_nulls(self) -> bool: """ Check if there are null values. Returns: True if there are null values, False otherwise. """ return self.to_pandas().isnull().any() has_validity = has_nulls def is_finite(self) -> "Series": """ Check if the values are finite. Returns: True if the values are finite, False otherwise. """ return self.__ne__(np.inf) def is_first_distinct(self) -> "Series": """ Check if the values are the first occurrence. Returns: True if the values are the first occurrence, False otherwise. """ raise NotImplementedError("not yet") def is_in(self, other: "Series" | list[Any]) -> "Series": """ Check if the values are in the other Series. Args: other: Other Series. Returns: True if the values are in the other Series, False otherwise. """ return self.__constructor__(self.to_pandas().isin(other)) def is_infinite(self) -> "Series": """ Check if the values are infinite. Returns: True if the values are infinite, False otherwise. 
""" return self.__eq__(np.inf) def is_last_distinct(self) -> "Series": """ Check if the values are the last occurrence. Returns: True if the values are the last occurrence, False otherwise. """ raise NotImplementedError("not yet") def is_nan(self) -> "Series": """ Check if the values are NaN. Returns: True if the values are NaN, False otherwise. """ return self.__constructor__(_query_compiler=self._query_compiler.isna()) def is_not_nan(self) -> "Series": """ Check if the values are not NaN. Returns: True if the values are not NaN, False otherwise. """ return self.__constructor__(_query_compiler=self._query_compiler.notna()) def is_not_null(self) -> "Series": """ Check if the values are not null. Returns: True if the values are not null, False otherwise. """ return self.is_not_nan() def is_null(self) -> "Series": """ Check if the values are null. Returns: True if the values are null, False otherwise. """ return self.is_nan() def is_sorted( self, *, descending: bool = False, nulls_last: bool = False, ) -> bool: """ Check if the values are sorted. Args: descending: Descending order. Returns: True if the values are sorted, False otherwise. """ return ( self.to_pandas().is_monotonic_increasing if not descending else self.to_pandas().is_monotonic_decreasing ) def len(self) -> int: """ Get the length of the values. Returns: Length of the values Series. """ return len(self.to_pandas()) def lower_bound(self) -> "Series": """ Get the lower bound values. Returns: Lower bound values Series. """ raise NotImplementedError("not yet") def null_count(self) -> int: """ Get the number of null values. Returns: Number of null values. """ return self.to_pandas().isnull().sum() def unique_counts(self) -> "Series": """ Get the unique counts. Returns: Unique counts. """ return self.__constructor__(values=self.to_pandas().value_counts()) def upper_bound(self) -> "Series": """ Get the upper bound values. Returns: Upper bound values Series. 
""" raise NotImplementedError("not yet") def value_counts( self, *, sort: bool = False, parallel: bool = False, name: str = "count" ) -> "DataFrame": """ Get the value counts. Returns: Value counts. """ from modin.polars import DataFrame return DataFrame( self.to_pandas().value_counts(sort=sort).reset_index(drop=False, names=name) ) def to_frame(self, name: str | None = None) -> "DataFrame": """ Convert the Series to a DataFrame. Args: name: Name of the Series. Returns: DataFrame representation of the Series. """ from modin.polars import DataFrame return DataFrame(_query_compiler=self._query_compiler).rename({self.name: name}) def to_init_repr(self, n: int = 1000) -> str: """ Convert Series to instantiatable string representation. Args: n: First n elements. Returns: Instantiatable string representation. """ return polars.from_pandas( self.slice(0, n)._query_compiler.to_pandas() ).to_init_repr() @property def list(self): # TODO: implement list object # https://docs.pola.rs/api/python/stable/reference/series/list.html raise NotImplementedError("not yet") def alias(self, name: str) -> "Series": """ Rename the Series. Args: name: New name. Returns: Renamed Series. """ return self.to_frame(name).to_series() def append(self, other: "Series") -> "Series": """ Append another Series. Args: other: Other Series. Returns: Appended Series. """ return self.__constructor__( _query_compiler=self._query_compiler.concat(0, other._query_compiler) ) def arg_sort( self, *, descending: bool = False, nulls_last: bool = False ) -> "Series": """ Get the sorted indices. Args: descending: Descending order. Returns: Sorted indices Series. """ # TODO: implement nulls_last result = self.__constructor__(values=self.to_pandas().argsort()) if descending: return result.reverse() else: return result def ceil(self) -> "Series": """ Get the ceiling values. Returns: Ceiling values Series. 
""" raise NotImplementedError("not yet") def clear(self, n: int = 0) -> "Series": """ Create an empty copy of the current Series, with zero to ‘n’ elements. Args: n: Number of elements. Returns: Series will n nulls. """ raise NotImplementedError("not yet") def clip(self, lower_bound=None, upper_bound=None) -> "Series": """ Clip the values. Args: lower_bound: Lower bound. upper_bound: Upper bound. Returns: Clipped values Series. """ return self.__constructor__( values=self.to_pandas().clip(lower_bound, upper_bound) ) def cut( self, breaks: Sequence[float], *, labels: list[str] | None = None, break_point_label: str = "breakpoint", left_closed: bool = False, include_breaks: bool = False, as_series: bool = True, ) -> "BasePolarsDataset": raise NotImplementedError("not yet") def extend_constant(self, value) -> "Series": """ Extend the Series with a constant value. Args: value: Constant value. Returns: Extended Series. """ raise NotImplementedError("not yet") def floor(self) -> "BasePolarsDataset": return self.__floordiv__(1) def gather(self, indices) -> "Series": """ Gather values by indices. Args: indices: Indices. Returns: Gathered Series. """ return self.__constructor__( values=self.to_pandas().iloc[ ( indices._query_compiler if hasattr(indices, "_query_compiler") else indices ) ] ) def interpolate_by(self, by) -> "Series": """ Interpolate values by group. Args: by: Grouping Series. Returns: Interpolated Series. """ raise NotImplementedError("not yet") def item(self, index: int | None = None) -> Any: """ Get the item at the index. Args: index: Index. Returns: Item at the index. """ return self.to_pandas().iloc[index] def new_from_index(self, index: int, length: int) -> "Series": """ Create a new Series from the index. Args: index: Index. length: Length. Returns: New Series. 
""" raise NotImplementedError("not yet") def qcut( self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = None, left_closed: bool = False, allow_duplicates: bool = False, include_breaks: bool = False, break_point_label: str = "breakpoint", category_labels: str = "category", as_series: bool = True, ) -> "Series" | "DataFrame": """ Bin continuous values into discrete categories based on quantiles. Args: quantiles: Number of quantiles or sequence of quantiles. labels: Labels for the resulting bins. left_closed: Whether the intervals are left-closed. allow_duplicates: Whether to allow duplicate intervals. include_breaks: Whether to include the breaks in the result. break_point_label: Label for the break points. category_labels: Label for the categories. as_series: Whether to return a Series. Returns: Binned Series. """ raise NotImplementedError("not yet") def rechunk(self, *, in_place: bool = False) -> "Series": """ Rechunk the Series. Args: in_place: In-place operation. Returns: Rechunked Series. """ raise NotImplementedError("not yet") rename = alias def reshape(self, dimensions, nested_type) -> "Series": """ Reshape the Series. Args: dimensions: Dimensions. nested_type: Nested type. Returns: Reshaped Series. """ raise NotImplementedError("not yet") def reverse(self) -> "Series": """ Reverse the Series. Returns: Reversed Series. """ return self.__constructor__(values=self.to_pandas().iloc[::-1]) def rle(self) -> "Series": """ Run-length encode the Series. Returns: Run-length encoded Series. """ raise NotImplementedError("not yet") def rle_id(self) -> "Series": """ Run-length encode the Series with IDs. Returns: Run-length encoded Series with IDs. """ raise NotImplementedError("not yet") def round(self, decimals: int = 0) -> "Series": """ Round the values. Args: decimals: Number of decimals. Returns: Rounded values Series. 
""" return self.__constructor__(values=self.to_pandas().round(decimals)) def round_sig_figs(self, digits: int) -> "Series": """ Round the values to significant figures. Args: digits: Number of significant figures. Returns: Rounded values Series. """ raise NotImplementedError("not yet") def scatter(self, indices, values) -> "Series": """ Scatter values by indices. Args: indices: Indices. values: Values. Returns: Scattered Series. """ raise NotImplementedError("not yet") def set(self, filter: "Series", value: int | float | str | bool | None) -> "Series": """ Set values by filter. Args: filter: Filter. value: Value. Returns: Set Series. """ raise NotImplementedError("not yet") def shrink_dtype(self) -> "Series": """ Shrink the data type. Returns: Shrunk Series. """ raise NotImplementedError("not yet") def shuffle(self, seed: int | None = None) -> "Series": """ Shuffle the Series. Args: seed: Seed. Returns: Shuffled Series. """ raise NotImplementedError("not yet") def zip_with(self, mask: "Series", other: "Series") -> "Series": """ Zip the Series with another Series. Args: mask: Mask Series. other: Other Series. Returns: Zipped Series. """ return self.__constructor__( _query_compiler=self._query_compiler.where( mask._query_compiler, other._query_compiler ) ) def map_elements( self, function: callable, return_dtype=None, *, skip_nulls: bool = True, ) -> "Series": """ Map the elements. Args: function: Function to apply. Returns: Mapped Series. """ if return_dtype is not None or skip_nulls is False: ErrorMessage.warn( "`return_dtype` and `skip_nulls=False` are not supported yet" ) return self.__constructor__(values=self.to_pandas().apply(function)) def reinterpret(self, *, signed: bool = True) -> "Series": """ Reinterpret the data type of the series as signed or unsigned. Args: signed: If True, reinterpret as signed, otherwise as unsigned. Returns: Reinterpreted Series. 
""" raise NotImplementedError("not yet") def set_sorted(self, *, descending: bool = False) -> "Series": """ Set the Series as sorted. Args: descending: Descending order. Returns: Sorted Series. """ self._sorted = True self._descending = descending return self def to_physical(self) -> "Series": """ Convert the Series to physical. Returns: Physical Series. """ raise NotImplementedError("not yet") def get_chunks(self) -> list["Series"]: """ Get the chunks. Returns: Chunks. """ raise NotImplementedError("not yet") @property def str(self): # TODO: implement str object # https://docs.pola.rs/api/python/stable/reference/series/string.html raise NotImplementedError("not yet") @property def struct(self): # TODO: implement struct object # https://docs.pola.rs/api/python/stable/reference/series/struct.html raise NotImplementedError("not yet") @property def dt(self): # TODO: implement dt object # https://docs.pola.rs/api/python/stable/reference/series/temporal.html raise NotImplementedError("not yet") def __len__(self) -> int: """ Get the length of the Series. """ return self.len() def __matmul__(self, other) -> "Series": """ Matrix multiplication. Args: other: Other Series. Returns: Matrix multiplication Series. """ raise NotImplementedError("not yet") def __radd__(self, other) -> "Series": """ Right addition. Args: other: Other Series. Returns: Added Series. """ return self.__constructor__( _query_compiler=self._query_compiler.radd(other, axis=0) ) def __rand__(self, other) -> "Series": """ Right and. Args: other: Other Series. Returns: And Series. """ return self.__constructor__( _query_compiler=self._query_compiler.__rand__(other, axis=0) ) def __rfloordiv__(self, other) -> "Series": """ Right floor division. Args: other: Other Series. Returns: Floored Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rfloordiv(other, axis=0) ) def __rmatmul__(self, other) -> "Series": """ Right matrix multiplication. Args: other: Other Series. 
Returns: Matrix multiplication Series. """ raise NotImplementedError("not yet") def __rmod__(self, other) -> "Series": """ Right modulo. Args: other: Other Series. Returns: Modulo Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rmod(other, axis=0) ) def __rmul__(self, other) -> "Series": """ Right multiplication. Args: other: Other Series. Returns: Multiplied Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rmul(other, axis=0) ) def __ror__(self, other) -> "Series": """ Right or. Args: other: Other Series. Returns: Or Series. """ return self.__constructor__( _query_compiler=self._query_compiler.__ror__(other, axis=0) ) def __rpow__(self, other) -> "Series": """ Right power. Args: other: Other Series. Returns: Powered Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rpow(other, axis=0) ) def __rsub__(self, other) -> "Series": """ Right subtraction. Args: other: Other Series. Returns: Subtracted Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rsub(other, axis=0) ) def __rtruediv__(self, other) -> "Series": """ Right true division. Args: other: Other Series. Returns: Divided Series. """ return self.__constructor__( _query_compiler=self._query_compiler.rtruediv(other, axis=0) ) def __rxor__(self, other) -> "Series": """ Right xor. Args: other: Other Series. Returns: Xor Series. """ return self.__constructor__( _query_compiler=self._query_compiler.__rxor__(other, axis=0) ) def eq(self, other) -> "Series": """ Check if the values are equal to the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.eq(other._query_compiler) ) def eq_missing(self, other) -> "Series": """ Check if the values are equal to the other Series, including missing values. Args: other: Other Series. Returns: Boolean Series. 
""" raise NotImplementedError("not yet") def ge(self, other) -> "Series": """ Check if the values are greater than or equal to the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.ge(other._query_compiler) ) def gt(self, other) -> "Series": """ Check if the values are greater than the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.gt(other._query_compiler) ) def le(self, other) -> "Series": """ Check if the values are less than or equal to the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.le(other._query_compiler) ) def lt(self, other) -> "Series": """ Check if the values are less than the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.lt(other._query_compiler) ) def n_unique(self) -> int: """ Get the number of unique values. Returns: Number of unique values. """ return self._query_compiler.nunique().to_pandas().squeeze(axis=None) def ne(self, other) -> "Series": """ Check if the values are not equal to the other Series. Args: other: Other Series. Returns: Boolean Series. """ return self.__constructor__( _query_compiler=self._query_compiler.ne(other._query_compiler) ) def ne_missing(self, other) -> "Series": """ Check if the values are not equal to the other Series, including missing values. Args: other: Other Series. Returns: Boolean Series. """ raise NotImplementedError("not yet") def pow(self, exponent) -> "Series": """ Raise the values to the power of the exponent. Args: exponent: Exponent. Returns: Powered Series. 
""" return self.__constructor__( _query_compiler=self._query_compiler.pow(exponent, axis=0) ) def replace_strict( self, old, new=no_default, *, default=no_default, return_dtype=None ) -> "Series": """ Replace values strictly. Args: old: Old values. new: New values. default: Default value. Returns: Replaced Series. """ raise NotImplementedError("not yet") def to_list(self) -> list: """ Convert the Series to a list. Returns: List representation of the Series. """ return self._to_polars().tolist() def drop_nans(self) -> "Series": """ Drop NaN values. Returns: Series without NaN values. """ return self.__constructor__( _query_compiler=self._query_compiler.dropna(how="any") ) ================================================ FILE: modin/tests/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/config/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


================================================
FILE: modin/tests/config/docs_module/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from .classes import BasePandasDataset, DataFrame, Series
from .functions import read_csv

__all__ = ["BasePandasDataset", "DataFrame", "Series", "read_csv"]


================================================
FILE: modin/tests/config/docs_module/classes.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

# NOTE: the docstrings in this module are test data. The DocModule tests in
# modin/tests/config/test_envvars.py compare them verbatim, so they must not be
# reworded (including the "BasePandasDataSet" capitalization).


class DataFrame:
    def apply(self):
        """This is a test of the documentation module for DataFrame."""
        return


class Series:
    def isna(self):
        """This is a test of the documentation module for Series."""
        return


class BasePandasDataset:
    """This is a test of the documentation module for BasePandasDataSet."""

    # The stubs below take no ``self``; only their ``__doc__`` is read by the
    # tests visible in test_envvars.py.
    def apply():
        """This is a test of the documentation module for BasePandasDataSet.apply."""
        return

    def astype():
        """This is a test of the documentation module for BasePandasDataSet.astype."""


================================================
FILE: modin/tests/config/docs_module/functions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


# The docstring below is test data compared verbatim by the DocModule tests.
def read_csv():
    """Test override for functions on the module."""
    return


================================================
FILE: modin/tests/config/docs_module_with_just_base/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from .classes import BasePandasDataset

__all__ = ["BasePandasDataset"]


================================================
FILE: modin/tests/config/docs_module_with_just_base/classes.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


# Documentation module that defines only the base class. The docstring is test
# data compared verbatim by the DocModule tests in test_envvars.py.
class BasePandasDataset:
    def astype():
        """This is a test of the documentation module for BasePandasDataSet.astype."""


================================================
FILE: modin/tests/config/test_envvars.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import itertools
import os
import re
import sys
import unittest.mock as mock
from unittest.mock import Mock, patch

import pandas
import pytest
from pytest import param

import modin.config as cfg
import modin.pandas as pd
from modin.config.envvars import _check_vars
from modin.config.pubsub import _UNSET, ExactStr, ValueSource
from modin.pandas.base import BasePandasDataset
from modin.tests.pandas.utils import switch_execution

################# WARNING #####################################################
# Test cases in this file affect global state, e.g. by setting environment
# variables. The test cases may produce unexpected results when repeated or run
# out of the order they are defined in. Be careful when running the tests
# locally or when adding new test cases. In particular, note:
# - test_ray_cluster_resources() causes us to permanently attach the
#   `_initialize_engine` subscriber to Engine: https://github.com/modin-project/modin/blob/6252ebde19935bd1f6a6850209bf8a1f5e5ecfb7/modin/core/execution/dispatching/factories/dispatcher.py#L115
#   Changing to any engine after that test runs will cause Modin to try to
#   initialize the engine.
# - In CI, we only run these tests with Ray execution, in the
#   `test-internal` job.
# - test_wrong_values() permanently messes up some config variables. For more
#   details see https://github.com/modin-project/modin/issues/7454
################# WARNING ######################

UNIDIST_SKIP_REASON = (
    "Switching to unidist causes an error since we have to execute unidist "
    + "tests differently, with `mpiexec` instead of just `pytest`"
)


@pytest.fixture
def clear_backend_execution_and_storage_format(monkeypatch):
    """
    Reset environment variables and config classes for backend, execution, and storage format.

    For each of ``Backend``, ``StorageFormat`` and ``Engine`` the cached value is
    set to ``_UNSET``, the value source to ``ValueSource.DEFAULT``, and the
    corresponding environment variable is removed if present.

    Parameters
    ----------
    monkeypatch : pytest.MonkeyPatch
        Pytest fixture through which every change is made, so that pytest
        reverts the changes when the test finishes.
    """
    for variable in (cfg.Backend, cfg.StorageFormat, cfg.Engine):
        monkeypatch.setattr(variable, "_value", _UNSET)
        monkeypatch.setattr(variable, "_value_source", ValueSource.DEFAULT)
        # raising=False: the variable may legitimately be absent.
        monkeypatch.delitem(os.environ, variable.varname, raising=False)


@pytest.fixture
def make_unknown_env():
    """Set ``MODIN_UNKNOWN=foo`` for the test, yield its name, then delete it."""
    varname = "MODIN_UNKNOWN"
    os.environ[varname] = "foo"
    yield varname
    del os.environ[varname]


@pytest.fixture(params=[str, ExactStr])
def make_custom_envvar(request):
    """Return a fresh ``MODIN_CUSTOM`` config class typed as ``str`` or ``ExactStr``."""

    class CustomVar(cfg.EnvironmentVariable, type=request.param):
        """custom var"""

        default = 10
        varname = "MODIN_CUSTOM"
        choices = (1, 5, 10)

    return CustomVar


@pytest.fixture(scope="session")
def add_pandas_duplicate_on_ray_execution():
    """
    Add an execution mode with the storage format Test_Pandasduplicate and engine Ray.

    This mode's execution is equivalent to PandasOnRay execution.
    """
    cfg.StorageFormat.add_option("Test_Pandasduplicate")
    from modin.core.execution.dispatching.factories import factories

    # Alias the existing Ray factory under the name of the new storage format.
    factories.Test_PandasduplicateOnRayFactory = factories.PandasOnRayFactory
    cfg.Backend.register_backend(
        "Test_Backend_1",
        cfg.Execution(
            storage_format="Test_Pandasduplicate",
            engine="Ray",
        ),
    )


@pytest.fixture
def set_custom_envvar(make_custom_envvar):
    """
    Set the custom variable to ``" custom "`` in the environment.

    Yields the value the test expects ``get()`` to return: ``"Custom"`` when the
    variable type is ``str``, otherwise the raw ``" custom "``. The environment
    variable is deleted afterwards.
    """
    os.environ[make_custom_envvar.varname] = " custom "
    yield "Custom" if make_custom_envvar.type is str else " custom "
    del os.environ[make_custom_envvar.varname]


def test_unknown(make_unknown_env):
    """``_check_vars`` must warn about the unknown environment variable."""
    with pytest.warns(UserWarning, match=f"Found unknown .*{make_unknown_env}.*"):
        _check_vars()


def test_custom_default(make_custom_envvar):
    """With no environment value set, ``get()`` returns the class default."""
    assert make_custom_envvar.get() == 10


def test_custom_set(make_custom_envvar, set_custom_envvar):
    """``get()`` returns the value expected for the variable's type."""
    assert make_custom_envvar.get() == set_custom_envvar


def test_custom_help(make_custom_envvar):
    """The help text mentions both the variable name and the class docstring."""
    assert "MODIN_CUSTOM" in make_custom_envvar.get_help()
    assert "custom var" in make_custom_envvar.get_help()


class TestDocModule:
    """
    Test using a module to replace default docstrings.
""" def test_overrides(self): cfg.DocModule.put("modin.tests.config.docs_module") # Test for override assert BasePandasDataset.__doc__ == ( "This is a test of the documentation module for BasePandasDataSet." ) assert BasePandasDataset.apply.__doc__ == ( "This is a test of the documentation module for BasePandasDataSet.apply." ) # Test scenario 2 from https://github.com/modin-project/modin/issues/7113: # We can correctly override the docstring for BasePandasDataset.astype, # which is the same method (modulo some wrapping that we add to handle # extensions) as Series.astype. assert ( pd.Series.astype.__wrapped__.__wrapped__ is BasePandasDataset.astype.__wrapped__ ) assert BasePandasDataset.astype.__doc__ == ( "This is a test of the documentation module for BasePandasDataSet.astype." ) assert ( pd.DataFrame.apply.__doc__ == "This is a test of the documentation module for DataFrame." ) # Test for pandas doc when method is not defined on the plugin module assert pandas.DataFrame.isna.__doc__ in pd.DataFrame.isna.__doc__ assert pandas.DataFrame.isnull.__doc__ in pd.DataFrame.isnull.__doc__ assert BasePandasDataset.astype.__doc__ in pd.DataFrame.astype.__doc__ # Test for override assert ( pd.Series.isna.__doc__ == "This is a test of the documentation module for Series." ) # Test for pandas doc when method is not defined on the plugin module assert pandas.Series.isnull.__doc__ in pd.Series.isnull.__doc__ assert pandas.Series.apply.__doc__ in pd.Series.apply.__doc__ # Test for override assert pd.read_csv.__doc__ == "Test override for functions on the module." # Test for pandas doc when function is not defined on module. assert pandas.read_table.__doc__ in pd.read_table.__doc__ def test_not_redefining_classes_modin_issue_7138(self): original_dataframe_class = pd.DataFrame cfg.DocModule.put("modin.tests.config.docs_module") # Test for override assert ( pd.DataFrame.apply.__doc__ == "This is a test of the documentation module for DataFrame." 
) assert pd.DataFrame is original_dataframe_class def test_base_docstring_override_with_no_dataframe_or_series_class_issue_7113( self, ): # This test case tests scenario 1 from issue 7113. sys.path.append(f"{os.path.dirname(__file__)}") cfg.DocModule.put("docs_module_with_just_base") assert BasePandasDataset.astype.__doc__ == ( "This is a test of the documentation module for BasePandasDataSet.astype." ) @pytest.mark.skipif(cfg.Engine.get() != "Ray", reason="Ray specific test") def test_ray_cluster_resources(): import ray cfg.RayInitCustomResources.put({"special_hardware": 1.0}) # create a dummy df to initialize Ray engine _ = pd.DataFrame([1, 2, 3]) assert ray.cluster_resources()["special_hardware"] == 1.0 @pytest.mark.parametrize( "modify_config", [{cfg.RangePartitioning: False, cfg.LazyExecution: "Auto"}], indirect=True, ) def test_context_manager_update_config(modify_config): # simple case, 1 parameter assert cfg.RangePartitioning.get() is False with cfg.context(RangePartitioning=True): assert cfg.RangePartitioning.get() is True assert cfg.RangePartitioning.get() is False # nested case, 1 parameter assert cfg.RangePartitioning.get() is False with cfg.context(RangePartitioning=True): assert cfg.RangePartitioning.get() is True with cfg.context(RangePartitioning=False): assert cfg.RangePartitioning.get() is False with cfg.context(RangePartitioning=False): assert cfg.RangePartitioning.get() is False assert cfg.RangePartitioning.get() is False assert cfg.RangePartitioning.get() is True assert cfg.RangePartitioning.get() is False # simple case, 2 parameters assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "Auto" with cfg.context(RangePartitioning=True, LazyExecution="Off"): assert cfg.RangePartitioning.get() is True assert cfg.LazyExecution.get() == "Off" assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "Auto" # nested case, 2 parameters assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() 
== "Auto" with cfg.context(RangePartitioning=True, LazyExecution="Off"): assert cfg.RangePartitioning.get() is True assert cfg.LazyExecution.get() == "Off" with cfg.context(RangePartitioning=False): assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "Off" with cfg.context(LazyExecution="On"): assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "On" with cfg.context(RangePartitioning=True, LazyExecution="Off"): assert cfg.RangePartitioning.get() is True assert cfg.LazyExecution.get() == "Off" assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "On" assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "Off" assert cfg.RangePartitioning.get() is True assert cfg.LazyExecution.get() == "Off" assert cfg.RangePartitioning.get() is False assert cfg.LazyExecution.get() == "Auto" class TestBackend: @pytest.mark.parametrize( "engine, storage_format, expected_backend", [ ("Python", "Pandas", "Python_Test"), ("Ray", "Pandas", "Ray"), param( "Unidist", "Pandas", "Unidist", marks=pytest.mark.skip(reason=UNIDIST_SKIP_REASON), ), ("Dask", "Pandas", "Dask"), ("Native", "Native", "Pandas"), ], ) def test_setting_execution_changes_backend( self, engine, storage_format, expected_backend ): previous_backend = cfg.Backend.get() with switch_execution(engine, storage_format): assert cfg.Backend.get() == expected_backend assert cfg.Backend.get() == previous_backend def test_subscribing_to_backend_triggers_callback(self): backend_subscriber = Mock() cfg.Backend.subscribe(backend_subscriber) backend_subscriber.assert_called_once_with(cfg.Backend) def test_setting_backend_triggers_all_callbacks(self): # Start with a known backend (rather than the one that we start the # test with). 
with cfg.context(Backend="Pandas"): backend_subscriber = Mock() cfg.Backend.subscribe(backend_subscriber) backend_subscriber.reset_mock() storage_format_subscriber = Mock() cfg.StorageFormat.subscribe(storage_format_subscriber) storage_format_subscriber.reset_mock() engine_subscriber = Mock() cfg.Engine.subscribe(engine_subscriber) engine_subscriber.reset_mock() with cfg.context(Backend="Python_Test"): backend_subscriber.assert_called_once_with(cfg.Backend) storage_format_subscriber.assert_called_once_with(cfg.StorageFormat) engine_subscriber.assert_called_once_with(cfg.Engine) @pytest.mark.parametrize( "backend, expected_engine, expected_storage_format", [ ("Python_test", "Python", "Pandas"), ("PYTHON_test", "Python", "Pandas"), ("python_TEST", "Python", "Pandas"), ("Ray", "Ray", "Pandas"), param( "Unidist", "Unidist", "Pandas", marks=pytest.mark.skip(reason=UNIDIST_SKIP_REASON), ), ("Dask", "Dask", "Pandas"), ("Pandas", "Native", "Native"), ], ) def test_setting_backend_changes_execution( self, backend, expected_engine, expected_storage_format ): previous_engine = cfg.Engine.get() previous_storage_format = cfg.StorageFormat.get() with cfg.context(Backend=backend): assert cfg.Engine.get() == expected_engine assert cfg.StorageFormat.get() == expected_storage_format assert cfg.Engine.get() == previous_engine assert cfg.StorageFormat.get() == previous_storage_format def test_setting_engine_alone_changes_backend(self): # Start with a known backend (rather than the one that we start the # test with). with switch_execution(storage_format="Pandas", engine="Ray"): current_backend = cfg.Backend.get() assert current_backend == "Ray" with cfg.context(Engine="Python"): assert cfg.Backend.get() == "Python_Test" assert cfg.Backend.get() == current_backend def test_setting_engine_triggers_callbacks(self): # Start with a known backend (rather than the one that we start the # test with). 
with switch_execution(storage_format="Pandas", engine="Ray"): engine_subscriber = Mock() cfg.Engine.subscribe(engine_subscriber) engine_subscriber.reset_mock() backend_subscriber = Mock() cfg.Backend.subscribe(backend_subscriber) backend_subscriber.reset_mock() storage_format_subscriber = Mock() cfg.StorageFormat.subscribe(storage_format_subscriber) storage_format_subscriber.reset_mock() with cfg.context(Engine="Dask"): engine_subscriber.assert_called_once_with(cfg.Engine) backend_subscriber.assert_called_once_with(cfg.Backend) # StorageFormat stayed the same, so we don't call its callback. storage_format_subscriber.assert_not_called() def test_setting_storage_format_triggers_callbacks(self): # There's only one built-in storage format, pandas, so we add a new one # here. cfg.StorageFormat.add_option("Pandasduplicate") from modin.core.execution.dispatching.factories import factories factories.PandasduplicateOnRayFactory = factories.PandasOnRayFactory cfg.Backend.register_backend( "NewBackend", cfg.Execution( storage_format="Pandasduplicate", engine="Ray", ), ) with switch_execution(storage_format="Pandas", engine="Ray"): engine_subscriber = Mock() cfg.Engine.subscribe(engine_subscriber) engine_subscriber.reset_mock() backend_subscriber = Mock() cfg.Backend.subscribe(backend_subscriber) backend_subscriber.reset_mock() storage_format_subscriber = Mock() cfg.StorageFormat.subscribe(storage_format_subscriber) storage_format_subscriber.reset_mock() with cfg.context(StorageFormat="PANDASDUPLICATE"): storage_format_subscriber.assert_called_once_with(cfg.StorageFormat) backend_subscriber.assert_called_once_with(cfg.Backend) # Engine stayed the same, so we don't call its callback. 
engine_subscriber.assert_not_called() @pytest.mark.parametrize("name", ["Python_Test", "python_Test"]) def test_register_existing_backend(self, name): with pytest.raises( ValueError, match=re.escape( "Backend 'Python_Test' is already registered with the execution " + "Execution(storage_format='Pandas', engine='Python')" ), ): cfg.Backend.register_backend( name, cfg.Execution( storage_format="Pandas", engine="Python", ), ) def test_register_existing_execution(self): with pytest.raises( ValueError, match=re.escape( "Execution(storage_format='Pandas', engine='Python') is already registered with the backend Python_Test." ), ): cfg.Backend.register_backend( "NewBackend2", cfg.Execution( storage_format="Pandas", engine="Python", ), ) def test_set_invalid_backend(self): with pytest.raises(ValueError, match=re.escape("Unknown backend 'Unknown'")): cfg.Backend.put("Unknown") def test_switch_to_unregistered_backend_with_switch_execution(self): cfg.StorageFormat.add_option("Pandas2") from modin.core.execution.dispatching.factories import factories factories.Pandas2OnRayFactory = factories.PandasOnRayFactory with pytest.raises( ValueError, match=re.escape( "Execution(storage_format='Pandas2', engine='Ray') " + "has no known backend. Please register a backend for it with " + "Backend.register_backend()" ), ), switch_execution(engine="Ray", storage_format="Pandas2"): pass def test_switch_to_unregistered_backend_with_switch_storage_format(self): cfg.StorageFormat.add_option("Pandas3") from modin.core.execution.dispatching.factories import factories factories.Pandas2OnRayFactory = factories.PandasOnPythonFactory with cfg.context(StorageFormat="Pandas", Engine="Python"): with pytest.raises( ValueError, match=re.escape( "Execution(storage_format='Pandas3', engine='Python') " + "has no known backend. 
Please register a backend for it with " + "Backend.register_backend()" ), ): cfg.StorageFormat.put("Pandas3") def test_switch_to_unregistered_backend_with_switch_engine(self): cfg.Engine.add_option("Python2") from modin.core.execution.dispatching.factories import factories factories.PandasOnPython2Factory = factories.PandasOnPythonFactory with cfg.context(StorageFormat="Pandas", Engine="Python"): with pytest.raises( ValueError, match=re.escape( "Execution(storage_format='Pandas', engine='Python2') " + "has no known backend. Please register a backend for it with " + "Backend.register_backend()" ), ): cfg.Engine.put("Python2") # The default engine and storage format, and hence the default backend, # will depend on which engines are available in the current environment. # For simplicity, patch the defaults. @patch( target="modin.config.StorageFormat._get_default", ) @patch( target="modin.config.Engine._get_default", ) def test_backend_default( self, mocked_get_default, mocked_get_default2, ): mocked_get_default.return_value = "Native" mocked_get_default2.return_value = "Native" assert cfg.Backend._get_default() == "Pandas" def test_add_backend_option(self): with pytest.raises( ValueError, match=re.escape( "Cannot add an option to Backend directly. Use Backend.register_backend instead." 
), ): cfg.Backend.add_option("NewBackend") @pytest.mark.parametrize( "order_to_get_in", itertools.permutations( [ cfg.Backend, cfg.Engine, cfg.StorageFormat, ] ), ids=lambda permutation: "_".join(x.__name__ for x in permutation), ) @pytest.mark.parametrize( "storage_environment_variable, engine_environment_variable, variable_to_expected_value", [ ( "Native", "Native", { cfg.Backend: "Pandas", cfg.Engine: "Native", cfg.StorageFormat: "Native", }, ), ( "NATIVE", "NATIVE", { cfg.Backend: "Pandas", cfg.Engine: "Native", cfg.StorageFormat: "Native", }, ), ( "Pandas", "Dask", { cfg.Backend: "Dask", cfg.Engine: "Dask", cfg.StorageFormat: "Pandas", }, ), ], ) def test_storage_format_and_engine_come_from_environment( self, monkeypatch, clear_backend_execution_and_storage_format, order_to_get_in, storage_environment_variable, engine_environment_variable, variable_to_expected_value, ): with mock.patch.dict( os.environ, { cfg.StorageFormat.varname: storage_environment_variable, cfg.Engine.varname: engine_environment_variable, }, ): for variable in order_to_get_in: expected_value = variable_to_expected_value[variable] assert ( variable.get() == expected_value ), f"{variable.__name__} was {variable.get()} instead of {expected_value}" @pytest.mark.parametrize( "order_to_get_in", itertools.permutations( [ cfg.Backend, cfg.Engine, cfg.StorageFormat, ] ), ids=lambda permutation: "_".join(x.__name__ for x in permutation), ) @pytest.mark.parametrize( "engine_environment_variable, variable_to_expected_value", [ ( "Dask", {cfg.Backend: "Dask", cfg.StorageFormat: "Pandas", cfg.Engine: "Dask"}, ), ( "DASK", {cfg.Backend: "Dask", cfg.StorageFormat: "Pandas", cfg.Engine: "Dask"}, ), ( "python", { cfg.Backend: "Python_Test", cfg.StorageFormat: "Pandas", cfg.Engine: "Python", }, ), ( "ray", {cfg.Backend: "Ray", cfg.StorageFormat: "Pandas", cfg.Engine: "Ray"}, ), # note that we can't test Native here because it's not valid to use # "Native" engine with the default storage format of "Pandas." 
], ) def test_only_engine_comes_from_environment( self, clear_backend_execution_and_storage_format, order_to_get_in, engine_environment_variable, variable_to_expected_value, ): with mock.patch.dict( os.environ, {cfg.Engine.varname: engine_environment_variable}, ): for var in order_to_get_in: expected_value = variable_to_expected_value[var] assert ( var.get() == expected_value ), f"{var.__name__} was {var.get()} instead of {expected_value}" @pytest.mark.parametrize( "order_to_get_in", itertools.permutations( [ cfg.Backend, cfg.Engine, cfg.StorageFormat, ] ), ids=lambda permutation: "_".join(x.__name__ for x in permutation), ) def test_only_storage_format_comes_from_environment( self, clear_backend_execution_and_storage_format, order_to_get_in, add_pandas_duplicate_on_ray_execution, ): # To test switching StorageFormat alone, we have to add a new backend # that works with the default "Pandas" execution. with mock.patch.dict( os.environ, { cfg.StorageFormat.varname: "Test_Pandasduplicate", }, ): cfg.Engine.put("Ray") for variable in order_to_get_in: expected_value = { cfg.Backend: "Test_Backend_1", cfg.Engine: "Ray", cfg.StorageFormat: "Test_Pandasduplicate", }[variable] assert ( variable.get() == expected_value ), f"{variable.__name__} was {variable.get()} instead of {expected_value}" @pytest.mark.parametrize( "order_to_get_in", itertools.permutations( [ cfg.Backend, cfg.Engine, cfg.StorageFormat, ] ), ids=lambda permutation: "_".join(x.__name__ for x in permutation), ) @pytest.mark.parametrize( "backend_environment_variable, variable_to_expected_value", [ ( "Pandas", { cfg.Backend: "Pandas", cfg.Engine: "Native", cfg.StorageFormat: "Native", }, ), ( "Ray", {cfg.Backend: "Ray", cfg.Engine: "Ray", cfg.StorageFormat: "Pandas"}, ), ( "Dask", {cfg.Backend: "Dask", cfg.Engine: "Dask", cfg.StorageFormat: "Pandas"}, ), ( "python_test", { cfg.Backend: "Python_Test", cfg.Engine: "Python", cfg.StorageFormat: "Pandas", }, ), ], ) def test_backend_comes_from_environment( self, 
monkeypatch, clear_backend_execution_and_storage_format, order_to_get_in, backend_environment_variable, variable_to_expected_value, ): with mock.patch.dict( os.environ, { cfg.Backend.varname: backend_environment_variable, }, ): for variable in order_to_get_in: expected_value = variable_to_expected_value[variable] assert ( variable.get() == expected_value ), f"{variable.__name__} was {variable.get()} instead of {expected_value}" @pytest.mark.parametrize( "order_to_get_in", itertools.permutations( [cfg.Backend, cfg.Engine, cfg.StorageFormat], ), ids=lambda permutation: "_".join(x.__name__ for x in permutation), ) def test_environment_not_set_and_pick_up_default_engine( self, clear_backend_execution_and_storage_format, order_to_get_in ): for variable in order_to_get_in: assert variable.get() == variable._get_default() @pytest.mark.parametrize( "execution_variable, value", [(cfg.Engine, "Python"), (cfg.StorageFormat, "Pandas")], ) @pytest.mark.parametrize( "variable_to_get", [cfg.Backend, cfg.Engine, cfg.StorageFormat], ) def test_conflicting_execution_and_backend_in_environment( self, monkeypatch, clear_backend_execution_and_storage_format, execution_variable, value, variable_to_get, ): monkeypatch.setitem(os.environ, cfg.Backend.varname, "Ray") monkeypatch.setitem(os.environ, execution_variable.varname, value) with pytest.raises( ValueError, match=re.escape("Can't specify both execution and backend in environment"), ): variable_to_get.get() def test_get_execution_for_unknown_backend(self): backend_choice_string = ", ".join( f"'{choice}'" for choice in cfg.Backend.choices ) with pytest.raises( ValueError, match=re.escape( f"Unknown backend 'Unknown'. 
Available backends are: {backend_choice_string}" ), ): cfg.Backend.get_execution_for_backend("Unknown") @pytest.mark.parametrize( "config_name", [ "NPartitions", "CpuCount", "LogMemoryInterval", "LogFileSize", "MinRowPartitionSize", "MinColumnPartitionSize", ], ) def test_wrong_values(config_name): config: cfg.EnvironmentVariable = getattr(cfg, config_name) new_value = -1 with pytest.raises(ValueError): with cfg.context(**{config_name: new_value}): _ = config.get() ================================================ FILE: modin/tests/config/test_parameter.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
def test_triggers(prefilled_parameter):
    """
    Check how often ``once`` and ``subscribe`` callbacks are invoked.

    The assertions pin the following counts: the ``"init"`` callback has
    already fired once right after registration; after the four ``put`` calls
    ``"never"`` has not fired, ``"once"`` has fired exactly once, and the
    subscriber has fired five times.
    """
    hit_counts = defaultdict(int)
    alive_callbacks = []

    def make_callback(name, res=hit_counts):
        def callback(p: Parameter):
            res[name] += 1

        # keep reference to callbacks so they won't be removed by GC
        alive_callbacks.append(callback)
        return callback

    prefilled_parameter.once("init", make_callback("init"))
    assert hit_counts["init"] == 1

    for awaited_value in ("never", "once"):
        prefilled_parameter.once(awaited_value, make_callback(awaited_value))
    prefilled_parameter.subscribe(make_callback("subscribe"))

    for new_value in ("multi", "once", "multi", "once"):
        prefilled_parameter.put(new_value)

    expected_counts = {"init": 1, "never": 0, "once": 1, "subscribe": 5}
    for name, count in expected_counts.items():
        assert hit_counts[name] == count, f"{name} has wrong count"
@pytest.mark.parametrize("vartype", [bool, int, dict])
def test_init_validation(vartype):
    """
    Check that an undecodable initial raw value surfaces as ``ValueError``.

    ``"bad value"`` is not a valid raw value for any of the parametrized
    types, so reading the prefilled parameter must fail.
    """
    broken_parameter = make_prefilled(vartype=vartype, varinit="bad value")
    with pytest.raises(ValueError):
        broken_parameter.get()
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("item_length", [0, 1, 2])
@pytest.mark.parametrize("loc", ["first", "first + 1", "middle", "penult", "last"])
@pytest.mark.parametrize("replace", [True, False])
def test_insert_item(axis, item_length, loc, replace):
    """
    Compare the query compiler's ``insert_item`` against a pandas reference.

    The test data is split into two pieces along `axis`; each piece is then
    inserted into the other at the position named by `loc`, and the result is
    compared with a reference assembled via slicing and ``pandas.concat``.
    """
    data = test_data_values[0]

    def post_fn(df):
        # With ``item_length == 0`` the ``-0`` bounds make the first piece
        # empty and the second piece the whole frame.
        if axis:
            return df.iloc[:, :-item_length], df.iloc[:, -item_length:]
        return df.iloc[:-item_length, :], df.iloc[-item_length:, :]

    def get_loc(frame, loc):
        axis_len = len(frame.axes[axis])
        positions = {
            "first": 0,
            "first + 1": 1,
            "middle": axis_len // 2,
            "penult": axis_len - 1,
            "last": axis_len,
        }
        return positions[loc]

    def get_reference(df, value, loc):
        # When replacing, the element at ``loc`` is dropped from the tail.
        tail_start = loc + 1 if replace else loc
        if axis == 0:
            head, tail = df.iloc[:loc], df.iloc[tail_start:]
        else:
            head, tail = df.iloc[:, :loc], df.iloc[:, tail_start:]
        return pandas.concat([head, value, tail], axis=axis)

    md_frames, pd_frames = create_test_dfs(data, post_fn=post_fn)
    md_item1, md_item2 = md_frames
    pd_item1, pd_item2 = pd_frames

    # Insert the second piece into the first one, then the other way around.
    orderings = (
        (md_item1, pd_item1, md_item2, pd_item2),
        (md_item2, pd_item2, md_item1, pd_item1),
    )
    for md_target, pd_target, md_value, pd_value in orderings:
        index_loc = get_loc(pd_target, loc)
        pd_res = get_reference(pd_target, loc=index_loc, value=pd_value)
        md_res = md_target._query_compiler.insert_item(
            axis=axis,
            loc=index_loc,
            value=md_value._query_compiler,
            replace=replace,
        ).to_pandas()
        df_equals(
            md_res,
            pd_res,
            # This test causes an empty slice to be generated thus triggering:
            # https://github.com/modin-project/modin/issues/5974
            check_dtypes=axis != 0,
        )
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/core/storage_formats/cudf/test_internals.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/core/storage_formats/pandas/test_internals.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import functools import sys import unittest.mock as mock import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import ( CpuCount, Engine, MinColumnPartitionSize, MinRowPartitionSize, NPartitions, RangePartitioning, context, ) from modin.core.dataframe.algebra import Fold from modin.core.dataframe.algebra.default2pandas import DataFrameDefault from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions from modin.core.dataframe.pandas.metadata import ( DtypesDescriptor, LazyProxyCategoricalDtype, ModinDtypes, ) from modin.core.execution.utils import remote_function from modin.core.storage_formats import PandasQueryCompiler from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.distributed.dataframe.pandas import from_partitions from modin.tests.pandas.utils import ( create_test_dfs, df_equals, eval_general, test_data_values, ) from modin.utils import try_cast_to_pandas NPartitions.put(4) if Engine.get() == "Ray": import ray from modin.core.execution.ray.common import RayWrapper from modin.core.execution.ray.common.deferred_execution import MetaList from modin.core.execution.ray.implementations.pandas_on_ray.partitioning import ( PandasOnRayDataframeColumnPartition, PandasOnRayDataframePartition, PandasOnRayDataframeRowPartition, ) block_partition_class = PandasOnRayDataframePartition virtual_column_partition_class = PandasOnRayDataframeColumnPartition virtual_row_partition_class = 
PandasOnRayDataframeRowPartition put = RayWrapper.put deploy = RayWrapper.deploy materialize = RayWrapper.materialize elif Engine.get() == "Dask": from modin.core.execution.dask.common import DaskWrapper from modin.core.execution.dask.implementations.pandas_on_dask.partitioning import ( PandasOnDaskDataframeColumnPartition, PandasOnDaskDataframePartition, PandasOnDaskDataframeRowPartition, ) # initialize modin dataframe to initialize dask pd.DataFrame() def put(x): return DaskWrapper.put(x, hash=False) block_partition_class = PandasOnDaskDataframePartition virtual_column_partition_class = PandasOnDaskDataframeColumnPartition virtual_row_partition_class = PandasOnDaskDataframeRowPartition deploy = DaskWrapper.deploy materialize = DaskWrapper.materialize elif Engine.get() == "Unidist": from modin.core.execution.unidist.common import UnidistWrapper from modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning import ( PandasOnUnidistDataframeColumnPartition, PandasOnUnidistDataframePartition, PandasOnUnidistDataframeRowPartition, ) block_partition_class = PandasOnUnidistDataframePartition virtual_column_partition_class = PandasOnUnidistDataframeColumnPartition virtual_row_partition_class = PandasOnUnidistDataframeRowPartition put = UnidistWrapper.put elif Engine.get() == "Python": from modin.core.execution.python.common import PythonWrapper from modin.core.execution.python.implementations.pandas_on_python.partitioning import ( PandasOnPythonDataframeColumnPartition, PandasOnPythonDataframePartition, PandasOnPythonDataframeRowPartition, ) def put(x): return PythonWrapper.put(x, hash=False) def deploy(func, args=tuple()): return func(*args) def materialize(arg): return arg block_partition_class = PandasOnPythonDataframePartition virtual_column_partition_class = PandasOnPythonDataframeColumnPartition virtual_row_partition_class = PandasOnPythonDataframeRowPartition else: raise NotImplementedError( f"These test suites are not implemented for the 
def construct_modin_df_by_scheme(pandas_df, partitioning_scheme):
    """
    Build ``modin.pandas.DataFrame`` from ``pandas.DataFrame`` according to the `partitioning_scheme`.

    The frame is first split into row blocks, every row block is then split
    into column blocks, and the resulting grid is handed to
    ``from_partitions``. When the lengths in the scheme sum up to something
    other than the frame's size along an axis, the labels for that axis are
    truncated to the summed length.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
    partitioning_scheme : dict[{"row_lengths", "column_widths"}] -> list of ints

    Returns
    -------
    modin.pandas.DataFrame
    """
    row_lengths = partitioning_scheme["row_lengths"]
    column_widths = partitioning_scheme["column_widths"]

    def fit_labels(labels, total):
        # Keep the labels untouched when they already match the scheme.
        if len(labels) == total:
            return labels
        return labels[:total]

    new_index = fit_labels(pandas_df.index, sum(row_lengths))
    new_columns = fit_labels(pandas_df.columns, sum(column_widths))

    row_blocks = split_result_of_axis_func_pandas(
        axis=0,
        num_splits=len(row_lengths),
        result=pandas_df,
        min_block_size=MinRowPartitionSize.get(),
        length_list=row_lengths,
    )
    block_grid = []
    for row_block in row_blocks:
        block_grid.append(
            split_result_of_axis_func_pandas(
                axis=1,
                num_splits=len(column_widths),
                result=row_block,
                min_block_size=MinColumnPartitionSize.get(),
                length_list=column_widths,
            )
        )

    # Move every block into the engine's storage before assembling the frame.
    remote_grid = []
    for grid_row in block_grid:
        remote_grid.append([put(block) for block in grid_row])

    return from_partitions(
        remote_grid,
        axis=None,
        index=new_index,
        columns=new_columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )
def remove_axis_cache(df, axis=0, remove_lengths=True):
    """
    Drop the cached axis labels (and optionally lengths) of the passed dataframe.

    Parameters
    ----------
    df : modin.pandas.DataFrame
    axis : int, default: 0
        0 - reset the index cache, any other value - reset the columns cache.
    remove_lengths : bool, default: True
        Whether to also reset the row lengths/column widths cache
        for the selected axis.
    """
    frame = df._query_compiler._modin_frame
    if axis == 0:
        reset_labels = frame.set_index_cache
        lengths_attr = "_row_lengths_cache"
    else:
        reset_labels = frame.set_columns_cache
        lengths_attr = "_column_widths_cache"

    reset_labels(None)
    if remove_lengths:
        setattr(frame, lengths_attr, None)
@pytest.mark.parametrize("row_labels", [None, [("a", "")], ["a"]])
@pytest.mark.parametrize("col_labels", [None, ["a1"], [("c1", "z")]])
def test_take_2d_labels_or_positional(row_labels, col_labels):
    """
    Compare ``take_2d_labels_or_positional`` by labels with pandas ``.loc``.

    Both axes carry two-level labels. A ``None`` selector is translated to a
    full slice on the pandas side and passed as-is to the Modin frame.
    """
    md_df, pd_df = create_test_dfs(
        np.random.rand(4, 4),
        index=[["a", "b", "c", "d"], ["", "", "x", "y"]],
        columns=[["a1", "b1", "c1", "d1"], ["", "", "z", "x"]],
    )

    def as_loc_key(labels):
        if labels is None:
            return slice(None)
        return labels

    pd_df = pd_df.loc[as_loc_key(row_labels), as_loc_key(col_labels)]

    query_compiler = md_df._query_compiler
    # Swap the underlying frame so that `md_df` reflects the selection.
    query_compiler._modin_frame = (
        query_compiler._modin_frame.take_2d_labels_or_positional(
            row_labels=row_labels, col_labels=col_labels
        )
    )

    df_equals(md_df, pd_df)
""" data = test_data_values[0] md_df, pd_df = create_test_dfs(data) values = pd_df.values + 1 pd_df.iloc[:, :] = values modin_frame = md_df._query_compiler._modin_frame if has_frame_shape_cache: # Explicitly compute rows & columns shapes to store this info in frame's cache modin_frame.row_lengths modin_frame.column_widths else: # Explicitly reset frame's cache modin_frame._row_lengths_cache = None modin_frame._column_widths_cache = None for row in modin_frame._partitions: for part in row: if has_partitions_shape_cache: # Explicitly compute partition shape to store this info in its cache part.length() part.width() else: # Explicitly reset partition's shape cache part._length_cache = None part._width_cache = None def func_to_apply(partition, row_internal_indices, col_internal_indices, item): partition.iloc[row_internal_indices, col_internal_indices] = item return partition new_modin_frame = modin_frame.apply_select_indices( axis=None, func=func_to_apply, # Passing none-slices does not trigger shapes recomputation and so the cache is untouched. 
@pytest.mark.parametrize(
    "test_type",
    [
        "many_small_dfs",
        "concatted_df_with_small_dfs",
        "large_df_plus_small_dfs",
    ],
)
@pytest.mark.parametrize(
    "set_num_partitions",
    [1, 4],
    indirect=True,
)
def test_rebalance_partitions(test_type, set_num_partitions):
    """
    Check the partition grid after concatenating frames and applying functions.

    After the concat the grid is expected to be ``NPartitions x NPartitions``
    and to consist of column virtual partitions. After column-wise, row-wise
    and element-wise applies the grid is expected to keep that shape and to
    consist of block partitions.
    """
    num_partitions = NPartitions.get()
    expected_grid = (num_partitions, num_partitions)

    def single_row_frames(stop):
        # One-row, 1000-column frames indexed by 1, 1001, 2001, ... (< stop).
        frames = []
        for i in range(1, stop, 1000):
            frames.append(
                pd.DataFrame(
                    [[i + j for j in range(0, 1000)]],
                    columns=[f"col{j}" for j in range(0, 1000)],
                    index=pd.Index([i]),
                )
            )
        return frames

    if test_type == "many_small_dfs":
        large_df = pd.concat(single_row_frames(100001))
        col_length = 100
    elif test_type == "concatted_df_with_small_dfs":
        small_dfs = single_row_frames(100001)
        large_df = pd.concat([pd.concat(small_dfs)] + small_dfs[:3])
        col_length = 103
    else:
        base_df = pd.DataFrame(
            [[i + j for j in range(1, 1000)] for i in range(0, 100000, 1000)],
            columns=[f"col{j}" for j in range(1, 1000)],
            index=pd.Index(list(range(0, 100000, 1000))),
        )
        small_dfs = single_row_frames(4001)
        large_df = pd.concat([base_df] + small_dfs[:3])
        col_length = 103

    large_modin_frame = large_df._query_compiler._modin_frame
    assert (
        large_modin_frame._partitions.shape == expected_grid
    ), "Partitions were not rebalanced after concat."
    column_partition_cls = large_modin_frame._partition_mgr_cls._column_partitions_class
    assert all(
        isinstance(ptn, column_partition_cls)
        for ptn in large_modin_frame._partitions.flatten()
    )

    def assert_block_partitioned(result, how):
        frame = result._query_compiler._modin_frame
        assert (
            frame._partitions.shape == expected_grid
        ), "Partitions list shape is incorrect."
        block_cls = frame._partition_mgr_cls._partition_class
        assert all(
            isinstance(ptn, block_cls) for ptn in frame._partitions.flatten()
        ), f"Partitions are not block partitioned after {how} apply."

    # The following check tests that we can correctly form full-axis virtual partitions
    # over the orthogonal axis from non-full-axis virtual partitions.
    def col_apply_func(col):
        assert len(col) == col_length, "Partial axis partition detected."
        return col + 1

    assert_block_partitioned(large_df.apply(col_apply_func), "column-wise")

    # Rebuild the dataframe on top of the frame captured right after the concat.
    large_df = pd.DataFrame(
        query_compiler=large_df._query_compiler.__constructor__(large_modin_frame)
    )

    # The following check tests that we can correctly form full-axis virtual partitions
    # over the same axis from non-full-axis virtual partitions.
    def row_apply_func(row):
        assert len(row) == 1000, "Partial axis partition detected."
        return row + 1

    assert_block_partitioned(large_df.apply(row_apply_func, axis=1), "row-wise")

    assert_block_partitioned(large_df.applymap(lambda x: x), "element-wise")
@pytest.mark.parametrize(
    "axis,virtual_partition_class",
    ((0, virtual_column_partition_class), (1, virtual_row_partition_class)),
    ids=["partitions_spanning_all_columns", "partitions_spanning_all_rows"],
)
class TestDrainVirtualPartitionCallQueue:
    """Test draining virtual partition call queues.

    Test creating a virtual partition made of block partitions and/or one or
    more layers of virtual partitions, draining the top-level partition's
    call queue, and getting the result.

    In all these test cases, the full_axis argument doesn't matter for
    correctness because it only affects `apply`, which is not used here.
    Still, virtual partition users are not supposed to create full-axis
    virtual partitions out of other full-axis virtual partitions, so
    set full_axis to False everywhere.
    """

    @staticmethod
    def _reverse_along(axis):
        """Return a function reversing a frame's rows (axis 0) or columns (otherwise)."""

        def reverse(df):
            return df.iloc[::-1, :] if axis == 0 else df.iloc[:, ::-1]

        return reverse

    def test_from_virtual_partitions_with_call_queues(
        self,
        axis,
        virtual_partition_class,
    ):
        """Drain a virtual partition built from two virtual partitions with queued calls."""
        # reverse the dataframe along the virtual partition axis.
        reverse = self._reverse_along(axis)

        level_zero_blocks_first = [
            block_partition_class(put(pandas.DataFrame([0]))),
            block_partition_class(put(pandas.DataFrame([1]))),
        ]
        level_one_virtual_first = virtual_partition_class(
            level_zero_blocks_first, full_axis=False
        )
        level_one_virtual_first = level_one_virtual_first.add_to_apply_calls(reverse)

        level_zero_blocks_second = [
            block_partition_class(put(pandas.DataFrame([2]))),
            block_partition_class(put(pandas.DataFrame([3]))),
        ]
        level_one_virtual_second = virtual_partition_class(
            level_zero_blocks_second, full_axis=False
        )
        level_one_virtual_second = level_one_virtual_second.add_to_apply_calls(reverse)

        level_two_virtual = virtual_partition_class(
            [level_one_virtual_first, level_one_virtual_second], full_axis=False
        )
        level_two_virtual.drain_call_queue()

        if axis == 0:
            expected_df = pandas.DataFrame([1, 0, 3, 2], index=[0, 0, 0, 0])
        else:
            expected_df = pandas.DataFrame([[1, 0, 3, 2]], columns=[0, 0, 0, 0])
        df_equals(
            level_two_virtual.to_pandas(),
            expected_df,
        )

    def test_from_block_and_virtual_partition_with_call_queues(
        self, axis, virtual_partition_class
    ):
        """Drain a virtual partition mixing a block and a virtual partition, both with queued calls."""
        # make a function that reverses the dataframe along the virtual
        # partition axis.
        # for testing axis == 0, start with two 2-rows-by-1-column blocks. for
        # axis == 1, start with two 1-rows-by-2-column blocks.
        reverse = self._reverse_along(axis)

        block_data = [[0, 1], [2, 3]] if axis == 0 else [[[0, 1]], [[2, 3]]]
        level_zero_blocks = [
            block_partition_class(put(pandas.DataFrame(block_data[0]))),
            block_partition_class(put(pandas.DataFrame(block_data[1]))),
        ]
        level_zero_blocks[0] = level_zero_blocks[0].add_to_apply_calls(reverse)
        level_one_virtual = virtual_partition_class(
            level_zero_blocks[1], full_axis=False
        )
        level_one_virtual = level_one_virtual.add_to_apply_calls(reverse)
        level_two_virtual = virtual_partition_class(
            [level_zero_blocks[0], level_one_virtual], full_axis=False
        )
        level_two_virtual.drain_call_queue()

        if axis == 0:
            expected_df = pandas.DataFrame([1, 0, 3, 2], index=[1, 0, 1, 0])
        else:
            expected_df = pandas.DataFrame([[1, 0, 3, 2]], columns=[1, 0, 1, 0])
        df_equals(level_two_virtual.to_pandas(), expected_df)

    def test_virtual_partition_call_queues_at_three_levels(
        self, axis, virtual_partition_class
    ):
        """Drain three nested virtual partitions, each with one queued call.

        Every level queues a function appending one single-value frame, so the
        drained result is expected to hold 1, 2, 3, 4 in queueing order.
        """
        block = block_partition_class(put(pandas.DataFrame([1])))
        level_one_virtual = virtual_partition_class([block], full_axis=False)
        level_one_virtual = level_one_virtual.add_to_apply_calls(
            lambda df: pandas.concat([df, pandas.DataFrame([2])])
        )
        level_two_virtual = virtual_partition_class(
            [level_one_virtual], full_axis=False
        )
        level_two_virtual = level_two_virtual.add_to_apply_calls(
            lambda df: pandas.concat([df, pandas.DataFrame([3])])
        )
        level_three_virtual = virtual_partition_class(
            [level_two_virtual], full_axis=False
        )
        level_three_virtual = level_three_virtual.add_to_apply_calls(
            lambda df: pandas.concat([df, pandas.DataFrame([4])])
        )
        level_three_virtual.drain_call_queue()
        # Build the reference with plain pandas, like the sibling tests do, so
        # that the expected value does not go through Modin itself.
        df_equals(
            level_three_virtual.to_pandas(),
            pandas.DataFrame([1, 2, 3, 4], index=[0, 0, 0, 0]),
        )
def test_reorder_labels_dtypes():
    """
    Check that ``_reorder_labels`` keeps the dtypes cache in sync with the data.

    A frame with columns of four different dtypes is split into two column
    partitions, its columns are reversed, and the dtypes reported by the
    resulting frame are compared with the dtypes of the materialized result.
    """
    pandas_df = pandas.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [1.0, 2.4, 3.4, 4.5],
            "c": ["a", "b", "c", "d"],
            "d": pd.to_datetime([1, 2, 3, 4], unit="D"),
        }
    )

    num_rows, num_cols = pandas_df.shape
    md_df = construct_modin_df_by_scheme(
        pandas_df,
        partitioning_scheme={
            "row_lengths": [num_rows],
            # Split the columns in two halves. The widths must be derived from
            # the number of columns rather than from the number of rows.
            "column_widths": [
                num_cols // 2,
                num_cols // 2 + num_cols % 2,
            ],
        },
    )
    md_df = md_df._query_compiler._modin_frame

    result = md_df._reorder_labels(
        row_positions=None, col_positions=np.arange(len(md_df.columns) - 1, -1, -1)
    )
    df_equals(result.dtypes, result.to_pandas().dtypes)
num_splits is computed like (24 + 54 = 78 / chunk_size = 3 splits) 2, # the num_splits is just splits sum (1 + 1 == 2) ), ( [2], [299], 4, # the num_splits is bounded by NPartitions (2 + 299 = 301 / chunk_size = 10 splits -> bound by 4) 2, # the num_splits is just splits sum (1 + 1 == 2) ), ( [32, 32], [128], 4, # the num_splits is bounded by NPartitions (32 + 32 + 128 = 192 / chunk_size = 6 splits -> bound by 4) 3, # the num_splits is just splits sum (2 + 1 == 3) ), ( [128] * 7, [128] * 6, 4, # the num_splits is bounded by NPartitions (128 * 7 + 128 * 6 = 1664 / chunk_size = 52 splits -> bound by 4) 4, # the num_splits is just splits sum bound by NPartitions (7 + 6 = 13 splits -> 4 splits) ), ], ) @pytest.mark.parametrize( "modify_config", [{NPartitions: 4, MinRowPartitionSize: 32, MinColumnPartitionSize: 32}], indirect=True, ) def test_merge_partitioning( left_partitioning, right_partitioning, ref_with_cache_available, ref_with_no_cache, modify_config, ): from modin.core.storage_formats.pandas.utils import merge_partitioning left_df = pandas.DataFrame( [np.arange(sum(left_partitioning)) for _ in range(sum(left_partitioning))] ) right_df = pandas.DataFrame( [np.arange(sum(right_partitioning)) for _ in range(sum(right_partitioning))] ) left = construct_modin_df_by_scheme( left_df, {"row_lengths": left_partitioning, "column_widths": left_partitioning} )._query_compiler._modin_frame right = construct_modin_df_by_scheme( right_df, {"row_lengths": right_partitioning, "column_widths": right_partitioning}, )._query_compiler._modin_frame assert left.row_lengths == left.column_widths == left_partitioning assert right.row_lengths == right.column_widths == right_partitioning res = merge_partitioning(left, right, axis=0) assert res == ref_with_cache_available res = merge_partitioning(left, right, axis=1) assert res == ref_with_cache_available ( left._row_lengths_cache, left._column_widths_cache, right._row_lengths_cache, right._column_widths_cache, ) = [None] * 4 res = 
def test_merge_with_bad_partitioning():
    """
    Check that merging calls ``repartition(axis=0)`` exactly once on the left frame.

    The left frame consists of a single 256x256 partition, the right one is
    split into four partitions along both axes.
    See https://github.com/modin-project/modin/pull/7229
    """

    def build_square_df(partitioning):
        # a square frame partitioned identically along rows and columns
        size = sum(partitioning)
        pandas_df = pandas.DataFrame([np.arange(size) for _ in range(size)])
        scheme = {"row_lengths": partitioning, "column_widths": partitioning}
        return construct_modin_df_by_scheme(pandas_df, scheme)

    left_partitioning = [256]
    right_partitioning = [32, 32, 32, 32]
    left = build_square_df(left_partitioning)
    right = build_square_df(right_partitioning)

    # make sure the frames really got the requested partitioning
    for modin_df, expected in (
        (left, left_partitioning),
        (right, right_partitioning),
    ):
        frame = modin_df._query_compiler._modin_frame
        assert frame.row_lengths == frame.column_widths == expected

    # just a placeholder result for the mocked method
    dummy_qc = pd.DataFrame([1, 2, 3, 4])._query_compiler
    with mock.patch.object(
        left._query_compiler, "repartition", return_value=dummy_qc
    ) as repartition_mock:
        _ = left.merge(right)
    repartition_mock.assert_called_once_with(axis=0)
@pytest.mark.parametrize("set_num_partitions", [2], indirect=True)
def test_repartitioning(set_num_partitions):
    """
    Check that ``apply_full_axis(..., keep_partitioning=False)`` does not keep
    the partitioning of the source frame.

    For more details see: https://github.com/modin-project/modin/issues/5621
    """
    assert NPartitions.get() == 2

    data = {"a": [1, 1, 2, 2], "b": [3, 4, 5, 6], "c": [1, 2, 3, 4], "d": [4, 5, 6, 7]}
    # reference frame and an independent copy of the same data for modin
    pandas_df = pandas.DataFrame(data)
    modin_frame = construct_modin_df_by_scheme(
        pandas_df=pandas.DataFrame(data),
        partitioning_scheme={"row_lengths": [4], "column_widths": [2, 2]},
    )._query_compiler._modin_frame

    # the source frame has two column partitions
    assert modin_frame._partitions.shape == (1, 2)
    assert modin_frame.column_widths == [2, 2]

    res = modin_frame.apply_full_axis(
        axis=1,
        func=lambda df: df,
        keep_partitioning=False,
        new_index=[0, 1, 2, 3],
        new_columns=["a", "b", "c", "d"],
    )

    # the result is expected to be a single partition holding all the columns
    assert res._partitions.shape == (1, 1)
    assert res.column_widths == [4]
    df_equals(res._partitions[0, 0].to_pandas(), pandas_df)
    df_equals(res.to_pandas(), pandas_df)
The function being tested splits the passed dataframe into parts according to the 'pivots' indicating boundary values for the parts. Parameters ---------- col_name : {"numeric_col", "non_numeric_col"} The tested function takes a key column name to which the pivot values belong. The function may behave differently depending on the type of that column. ascending : {True, False} The split parts are returned either in ascending or descending order. This parameter helps us to test both of the cases. num_pivots : {3, 2, 1} The function's behavior may depend on the number of boundary values being passed. all_pivots_are_unique : {True, False} Duplicate pivot values cause empty partitions to be produced. This parameter helps to verify that the function still behaves correctly in such cases. """ random_state = np.random.RandomState(42) df = pandas.DataFrame( { "numeric_col": range(9), "non_numeric_col": list("abcdefghi"), } ) min_val, max_val = df[col_name].iloc[0], df[col_name].iloc[-1] # Selecting random boundary values for the key column pivots = random_state.choice(df[col_name], num_pivots, replace=False) if not all_pivots_are_unique: # Making the 'pivots' contain only duplicate values pivots = np.repeat(pivots[0], num_pivots) # The tested function assumes that we pass pivots in the ascending order pivots = np.sort(pivots) # Randomly reordering rows in the dataframe df = df.reindex(random_state.permutation(df.index)) bins = ShuffleSortFunctions.split_partitions_using_pivots_for_sort( df, [ ColumnInfo( name=col_name, is_numeric=pandas.api.types.is_numeric_dtype(df.dtypes[col_name]), pivots=pivots, ) ], ascending=ascending, ) # Building reference bounds to make the result verification simpler bounds = np.concatenate([[min_val], pivots, [max_val]]) if not ascending: # If the order is descending we want bounds to be in the descending order as well: # Ex: bounds = [0, 2, 5, 10] for ascending and [10, 5, 2, 0] for descending. 
@pytest.mark.parametrize("col_name", ["numeric_col", "non_numeric_col"])
@pytest.mark.parametrize("ascending", [True, False])
def test_split_partitions_with_empty_pivots(col_name, ascending):
    """
    Check how the splitting function handles an empty list of pivots.

    With no pivots there is nothing to split by, so the result is expected
    to be a tuple holding a single dataframe that equals the input one.
    """
    df = pandas.DataFrame(
        {"numeric_col": range(9), "non_numeric_col": list("abcdefghi")}
    )
    key_column = ColumnInfo(
        name=col_name,
        is_numeric=pandas.api.types.is_numeric_dtype(df.dtypes[col_name]),
        pivots=[],
    )

    result = ShuffleSortFunctions.split_partitions_using_pivots_for_sort(
        df, [key_column], ascending=ascending
    )

    # We're expecting to receive a single split here
    assert isinstance(result, tuple)
    assert len(result) == 1
    assert result[0].equals(df)
""" modin_frame = pd.DataFrame( np.array([["hello", "goodbye"], ["hello", "Hello"]]) )._query_compiler._modin_frame assert modin_frame._partitions.shape == (1, 1) column_name = modin_frame.columns[1] shuffle_functions = ShuffleSortFunctions( # These are the parameters we pass in the `.sort_by()` implementation modin_frame, columns=column_name, ascending=ascending, ideal_num_new_partitions=1, ) new_partitions = modin_frame._partition_mgr_cls.shuffle_partitions( modin_frame._partitions, index=0, shuffle_functions=shuffle_functions, final_shuffle_func=lambda df: df.sort_values(column_name), ) ref = modin_frame.to_pandas().sort_values(column_name) res = new_partitions[0, 0].get() assert new_partitions.shape == (1, 1) assert ref.equals(res) @pytest.mark.parametrize("ascending", [True, False]) def test_split_partition_preserve_names(ascending): """ This test verifies that the dataframes being split by ``split_partitions_using_pivots_for_sort`` preserve their index/column names. """ df = pandas.DataFrame( { "numeric_col": range(9), "non_numeric_col": list("abcdefghi"), } ) index_name = "custom_name" df.index.name = index_name df.columns.name = index_name # Pivots that contain empty bins pivots = [2, 2, 5, 7] splits = ShuffleSortFunctions.split_partitions_using_pivots_for_sort( df, [ColumnInfo(name="numeric_col", is_numeric=True, pivots=pivots)], ascending=ascending, ) for part in splits: assert part.index.name == index_name assert part.columns.name == index_name @pytest.mark.parametrize("has_cols_metadata", [True, False]) @pytest.mark.parametrize("has_dtypes_metadata", [True, False]) def test_merge_preserves_metadata(has_cols_metadata, has_dtypes_metadata): df1 = pd.DataFrame({"a": [1, 1, 2, 2], "b": list("abcd")}) df2 = pd.DataFrame({"a": [4, 2, 1, 3], "b": list("bcaf"), "c": [3, 2, 1, 0]}) modin_frame = df1._query_compiler._modin_frame if has_cols_metadata: # Verify that there were initially materialized metadata assert modin_frame.has_materialized_columns else: 
def test_binary_op_preserve_dtypes():
    """
    Check whether the result of a binary operation has materialized dtypes.

    The result is expected to have the dtypes cache when the modin operands
    have it, and to lack the cache when the dtypes cache of the second modin
    operand was removed beforehand.
    """
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    def setup_cache(df, has_cache=True):
        # put the dtypes cache of 'df' into the requested state and return 'df'
        if not has_cache:
            df._query_compiler.set_frame_dtypes_cache(None)
            assert not df._query_compiler.frame_has_materialized_dtypes
            return df
        _ = df.dtypes
        assert df._query_compiler.frame_has_materialized_dtypes
        return df

    def assert_cache(df, has_cache=True):
        # the actual cache state has to be the same as the expected one
        is_materialized = df._query_compiler.frame_has_materialized_dtypes
        assert not (has_cache ^ is_materialized)

    # Check when `other` is a non-distributed object
    for plain_other in (2.0, {"a": 2.0, "b": 4}, [2.0, 4], np.array([2.0, 4])):
        assert_cache(setup_cache(df) + plain_other)

    # Check when `other` is a dataframe and then when it is a series
    modin_others = (
        pd.DataFrame({"b": [3, 4, 5], "c": [4.0, 5.0, 6.0]}),
        pd.Series({"b": 3.0, "c": 4.0}),
    )
    for other in modin_others:
        assert_cache(setup_cache(df) + setup_cache(other, has_cache=True))
        assert_cache(
            setup_cache(df) + setup_cache(other, has_cache=False), has_cache=False
        )
def test_setitem_bool_preserve_dtypes():
    """Check that ``.loc`` assignments with a boolean row indexer keep dtypes materialized."""
    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
    indexer = pd.Series([True, False, True, False])

    def assert_dtypes_materialized():
        # the query compiler is looked up on every call as 'df' is modified in-place
        assert df._query_compiler.frame_has_materialized_dtypes

    assert_dtypes_materialized()

    # slice(None) as a col_loc
    df.loc[indexer] = 2.0
    assert_dtypes_materialized()

    # list as a col_loc
    df.loc[indexer, ["a", "b"]] = 2.0
    assert_dtypes_materialized()

    # scalar as a col_loc
    df.loc[indexer, "a"] = 2.0
    assert_dtypes_materialized()
@pytest.mark.parametrize(
    "kwargs",
    [
        {"axis": 0, "labels": []},
        {"axis": 1, "labels": ["a"]},
        {"axis": 1, "labels": []},
    ],
)
def test_reindex_preserve_dtypes(kwargs):
    """Check that the result of ``reindex`` has materialized dtypes."""
    source_df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
    result_qc = source_df.reindex(**kwargs)._query_compiler
    assert result_qc.frame_has_materialized_dtypes
def test_index_updates_ref(self):
    """
    Test that copying the default ModinIndex to a new frame updates frame reference with the new one.

    The check is done by comparing ``sys.getrefcount()`` of the source modin frame
    before and after a new frame is derived from it with ``df1 + 1``.
    """
    df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    remove_axis_cache(df1)

    modin_frame1 = df1._query_compiler._modin_frame
    # verify that index cache is 'default' and so holds a reference to the `modin_frame`
    assert modin_frame1._index_cache._is_default_callable

    # NOTE: the statements below are reference-count sensitive, do not introduce
    # extra names bound to `modin_frame1` between the two `getrefcount()` calls
    ref_count_before = sys.getrefcount(modin_frame1)

    df2 = df1 + 1
    modin_frame2 = df2._query_compiler._modin_frame
    # verify that new index cache is also the 'default' one
    assert modin_frame2._index_cache._is_default_callable
    # verify that there's no new references being created to the old frame
    assert sys.getrefcount(modin_frame1) == ref_count_before
def test_filter_empties_resets_lengths(self):
    """
    Check how ``._filter_empties()`` affects the ``_lengths_id`` of the index cache.

    The ``_index_id`` has to stay the same in any case, while ``_lengths_id``
    and ``_lengths_cache`` are expected to change only if the call
    modified the partitioning.
    """

    def make_modin_df():
        # a single column frame split into two row partitions
        return construct_modin_df_by_scheme(
            pandas.DataFrame({"a": [1, 1, 2, 2]}),
            {"row_lengths": [2, 2], "column_widths": [1]},
        )

    def caches_around_filtering(frame):
        # returns the index cache taken before and after '._filter_empties()'
        cache_before = frame._index_cache
        assert frame._partitions.shape == (2, 1)
        frame._filter_empties()
        return cache_before, frame._index_cache

    # case1: partitioning is modified by '._filter_empties()', meaning that '._lengths_id' should be changed
    mf = make_modin_df().query("a < 2")._query_compiler._modin_frame
    mf.index  # trigger index materialization
    old_cache, new_cache = caches_around_filtering(mf)

    assert new_cache._index_id == old_cache._index_id
    assert new_cache._lengths_id != old_cache._lengths_id
    assert new_cache._lengths_cache != old_cache._lengths_cache

    # case2: partitioning is NOT modified by '._filter_empties()', meaning that '._lengths_id' should stay the same
    mf = make_modin_df()._query_compiler._modin_frame
    old_cache, new_cache = caches_around_filtering(mf)

    assert new_cache._index_id == old_cache._index_id
    assert new_cache._lengths_id == old_cache._lengths_id
    assert new_cache._lengths_cache == old_cache._lengths_cache
def test_skip_set_columns():
    """
    Verifies that the mechanism of skipping the actual ``._set_columns()`` call in case
    the new columns are identical to the previous ones works properly.

    In this test, we rely on the ``modin_frame._deferred_column`` attribute.
    The new indices propagation is done lazily, and the ``_deferred_column`` attribute
    indicates whether there's a new indices propagation pending.
    """
    # case 1: one of the labels is different from the old ones
    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
    df.columns = ["col1", "col10"]
    # Verifies that the new columns were successfully set in case they're actually new
    assert df._query_compiler._modin_frame._deferred_column
    assert np.all(df.columns.values == ["col1", "col10"])

    # case 2: the labels are the same as the old ones
    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
    df.columns = ["col1", "col2"]
    # Verifies that the new columns weren't set if they're equal to the previous ones
    assert not df._query_compiler._modin_frame._deferred_column

    # case 3: the labels are the same, but the new index has a name
    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
    df.columns = pandas.Index(["col1", "col2"], name="new name")
    # Verifies that the new columns were successfully set in case they carry new metadata
    assert df.columns.name == "new name"

    # case 4: a copy of tuple-labeled columns
    df = pd.DataFrame(
        {("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
    )
    df.columns = df.columns.copy()
    # Verifies that the new columns weren't set if they're equal to the previous ones
    assert not df._query_compiler._modin_frame._deferred_column

    # case 5: tuple-labeled columns in the reversed order
    df = pd.DataFrame(
        {("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
    )
    new_cols = df.columns[::-1]
    df.columns = new_cols
    # Verifies that the new columns were successfully set in case they're actually new
    assert df._query_compiler._modin_frame._deferred_column
    assert df.columns.equals(new_cols)

    # case 6: the old columns are not materialized
    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
    remove_axis_cache(df, axis=1)
    df.columns = ["col1", "col2"]
    # Verifies that the computation of the old columns wasn't triggered for the sake
    # of equality comparison, in this case the new columns should be set unconditionally,
    # meaning that the '_deferred_column' has to be True
    assert df._query_compiler._modin_frame._deferred_column
The tested method raises a ``NotImpementedError`` if the query cannot be performed row-wise and raises nothing if it can. """ qc = pd.DataFrame( {"a": [1], "b": [2], "c": [3], "d": [4], "e": [5]} )._query_compiler local_var = 10 # noqa: F841 (unused variable) # these queries should be performed row-wise (so no exception) qc.rowwise_query("a < 1") qc.rowwise_query("a < b") qc.rowwise_query("a < (b + @local_var) * c > 10") # these queries cannot be performed row-wise (so they must raise an exception) with pytest.raises(NotImplementedError): qc.rowwise_query("a < b[0]") with pytest.raises(NotImplementedError): qc.rowwise_query("a < b.min()") with pytest.raises(NotImplementedError): qc.rowwise_query("a < (b + @local_var + (b - e.min())) * c > 10") with pytest.raises(NotImplementedError): qc.rowwise_query("a < b.size") def test_sort_values_cache(): """ Test that the column widths cache after ``.sort_values()`` is valid: https://github.com/modin-project/modin/issues/6607 """ # 1 row partition and 2 column partitions, in this case '.sort_values()' will use # row-wise implementation and so the column widths WILL NOT be changed modin_df = construct_modin_df_by_scheme( pandas.DataFrame({f"col{i}": range(100) for i in range(64)}), partitioning_scheme={"row_lengths": [100], "column_widths": [32, 32]}, ) mf_initial = modin_df._query_compiler._modin_frame mf_res = modin_df.sort_values("col0")._query_compiler._modin_frame # check that row-wise implementation was indeed used (col widths were not changed) assert mf_res._column_widths_cache == [32, 32] # check that the cache and actual col widths match validate_partitions_cache(mf_res, axis=1) # check that the initial frame's cache wasn't changed assert mf_initial._column_widths_cache == [32, 32] validate_partitions_cache(mf_initial, axis=1) # 2 row partition and 2 column partitions, in this case '.sort_values()' will use # range-partitioning implementation and so the column widths WILL be changed modin_df = 
def test_apply_full_axis_preserve_widths():
    """
    Check the partition caches after ``apply_full_axis(axis=1, keep_partitioning=True)``.

    The applied function changes the number of rows in each partition, so
    the row lengths cache is expected to be dropped, while the column widths
    cache has to match the actual widths of the resulting partitions.
    """
    source_df = pandas.DataFrame(
        {"a": [1, 2, 3, 4], "b": [3, 4, 5, 6], "c": [6, 7, 8, 9], "d": [0, 1, 2, 3]}
    )
    scheme = {"row_lengths": [2, 2], "column_widths": [2, 2]}
    md_df = construct_modin_df_by_scheme(source_df, scheme)._query_compiler._modin_frame

    assert md_df._row_lengths_cache == [2, 2]
    assert md_df._column_widths_cache == [2, 2]

    def func(df):
        # the part starting with '1' is replaced with three rows,
        # any other part is replaced with a single row
        if df.iloc[0, 0] != 1:
            return pandas.DataFrame({"a": [4], "b": [6], "c": [9], "d": [3]})
        return pandas.DataFrame(
            {"a": [1, 2, 3], "b": [3, 4, 5], "c": [6, 7, 8], "d": [0, 1, 2]}
        )

    res = md_df.apply_full_axis(
        func=func,
        axis=1,
        new_index=[0, 1, 2, 3],
        new_columns=["a", "b", "c", "d"],
        keep_partitioning=True,
    )

    cached_widths = res._column_widths_cache
    real_widths = [part.width() for part in res._partitions[0]]
    assert cached_widths == real_widths
    assert res._row_lengths_cache is None
class DummyFuture:
    """
    Lightweight stand-in for a future, used in ``test_call_queue_serialization``.

    Every instance carries a randomly drawn integer acting as its data and
    a flag telling whether ``.materialize()`` was called on it. A freshly
    created instance is considered to be not materialized.
    """

    def __init__(self):
        self._was_materialized = False
        self._value = np.random.randint(0, 1_000_000)

    def materialize(self):
        """Mark the object as materialized and return the object itself."""
        self._was_materialized = True
        return self

    def __eq__(self, other):
        """Return True only for an object of the same type holding an equal value."""
        return bool(isinstance(other, type(self)) and self._value == other._value)
``ModinDtypes.get_dtypes_set()`` correctly propagates this request to the underlying value.""" res = ModinDtypes(lambda: self.schema).get_dtypes_set() exp = set(self.schema.values) assert res == exp res = ModinDtypes(self.schema).get_dtypes_set() exp = set(self.schema.values) assert res == exp res = ModinDtypes( DtypesDescriptor( self.schema[["a", "b", "e"]], remaining_dtype=np.dtype(bool) ) ).get_dtypes_set() exp = set(self.schema.values) assert res == exp def test_get_dtypes_set_desc(self): """ Test that ``DtypesDescriptor.get_dtypes_set()`` returns valid values and doesn't trigger unnecessary computations. """ df = self.DummyDf(self.schema) desc = DtypesDescriptor( self.schema[["a", "b"]], know_all_names=False, parent_df=df ) res = desc.get_dtypes_set() exp = self.schema.values assert res == set(exp) # since 'know_all_names=False', we first have to retrieve columns # in order to determine missing dtypes and then call '._compute_dtypes()' # only on a subset assert len(df.history) == 2 and df.history == [ ("columns",), ("_compute_dtypes", ["c", "d", "e"]), ] df = self.DummyDf(self.schema) desc = DtypesDescriptor( self.schema[["a", "b"]], cols_with_unknown_dtypes=["c", "d", "e"], parent_df=df, ) res = desc.get_dtypes_set() exp = self.schema.values assert res == set(exp) # here we already know names for cols with unknown dtypes, so only # calling '._compute_dtypes()' on a subset assert len(df.history) == 1 and df.history[0] == ( "_compute_dtypes", ["c", "d", "e"], ) df = self.DummyDf(self.schema[["a", "b", "c", "d"]]) desc = DtypesDescriptor( self.schema[["a", "b"]], remaining_dtype=np.dtype(bool), parent_df=df ) res = desc.get_dtypes_set() exp = self.schema[["a", "b", "c", "d"]].values assert res == set(exp) # we don't need to access 'parent_df' in order to get dtypes set, as we # can infer it from 'known_dtypes' and 'remaining_dtype' assert len(df.history) == 0 df = self.DummyDf(self.schema) desc = DtypesDescriptor(know_all_names=False, parent_df=df) res = 
def test_lazy_get_desc(self):
    """
    Test that ``DtypesDescriptor.lazy_get()`` works properly.

    In this test we never specify `parent_df` for a descriptor, verifying that
    ``.lazy_get()`` never triggers any computations.
    """
    # only dtypes for 'a' and 'b' are known: 'a' stays known in the result,
    # the rest of the requested columns are listed as columns with unknown dtypes
    desc = DtypesDescriptor(self.schema[["a", "b"]])
    subset = ["a", "c", "e"]
    res = desc.lazy_get(subset)
    exp = DtypesDescriptor(
        self.schema[subset[:1]],
        cols_with_unknown_dtypes=subset[1:],
        columns_order=self.get_columns_order(subset),
    )
    assert res.equals(exp)

    # dtypes for 'a' and 'b' are known, all the remaining columns are bool
    desc = DtypesDescriptor(self.schema[["a", "b"]], remaining_dtype=np.dtype(bool))
    subset = ["a", "c", "d"]
    res = desc.lazy_get(subset)
    exp = DtypesDescriptor(
        # dtypes for 'c' and 'd' were inferred from 'remaining_dtype' parameter
        self.schema[subset],
        columns_order=self.get_columns_order(subset),
        _schema_is_known=True,
    )
    assert res.equals(exp)

    # nothing is known: all the requested columns have unknown dtypes
    desc = DtypesDescriptor()
    subset = ["a", "c", "d"]
    res = desc.lazy_get(subset)
    exp = DtypesDescriptor(
        cols_with_unknown_dtypes=subset,
        columns_order=self.get_columns_order(subset),
    )
    assert res.equals(exp)

    # only 'remaining_dtype' is known
    desc = DtypesDescriptor(remaining_dtype=np.dtype(bool))
    subset = ["c", "d"]
    res = desc.lazy_get(subset)
    exp = DtypesDescriptor(
        # dtypes for 'c' and 'd' were inferred from 'remaining_dtype' parameter
        self.schema[subset],
        columns_order=self.get_columns_order(subset),
        _schema_is_known=True,
    )
    assert res.equals(exp)
know_all_names=False) assert res.equals(exp) res = DtypesDescriptor.concat( [ DtypesDescriptor(self.schema[["a", "b"]]), DtypesDescriptor( cols_with_unknown_dtypes=["d", "e"], know_all_names=False ), DtypesDescriptor(remaining_dtype=np.dtype(bool)), ] ) # can't preserve 'remaining_dtype' since second descriptor has unknown column names exp = DtypesDescriptor( self.schema[["a", "b"]], cols_with_unknown_dtypes=["d", "e"], know_all_names=False, ) assert res.equals(exp) res = DtypesDescriptor.concat( [ DtypesDescriptor( self.schema[["a", "b"]], ), DtypesDescriptor( cols_with_unknown_dtypes=["d", "e"], ), DtypesDescriptor(remaining_dtype=np.dtype(bool)), ] ) # none of the descriptors had missing column names, so we can preserve 'remaining_dtype' exp = DtypesDescriptor( self.schema[["a", "b"]], cols_with_unknown_dtypes=["d", "e"], remaining_dtype=np.dtype(bool), ) assert res.equals(exp) res = DtypesDescriptor.concat( [ DtypesDescriptor( self.schema[["a", "b"]], remaining_dtype=np.dtype(bool) ), DtypesDescriptor( cols_with_unknown_dtypes=["d", "e"], remaining_dtype=np.dtype(float) ), DtypesDescriptor(remaining_dtype=np.dtype(bool)), ] ) # remaining dtypes don't match, so we drop them and set 'know_all_names=False' exp = DtypesDescriptor( self.schema[["a", "b"]], cols_with_unknown_dtypes=["d", "e"], know_all_names=False, ) assert res.equals(exp) @pytest.mark.parametrize( "initial_dtypes, result_cols_with_known_dtypes, result_cols_with_unknown_dtypes", [ [ # initial dtypes (cols_with_known_dtypes, cols_with_unknown_dtypes, remaining_dtype): # dtypes for all columns are known [ (["a", "b", "c", "d"], [], None), (["a", "b", "e", "d"], [], None), (["a", "b"], [], None), ], # result_cols_with_known_dtypes: # all dtypes were known in the beginning, expecting the same # for the result ["a", "b", "c", "d", "e"], # result_cols_with_unknown_dtypes [], ], [ # initial dtypes (cols_with_known_dtypes, cols_with_unknown_dtypes, remaining_dtype) [ (["a", "b"], ["c", "d"], None), (["a", 
"b", "d"], ["e"], None), (["a", "b"], [], None), ], # result_cols_with_known_dtypes: # across all dataframes, dtypes were only known for 'a' and 'b' columns ["a", "b"], # result_cols_with_unknown_dtypes ["c", "d", "e"], ], [ # initial dtypes (cols_with_known_dtypes, cols_with_unknown_dtypes, remaining_dtype): # the 'e' column in the second frame is missing here, emulating 'know_all_names=False' case [ (["a", "b"], ["c", "d"], None), (["a", "b", "d"], [], None), (["a", "b"], [], None), ], # result_cols_with_known_dtypes ["a", "b"], # result_cols_with_unknown_dtypes: # the missing 'e' column will be deducted from the resulted frame after '.concat()' ["c", "d", "e"], ], [ # initial dtypes (cols_with_known_dtypes, cols_with_unknown_dtypes, remaining_dtype) # the 'c' column in the first frame is described using 'remaining_dtype' [ (["a", "b", "d"], [], np.dtype(bool)), (["a", "b", "e", "d"], [], None), (["a", "b"], [], None), ], # result_cols_with_known_dtypes: # remaining dtypes are not supported by 'concat(axis=0)', so dtype for the 'c' # column is missing here ["a", "b", "e", "d"], # result_cols_with_unknown_dtypes: ["c"], ], ], ) def test_concat_axis_1( self, initial_dtypes, result_cols_with_known_dtypes, result_cols_with_unknown_dtypes, ): """ Test that ``DtypesDescriptor.concat(axis=1)`` works as expected. Parameters ---------- initial_dtypes : list of tuples: (cols_with_known_dtypes, cols_with_unknown_dtypes, remaining_dtype) Describe how to build ``DtypesDescriptor`` for each of the three dataframes. result_cols_with_known_dtypes : list of labels Column names for which dtypes has to be determined after ``.concat()``. result_cols_with_unknown_dtypes : list of labels Column names for which dtypes has to be unknown after ``.concat()``. 
""" md_df1, pd_df1 = create_test_dfs( { "a": [1, 2, 3], "b": [3.5, 4.5, 5.5], "c": [True, False, True], "d": ["a", "b", "c"], } ) md_df2, pd_df2 = create_test_dfs( { "a": [1.5, 2.5, 3.5], "b": [3.5, 4.5, 5.5], "e": [True, False, True], "d": ["a", "b", "c"], } ) md_df3, pd_df3 = create_test_dfs({"a": [1, 2, 3], "b": [3.5, 4.5, 5.5]}) for md_df, (known_cols, unknown_cols, remaining_dtype) in zip( [md_df1, md_df2, md_df3], initial_dtypes ): known_dtypes = {col: md_df.dtypes[col] for col in known_cols} know_all_names = ( len(known_cols) + len(unknown_cols) == len(md_df.columns) or remaining_dtype is not None ) # setting columns cache to 'None', in order to prevent completing 'dtypes' with the materialized columns md_df._query_compiler.set_frame_columns_cache(None) md_df._query_compiler.set_frame_dtypes_cache( ModinDtypes( DtypesDescriptor( known_dtypes, unknown_cols, remaining_dtype, know_all_names=know_all_names, ) ) ) md_dtypes = pd.concat( [md_df1, md_df2, md_df3] )._query_compiler._modin_frame._dtypes pd_dtypes = pandas.concat([pd_df1, pd_df2, pd_df3]).dtypes if len(result_cols_with_known_dtypes) == len(pd_dtypes): md_dtypes = ( md_dtypes if isinstance(md_dtypes, pandas.Series) else md_dtypes._value ) assert isinstance(md_dtypes, pandas.Series) assert md_dtypes.equals(pd_dtypes) else: assert set(md_dtypes._value._known_dtypes.keys()) == set( result_cols_with_known_dtypes ) # reindexing to ensure proper order md_known_dtypes = pandas.Series(md_dtypes._value._known_dtypes).reindex( result_cols_with_known_dtypes ) assert md_known_dtypes.equals(pd_dtypes[result_cols_with_known_dtypes]) assert set(md_dtypes._value._cols_with_unknown_dtypes) == set( result_cols_with_unknown_dtypes ) def test_ModinDtypes_duplicated_concat(self): # test that 'ModinDtypes' is able to perform dtypes concatenation on duplicated labels # if all of them are Serieses res = ModinDtypes.concat([pandas.Series([np.dtype("int64")], index=["a"])] * 2) assert isinstance(res._value, pandas.Series) 
assert res._value.equals( pandas.Series([np.dtype("int64"), np.dtype("int64")], index=["a", "a"]) ) # test that 'ModinDtypes.concat' with duplicated labels raises when not all dtypes are materialized with pytest.raises(NotImplementedError): res = ModinDtypes.concat( [ pandas.Series([np.dtype("int64")], index=["a"]), DtypesDescriptor(cols_with_unknown_dtypes=["a"]), ] ) def test_update_parent(self): """ Test that updating parents in ``DtypesDescriptor`` also propagates to stored lazy categoricals. """ # 'df1' will have a materialized 'pandas.Series' as dtypes cache df1 = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}).astype({"a": "category"}) assert isinstance(df1.dtypes["a"], LazyProxyCategoricalDtype) # 'df2' will have a 'DtypesDescriptor' with unknown dtypes for a column 'c' df2 = pd.DataFrame({"c": [2, 3, 4]}) df2._query_compiler.set_frame_dtypes_cache(None) dtypes_cache = df2._query_compiler._modin_frame._dtypes assert isinstance( dtypes_cache._value, DtypesDescriptor ) and dtypes_cache._value._cols_with_unknown_dtypes == ["c"] # concatenating 'df1' and 'df2' to get a 'DtypesDescriptor' storing lazy categories # in its 'known_dtypes' field res = pd.concat([df1, df2], axis=1) old_parent = df1._query_compiler._modin_frame new_parent = res._query_compiler._modin_frame dtypes_cache = new_parent._dtypes._value # verifying that the reference for lazy categories to a new parent was updated assert dtypes_cache._parent_df is new_parent assert dtypes_cache._known_dtypes["a"]._parent is new_parent assert old_parent._dtypes["a"]._parent is old_parent @pytest.mark.parametrize( "initial_dtypes, result_dtypes", [ [ DtypesDescriptor( {"a": np.dtype("int64"), "b": np.dtype(float), "c": np.dtype(float)} ), DtypesDescriptor( cols_with_unknown_dtypes=["col1", "col2", "col3"], columns_order={0: "col1", 1: "col2", 2: "col3"}, ), ], [ DtypesDescriptor( { "a": np.dtype("int64"), "b": np.dtype(float), "c": np.dtype(float), }, columns_order={0: "a", 1: "b", 2: "c"}, ), DtypesDescriptor( 
{ "col1": np.dtype("int64"), "col2": np.dtype(float), "col3": np.dtype(float), }, columns_order={0: "col1", 1: "col2", 2: "col3"}, ), ], [ DtypesDescriptor( {"a": np.dtype("int64"), "b": np.dtype(float)}, cols_with_unknown_dtypes=["c"], columns_order={0: "a", 1: "b", 2: "c"}, ), DtypesDescriptor( {"col1": np.dtype("int64"), "col2": np.dtype(float)}, cols_with_unknown_dtypes=["col3"], columns_order={0: "col1", 1: "col2", 2: "col3"}, ), ], [ DtypesDescriptor( {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["c"], know_all_names=False, ), DtypesDescriptor( cols_with_unknown_dtypes=["col1", "col2", "col3"], columns_order={0: "col1", 1: "col2", 2: "col3"}, ), ], [ DtypesDescriptor( {"a": np.dtype("int64")}, remaining_dtype=np.dtype(float) ), DtypesDescriptor( cols_with_unknown_dtypes=["col1", "col2", "col3"], columns_order={0: "col1", 1: "col2", 2: "col3"}, ), ], [ lambda: pandas.Series( [np.dtype("int64"), np.dtype(float), np.dtype(float)], index=["a", "b", "c"], ), lambda: pandas.Series( [np.dtype("int64"), np.dtype(float), np.dtype(float)], index=["col1", "col2", "col3"], ), ], [ pandas.Series( [np.dtype("int64"), np.dtype(float), np.dtype(float)], index=["a", "b", "c"], ), pandas.Series( [np.dtype("int64"), np.dtype(float), np.dtype(float)], index=["col1", "col2", "col3"], ), ], ], ) def test_set_index_dataframe(self, initial_dtypes, result_dtypes): """Test that changing labels for a dataframe also updates labels of dtypes.""" df = pd.DataFrame( {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [3.2, 4.5, 5.4]} )._query_compiler._modin_frame df.set_columns_cache(None) if isinstance(initial_dtypes, DtypesDescriptor): initial_dtypes = ModinDtypes(initial_dtypes) df.set_dtypes_cache(initial_dtypes) df.columns = ["col1", "col2", "col3"] if result_dtypes is not None: if callable(result_dtypes): assert callable(df._dtypes._value) assert df._dtypes._value().equals(result_dtypes()) else: assert df._dtypes._value.equals(result_dtypes) assert 
df.dtypes.index.equals(pandas.Index(["col1", "col2", "col3"])) def test_set_index_with_dupl_labels(self): """Verify that setting duplicated columns doesn't propagate any errors to a user.""" df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [3.5, 4.4, 5.5, 6.6]}) # making sure that dtypes are represented by an unmaterialized dtypes-descriptor df._query_compiler.set_frame_dtypes_cache(None) df.columns = ["a", "a"] assert df.dtypes.equals( pandas.Series([np.dtype(int), np.dtype("float64")], index=["a", "a"]) ) def test_reset_index_mi_columns(self): # reproducer from: https://github.com/modin-project/modin/issues/6904 md_df, pd_df = create_test_dfs({"a": [1, 1, 2, 2], "b": [3, 3, 4, 4]}) eval_general( md_df, pd_df, lambda df: df.groupby("a").agg({"b": ["min", "std"]}).reset_index().dtypes, ) def test_concat_mi(self): """ Verify that concatenating dfs with non-MultiIndex and MultiIndex columns results into valid indices for lazy dtypes. """ md_df1, pd_df1 = create_test_dfs({"a": [1, 1, 2, 2], "b": [3, 3, 4, 4]}) md_df2, pd_df2 = create_test_dfs( {("l1", "v1"): [1, 1, 2, 2], ("l1", "v2"): [3, 3, 4, 4]} ) # Drop actual dtypes in order to use partially-known dtypes md_df1._query_compiler.set_frame_dtypes_cache(None) md_df2._query_compiler.set_frame_dtypes_cache(None) md_res = pd.concat([md_df1, md_df2], axis=1) pd_res = pandas.concat([pd_df1, pd_df2], axis=1) df_equals(md_res.dtypes, pd_res.dtypes) class TestZeroComputationDtypes: """ Test cases that shouldn't trigger dtypes computation during their execution. """ @pytest.mark.parametrize("self_dtype", ["materialized", "partial", "unknown"]) @pytest.mark.parametrize( "value, value_dtype", [ [3.5, np.dtype(float)], [[3.5, 2.4], np.dtype(float)], [np.array([3.5, 2.4]), np.dtype(float)], [pd.Series([3.5, 2.4]), np.dtype(float)], ], ) def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype): """ Test that ``df[single_existing_column] = value`` preserves dtypes cache. 
""" with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [3, 4]}) if self_dtype == "materialized": assert df._query_compiler.frame_has_materialized_dtypes elif self_dtype == "partial": df._query_compiler.set_frame_dtypes_cache( ModinDtypes( DtypesDescriptor( {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b", "c"], ) ) ) elif self_dtype == "unknown": df._query_compiler.set_frame_dtypes_cache(None) else: raise NotImplementedError(self_dtype) df["b"] = value if self_dtype == "materialized": result_dtype = pandas.Series( [np.dtype("int64"), value_dtype, np.dtype("int64")], index=["a", "b", "c"], ) assert df._query_compiler.frame_has_materialized_dtypes assert df.dtypes.equals(result_dtype) elif self_dtype == "partial": result_dtype = DtypesDescriptor( {"a": np.dtype("int64"), "b": value_dtype}, cols_with_unknown_dtypes=["c"], columns_order={0: "a", 1: "b", 2: "c"}, ) df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) elif self_dtype == "unknown": result_dtype = DtypesDescriptor( {"b": value_dtype}, cols_with_unknown_dtypes=["a", "b"], columns_order={0: "a", 1: "b", 2: "c"}, ) df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) else: raise NotImplementedError(self_dtype) patch.assert_not_called() @pytest.mark.parametrize("self_dtype", ["materialized", "partial", "unknown"]) @pytest.mark.parametrize( "value, value_dtype", [ [3.5, np.dtype(float)], [[3.5, 2.4], np.dtype(float)], [np.array([3.5, 2.4]), np.dtype(float)], [pd.Series([3.5, 2.4]), np.dtype(float)], ], ) def test_preserve_dtypes_insert(self, self_dtype, value, value_dtype): with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) if self_dtype == "materialized": assert df._query_compiler.frame_has_materialized_dtypes elif self_dtype == "partial": df._query_compiler.set_frame_dtypes_cache( ModinDtypes( DtypesDescriptor( {"a": np.dtype("int64")}, 
cols_with_unknown_dtypes=["b"] ) ) ) elif self_dtype == "unknown": df._query_compiler.set_frame_dtypes_cache(None) else: raise NotImplementedError(self_dtype) df.insert(loc=0, column="c", value=value) if self_dtype == "materialized": result_dtype = pandas.Series( [value_dtype, np.dtype("int64"), np.dtype("int64")], index=["c", "a", "b"], ) assert df._query_compiler.frame_has_materialized_dtypes assert df.dtypes.equals(result_dtype) elif self_dtype == "partial": result_dtype = DtypesDescriptor( {"a": np.dtype("int64"), "c": value_dtype}, cols_with_unknown_dtypes=["b"], columns_order={0: "c", 1: "a", 2: "b"}, ) df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) elif self_dtype == "unknown": result_dtype = DtypesDescriptor( {"c": value_dtype}, cols_with_unknown_dtypes=["a", "b"], columns_order={0: "c", 1: "a", 2: "b"}, ) df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) else: raise NotImplementedError(self_dtype) patch.assert_not_called() def test_get_dummies_case(self): with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: df = pd.DataFrame( {"items": [1, 2, 3, 4], "b": [3, 3, 4, 4], "c": [1, 0, 0, 1]} ) res = pd.get_dummies(df, columns=["b", "c"]) cols = [col for col in res.columns if col != "items"] res[cols] = res[cols] / res[cols].mean() assert res._query_compiler.frame_has_materialized_dtypes patch.assert_not_called() @pytest.mark.parametrize("has_materialized_index", [True, False]) @pytest.mark.parametrize("drop", [True, False]) def test_preserve_dtypes_reset_index(self, drop, has_materialized_index): with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: # case 1: 'df' has complete dtype by default df = pd.DataFrame({"a": [1, 2, 3]}) if has_materialized_index: assert df._query_compiler.frame_has_materialized_index else: df._query_compiler.set_frame_index_cache(None) assert not df._query_compiler.frame_has_materialized_index assert df._query_compiler.frame_has_materialized_dtypes res = 
df.reset_index(drop=drop) if drop: # we droped the index, so columns and dtypes shouldn't change assert res._query_compiler.frame_has_materialized_dtypes assert res.dtypes.equals(df.dtypes) else: if has_materialized_index: # we should have inserted index dtype into the descriptor, # and since both of them are materialized, the result should be # materialized too assert res._query_compiler.frame_has_materialized_dtypes assert res.dtypes.equals( pandas.Series( [np.dtype("int64"), np.dtype("int64")], index=["index", "a"] ) ) else: # we now know that there are cols with unknown name and dtype in our dataframe, # so the resulting dtypes should contain information only about original column expected_dtypes = DtypesDescriptor( {"a": np.dtype("int64")}, know_all_names=False, ) assert res._query_compiler._modin_frame._dtypes._value.equals( expected_dtypes ) # case 2: 'df' has partial dtype by default df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df._query_compiler.set_frame_dtypes_cache( ModinDtypes( DtypesDescriptor( {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"] ) ) ) if has_materialized_index: assert df._query_compiler.frame_has_materialized_index else: df._query_compiler.set_frame_index_cache(None) assert not df._query_compiler.frame_has_materialized_index res = df.reset_index(drop=drop) if drop: # we droped the index, so columns and dtypes shouldn't change assert res._query_compiler._modin_frame._dtypes._value.equals( df._query_compiler._modin_frame._dtypes._value ) else: if has_materialized_index: # we should have inserted index dtype into the descriptor, # the resulted dtype should have information about 'index' and 'a' columns, # and miss dtype info for 'b' column expected_dtypes = DtypesDescriptor( {"index": np.dtype("int64"), "a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"], columns_order={0: "index", 1: "a", 2: "b"}, ) assert res._query_compiler._modin_frame._dtypes._value.equals( expected_dtypes ) else: # we miss info about the 'index' 
column since it wasn't materialized at # the time of 'reset_index()' and we're still missing dtype info for 'b' column expected_dtypes = DtypesDescriptor( {"a": np.dtype("int64")}, cols_with_unknown_dtypes=["b"], know_all_names=False, ) assert res._query_compiler._modin_frame._dtypes._value.equals( expected_dtypes ) patch.assert_not_called() def test_groupby_index_dtype(self): with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: # case 1: MapReduce impl, Series as an output of groupby df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").size().reset_index(name="new_name") res_dtypes = res._query_compiler._modin_frame._dtypes._value assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") # case 2: ExperimentalImpl impl, Series as an output of groupby RangePartitioning.put(True) try: df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").size().reset_index(name="new_name") res_dtypes = res._query_compiler._modin_frame._dtypes._value assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") finally: RangePartitioning.put(False) # case 3: MapReduce impl, DataFrame as an output of groupby df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").sum().reset_index() res_dtypes = res._query_compiler._modin_frame._dtypes._value assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") # case 4: ExperimentalImpl impl, DataFrame as an output of groupby RangePartitioning.put(True) try: df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").sum().reset_index() res_dtypes = res._query_compiler._modin_frame._dtypes._value assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") finally: RangePartitioning.put(False) # case 5: FullAxis impl, DataFrame as an output of groupby df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = 
df.groupby("a").quantile().reset_index() res_dtypes = res._query_compiler._modin_frame._dtypes._value assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") patch.assert_not_called() @pytest.mark.skipif(Engine.get() != "Ray", reason="Ray specific") @pytest.mark.parametrize("mode", [None, "Auto", "On", "Off"]) def test_ray_lazy_exec_mode(mode): import ray from modin.config import LazyExecution from modin.core.execution.ray.common.deferred_execution import DeferredExecution from modin.core.execution.ray.common.utils import ObjectIDType from modin.core.execution.ray.implementations.pandas_on_ray.partitioning import ( PandasOnRayDataframePartition, ) orig_mode = LazyExecution.get() try: if mode is None: mode = LazyExecution.get() else: LazyExecution.put(mode) assert mode == LazyExecution.get() df = pandas.DataFrame({"A": [1, 2, 3]}) part = PandasOnRayDataframePartition(ray.put(df)) def func(df): return len(df) ray_func = ray.put(func) if mode == "Auto": assert isinstance(part.apply(ray_func)._data_ref, ObjectIDType) assert isinstance( part.add_to_apply_calls(ray_func)._data_ref, DeferredExecution ) elif mode == "On": assert isinstance(part.apply(ray_func)._data_ref, DeferredExecution) assert isinstance( part.add_to_apply_calls(ray_func)._data_ref, DeferredExecution ) elif mode == "Off": assert isinstance(part.apply(ray_func)._data_ref, ObjectIDType) assert isinstance(part.add_to_apply_calls(ray_func)._data_ref, ObjectIDType) else: pytest.fail(f"Invalid value: {mode}") finally: LazyExecution.put(orig_mode) @pytest.mark.skipif(Engine.get() != "Ray", reason="Ray specific") def test_materialization_hook_serialization(): @ray.remote(num_returns=1) def f1(): return [1, 2, 3] @ray.remote(num_returns=1) def f2(i): return i hook = MetaList(f1.remote())[2] assert ray.get(f2.remote(hook)) == 3 def test_remote_function(): def get_func(): @remote_function def remote_func(arg): return arg return remote_func def get_capturing_func(arg): 
@remote_function def remote_func(): return arg return remote_func if Engine.get() in ("Ray", "Unidist"): from modin.core.execution.utils import _remote_function_cache cache_len = len(_remote_function_cache) assert get_func() is get_func() assert get_func() in _remote_function_cache.values() assert get_capturing_func(1) not in _remote_function_cache.values() assert len(_remote_function_cache) == cache_len + 1 assert materialize(deploy(get_func(), [123])) == 123 assert get_capturing_func(1) is not get_capturing_func(2) assert ( materialize(deploy(get_capturing_func(1))) + materialize(deploy(get_capturing_func(2))) == 3 ) @pytest.mark.parametrize( "partitioning_scheme,expected_map_approach", [ pytest.param( lambda df: { "row_lengths": [df.shape[0] // CpuCount.get()] * CpuCount.get(), "column_widths": [df.shape[1]], }, "map_partitions", id="one_column_partition", ), pytest.param( lambda df: { "row_lengths": [df.shape[0] // (CpuCount.get() * 2)] * (CpuCount.get() * 2), "column_widths": [df.shape[1]], }, "map_partitions_joined_by_column", id="very_long_column_partition", ), pytest.param( lambda df: { "row_lengths": [df.shape[0] // CpuCount.get()] * CpuCount.get(), "column_widths": [df.shape[1] // CpuCount.get()] * CpuCount.get(), }, "map_axis_partitions", id="perfect_partitioning", ), ], ) def test_dynamic_partitioning(partitioning_scheme, expected_map_approach): data_size = MinRowPartitionSize.get() * CpuCount.get() data = {f"col{i}": np.ones(data_size) for i in range(data_size)} df = pandas.DataFrame(data) modin_df = construct_modin_df_by_scheme(df, partitioning_scheme(df)) partitions = modin_df._query_compiler._modin_frame._partitions partition_mgr_cls = modin_df._query_compiler._modin_frame._partition_mgr_cls with mock.patch.object( partition_mgr_cls, expected_map_approach, wraps=getattr(partition_mgr_cls, expected_map_approach), ) as expected_method: with context(DynamicPartitioning=True): partition_mgr_cls.map_partitions(partitions, lambda x: x * 2) 
expected_method.assert_called() @pytest.mark.parametrize("npartitions", [7, CpuCount.get() * 2]) def test_map_partitions_joined_by_column(npartitions): with context(NPartitions=npartitions): ncols = MinColumnPartitionSize.get() nrows = MinRowPartitionSize.get() * CpuCount.get() * 2 data = {f"col{i}": np.ones(nrows) for i in range(ncols)} df = pd.DataFrame(data) partitions = df._query_compiler._modin_frame._partitions partition_mgr_cls = df._query_compiler._modin_frame._partition_mgr_cls def map_func(df, first_arg, extra_arg=0): return df.map(lambda x: (x * first_arg) + extra_arg) column_splits = 2 map_func_args = (2,) map_func_kwargs = {"extra_arg": 1} # this approach doesn't work if column_splits == 0 with pytest.raises(ValueError): partition_mgr_cls.map_partitions_joined_by_column( partitions, 0, map_func, map_func_args, map_func_kwargs ) result_partitions = partition_mgr_cls.map_partitions_joined_by_column( partitions, column_splits, map_func, map_func_args, map_func_kwargs, ) assert ( result_partitions.shape == partitions.shape ), "The result has a different split than the original." for i in range(result_partitions.shape[0]): assert np.all( result_partitions[i][0].to_numpy() == 3 ), "Invalid map function result." 
def test_fold_operator(): new_index = list(range(500, 1000)) new_columns = ["b"] initial_df = pandas.DataFrame({"a": range(0, 1000)}) modin_df = pd.DataFrame(initial_df) expected_df = pandas.DataFrame( list(range(0, 1000, 2)), index=new_index, columns=new_columns ) def filter_func(df): result = df[df.index % 2 == 0] result.index = new_index result.columns = new_columns return result PandasQueryCompiler.filter_func = Fold.register(filter_func) def filter_modin_dataframe1(df): return df.__constructor__( query_compiler=df._query_compiler.filter_func( fold_axis=0, new_index=new_index, new_columns=new_columns, ) ) pd.DataFrame.filter_dataframe1 = filter_modin_dataframe1 filtered_df = modin_df.filter_dataframe1() df_equals(filtered_df, expected_df) def filter_modin_dataframe2(df): return df.__constructor__( query_compiler=df._query_compiler.filter_func(fold_axis=0) ) pd.DataFrame.filter_dataframe2 = filter_modin_dataframe2 filtered_df = modin_df.filter_dataframe2() df_equals(filtered_df, expected_df) def test_default_property_warning_name(): # Test that when a property defaults to pandas, the raised warning mentions the full name of # the pandas property rather than a hex address @property def _test_default_property(df): return "suspicious sentinel value" @property def qc_test_default_property(qc): return DataFrameDefault.register(_test_default_property)(qc) PandasQueryCompiler.qc_test_default_property = qc_test_default_property @property def dataframe_test_default_property(df): return df._query_compiler.qc_test_default_property pd.DataFrame.dataframe_test_default_property = dataframe_test_default_property with pytest.warns( UserWarning, match="> is not currently supported", ): pd.DataFrame([[1]]).dataframe_test_default_property @pytest.mark.parametrize( "modify_config", [ {Engine: "Ray"}, {Engine: "Dask"}, ], indirect=True, ) def test_daemonic_worker_protection(modify_config): # Test for issue #7346, wherein some operations on Dask cause a second submission of a task to 
# the Dask client from the worker scope, which should not cause a new client to be created def submission_triggering_row_operation(row): row_to_dict = row.to_dict() dict_to_row = pd.Series(row_to_dict) return dict_to_row df = pd.DataFrame( { "A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": [1, 2, 3, 4], "D": [1, 2, 3, 4], } ) df.apply(submission_triggering_row_operation, axis=1) ================================================ FILE: modin/tests/core/test_dispatcher.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
from contextlib import contextmanager

import pytest

import modin.pandas as pd
from modin.config import Backend, Engine, Execution, Parameter, StorageFormat
from modin.core.execution.dispatching.factories import factories
from modin.core.execution.dispatching.factories.dispatcher import (
    FactoryDispatcher,
    FactoryNotFoundError,
)
from modin.core.execution.python.implementations.pandas_on_python.io import (
    PandasOnPythonIO,
)
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.tests.pandas.utils import switch_execution


@contextmanager
def _switch_value(config: Parameter, value: str):
    """Temporarily put ``value`` into ``config`` and restore the old value on exit.

    The context manager yields whatever ``config.put(value)`` returns.
    """
    previous = config.get()
    try:
        yield config.put(value)
    finally:
        # Always restore, even when the body (or the put itself) raised.
        config.put(previous)


class PandasOnTestFactory(factories.BaseFactory):
    """
    Stub factory used to check that the execution engine can be switched to 'Test'
    """

    @classmethod
    def prepare(cls):
        """
        Lazily set the .io_cls class attribute
        """
        cls.io_cls = "Foo"


class TestOnPythonFactory(factories.BaseFactory):
    """
    Stub factory used to check that the partition format can be switched to 'Test'
    """

    @classmethod
    def prepare(cls):
        """
        Lazily set the .io_cls class attribute
        """
        cls.io_cls = "Bar"


class FooOnBarFactory(factories.BaseFactory):
    """
    Stub factory used to check that engine and partition can be switched to 'Foo' and 'Bar'
    """

    @classmethod
    def prepare(cls):
        """
        Lazily set the .io_cls class attribute
        """
        cls.io_cls = "Zug-zug"


# make the stubs discoverable through the factories module
factories.PandasOnTestFactory = PandasOnTestFactory
factories.TestOnPythonFactory = TestOnPythonFactory
factories.FooOnBarFactory = FooOnBarFactory

Backend.register_backend("Test1", Execution(engine="Test", storage_format="Pandas"))
Backend.register_backend("Test2", Execution(engine="Python", storage_format="Test"))
Backend.register_backend("Test3", Execution(engine="Bar", storage_format="Foo"))
Backend.register_backend("Test4", Execution(engine="Dask", storage_format="Pyarrow"))

# register them as known "no init" engines for modin.pandas
Engine.NOINIT_ENGINES |= {"Test", "Bar"}


def test_default_factory():
    """The default factory is a ``BaseFactory`` subclass with a truthy ``io_cls``."""
    assert issubclass(FactoryDispatcher.get_factory(), factories.BaseFactory)
    assert FactoryDispatcher.get_factory().io_cls


def test_factory_switch():
    """Switching either ``Engine`` or ``StorageFormat`` to 'Test' selects the matching stub."""
    cases = (
        (Engine, PandasOnTestFactory, "Foo"),
        (StorageFormat, TestOnPythonFactory, "Bar"),
    )
    with switch_execution("Python", "Pandas"):
        for config, expected_factory, expected_io_cls in cases:
            with _switch_value(config, "Test"):
                assert FactoryDispatcher.get_factory() == expected_factory
                assert FactoryDispatcher.get_factory().io_cls == expected_io_cls


def test_engine_wrong_factory():
    """Selecting Dask with Pyarrow storage raises ``FactoryNotFoundError``."""
    with pytest.raises(FactoryNotFoundError):
        with _switch_value(Engine, "Dask"), _switch_value(StorageFormat, "Pyarrow"):
            pass


def test_set_execution():
    """``switch_execution`` with a known engine/storage pair selects its factory."""
    with switch_execution("Bar", "Foo"):
        assert FactoryDispatcher.get_factory() == FooOnBarFactory


def test_add_option():
    """Options added at runtime can be used to dispatch to a newly injected factory."""

    class DifferentlyNamedFactory(factories.BaseFactory):
        @classmethod
        def prepare(cls):
            cls.io_cls = PandasOnPythonIO

    # The attribute name on `factories` differs from the class name.
    factories.StorageOnExecFactory = DifferentlyNamedFactory
    StorageFormat.add_option("sToragE")
    Engine.add_option("Exec")
    new_execution = Execution(engine="Exec", storage_format="Storage")
    Backend.register_backend(name="Test5", execution=new_execution)

    with switch_execution("Exec", "Storage"):
        df = pd.DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]])
        assert isinstance(df._query_compiler, PandasQueryCompiler)


# ================================================
# FILE: modin/tests/experimental/__init__.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/experimental/spreadsheet/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/experimental/spreadsheet/test_general.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy as np
import pandas
import pytest
from modin_spreadsheet import SpreadsheetWidget

import modin.experimental.spreadsheet as mss
import modin.pandas as pd


def get_test_data():
    """Return the column-name -> column-data mapping shared by the spreadsheet tests."""
    return dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([5, 2, 3, 1], dtype="int32"),
        E=pd.Categorical(["test", "train", "foo", "bar"]),
        F=["foo", "bar", "buzz", "fox"],
    )


def test_from_dataframe():
    """``from_dataframe`` accepts a Modin frame (with or without options) and rejects pandas."""
    raw = get_test_data()
    modin_df = pd.DataFrame(raw)
    pandas_df = pandas.DataFrame(raw)

    widget = mss.from_dataframe(modin_df)
    assert isinstance(widget, SpreadsheetWidget)

    # A plain pandas frame is not accepted.
    with pytest.raises(TypeError):
        mss.from_dataframe(pandas_df)

    # Check parameters don't error
    def can_edit_row(row):
        return row["D"] > 2

    widget_options = {
        "show_toolbar": True,
        "show_history": True,
        "precision": 1,
        "grid_options": {"forceFitColumns": False, "filterable": False},
        "column_options": {"D": {"editable": True}},
        "column_definitions": {"editable": False},
        "row_edit_callback": can_edit_row,
    }
    widget = mss.from_dataframe(modin_df, **widget_options)
    assert isinstance(widget, SpreadsheetWidget)


def test_to_dataframe():
    """``to_dataframe`` round-trips a widget and rejects anything that is not a widget."""
    raw = get_test_data()
    modin_df = pd.DataFrame(raw)
    pandas_df = pandas.DataFrame(raw)

    widget = mss.from_dataframe(modin_df)
    round_tripped = mss.to_dataframe(widget)
    assert round_tripped.equals(modin_df)

    for not_a_widget in ("Not a SpreadsheetWidget", pandas_df):
        with pytest.raises(TypeError):
            mss.to_dataframe(not_a_widget)


# ================================================
# FILE: modin/tests/experimental/test_fuzzydata.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import glob
import os
import shutil
import uuid

from fuzzydata.clients.modin import ModinWorkflow
from fuzzydata.core.generator import generate_workflow

from modin.config import Engine


def test_fuzzydata_sample_workflow():
    """Generate a small fuzzydata workflow on the current engine and check its outputs.

    The test wipes and recreates an engine-specific directory under ``/tmp``,
    generates the workflow there, and asserts on the number of versions, the
    artifact CSV files, and the operations / graph files written next to them.
    """
    # Workflow Generation Options
    wf_name = str(uuid.uuid4())[:8]  # Unique name for the generated workflow
    engine = Engine.get().lower()
    num_versions = 10  # Number of unique CSV files to generate
    generation_options = {
        "num_versions": num_versions,
        # (columns, rows) in the base artifact
        "base_shape": (33, 1000),
        # Branching Factor - 0.1 is linear, 10.0 is star-like
        "bfactor": 1.0,
        # In-Memory groupby operations cause issue #4287
        "exclude_ops": ["groupby"],
        # How many operations to chain before materialization
        "matfreq": 2,
        "wf_options": {"modin_engine": engine},
    }

    # Create Output Directory for Workflow Data
    # Must match corresponding github-action
    base_out_directory = f"/tmp/fuzzydata-test-wf-{engine}/"
    if os.path.exists(base_out_directory):
        shutil.rmtree(base_out_directory)
    output_directory = f"{base_out_directory}/{wf_name}/"
    os.makedirs(output_directory, exist_ok=True)

    # Start Workflow Generation
    workflow = generate_workflow(
        workflow_class=ModinWorkflow,
        name=wf_name,
        out_directory=output_directory,
        **generation_options,
    )

    # Assertions that the workflow generation worked correctly
    assert len(workflow) == num_versions

    artifact_files = glob.glob(f"{output_directory}/artifacts/*.csv")
    assert len(artifact_files) == len(workflow.artifact_dict)

    operations_file = f"{output_directory}/{workflow.name}_operations.json"
    assert os.path.exists(operations_file)
    assert os.path.getsize(operations_file) > 0

    assert os.path.exists(f"{output_directory}/{workflow.name}_gt_graph.csv")


# ================================================
# FILE: modin/tests/experimental/test_io_exp.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import json
import platform
from pathlib import Path

import numpy as np
import pandas
import pytest
from pandas._testing import ensure_clean

import modin.experimental.pandas as pd
from modin.config import AsyncReadMode, Engine
from modin.tests.pandas.utils import (
    df_equals,
    eval_general,
    parse_dates_values_by_id,
    test_data,
    time_parsing_csv_path,
)
from modin.tests.test_utils import (
    current_execution_is_native,
    warns_that_defaulting_to_pandas_if,
)
from modin.utils import try_cast_to_pandas


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
def test_from_sql_distributed(tmp_path, make_sql_connection):
    """Partitioned ``read_sql`` (by query and by table name) matches ``pandas.read_sql``."""
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(str(tmp_path / filename), table)
    query = f"select * from {table}"

    pandas_df = pandas.read_sql(query, conn)
    modin_df_from_query = pd.read_sql(
        query,
        conn,
        partition_column="col1",
        lower_bound=0,
        upper_bound=6,
        max_sessions=2,
    )
    modin_df_from_table = pd.read_sql(
        table,
        conn,
        partition_column="col1",
        lower_bound=0,
        upper_bound=6,
        max_sessions=2,
    )

    df_equals(modin_df_from_query, pandas_df)
    df_equals(modin_df_from_table, pandas_df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
def test_from_sql_defaults(tmp_path, make_sql_connection):
    """``read_sql`` without partitioning arguments warns and still matches pandas."""
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(str(tmp_path / filename), table)
    query = f"select * from {table}"

    pandas_df = pandas.read_sql(query, conn)

    with pytest.warns(UserWarning):
        modin_df_from_query = pd.read_sql(query, conn)
    with pytest.warns(UserWarning):
        modin_df_from_table = pd.read_sql(table, conn)

    df_equals(modin_df_from_query, pandas_df)
    df_equals(modin_df_from_table, pandas_df)


@pytest.mark.usefixtures("TestReadGlobCSVFixture")
@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental glob API",
)
class TestCsvGlob:
    """Tests for ``read_csv_glob``.

    Several tests read ``pytest.files`` and ``pytest.glob_path``; presumably these
    are populated by the ``TestReadGlobCSVFixture`` fixture - confirm in conftest.
    """

    def test_read_multiple_small_csv(self):
        """Reading a glob equals concatenating the per-file pandas reads."""
        pandas_df = pandas.concat([pandas.read_csv(fname) for fname in pytest.files])
        modin_df = pd.read_csv_glob(pytest.glob_path)

        # Indexes get messed up when concatting so we reset both.
        pandas_df = pandas_df.reset_index(drop=True)
        modin_df = modin_df.reset_index(drop=True)

        df_equals(modin_df, pandas_df)

    @pytest.mark.parametrize("nrows", [35, 100])
    def test_read_multiple_csv_nrows(self, request, nrows):
        """``nrows`` limits the glob read to the first ``nrows`` rows of the concatenation."""
        pandas_df = pandas.concat([pandas.read_csv(fname) for fname in pytest.files])
        pandas_df = pandas_df.iloc[:nrows, :]

        modin_df = pd.read_csv_glob(pytest.glob_path, nrows=nrows)

        # Indexes get messed up when concatting so we reset both.
        pandas_df = pandas_df.reset_index(drop=True)
        modin_df = modin_df.reset_index(drop=True)

        df_equals(modin_df, pandas_df)

    def test_read_csv_empty_frame(self):
        """Using the only selected column as the index yields the same frame as pandas."""
        kwargs = {
            "usecols": [0],
            "index_col": 0,
        }

        modin_df = pd.read_csv_glob(pytest.files[0], **kwargs)
        pandas_df = pandas.read_csv(pytest.files[0], **kwargs)

        df_equals(modin_df, pandas_df)

    def test_read_csv_without_glob(self):
        """An S3 path with no glob pattern is expected to raise ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
                pd.read_csv_glob(
                    "s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-",
                    storage_options={"anon": True},
                )

    def test_read_csv_glob_4373(self, tmp_path):
        """A 1x1 CSV read with ``usecols`` matches pandas (name suggests issue #4373)."""
        columns, filename = ["col0"], str(tmp_path / "1x1.csv")
        df = pd.DataFrame([[1]], columns=columns)
        with warns_that_defaulting_to_pandas_if(df._query_compiler.engine == "Dask"):
            df.to_csv(filename)

        kwargs = {"filepath_or_buffer": filename, "usecols": columns}
        modin_df = pd.read_csv_glob(**kwargs)
        pandas_df = pandas.read_csv(**kwargs)
        df_equals(modin_df, pandas_df)

    @pytest.mark.parametrize(
        "parse_dates",
        [
            pytest.param(value, id=param_id)
            for param_id, value in parse_dates_values_by_id.items()
        ],
    )
    def test_read_single_csv_with_parse_dates(self, parse_dates):
        """``parse_dates`` gives the same frame as pandas, or the same exception type."""
        try:
            pandas_df = pandas.read_csv(time_parsing_csv_path, parse_dates=parse_dates)
        except Exception as pandas_exception:
            # pandas rejected the argument: Modin must fail with a compatible type.
            with pytest.raises(Exception) as modin_exception:
                modin_df = pd.read_csv_glob(
                    time_parsing_csv_path, parse_dates=parse_dates
                )
                try_cast_to_pandas(modin_df)  # force materialization
            assert isinstance(
                modin_exception.value, type(pandas_exception)
            ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format(
                type(modin_exception.value), type(pandas_exception)
            )
        else:
            modin_df = pd.read_csv_glob(time_parsing_csv_path, parse_dates=parse_dates)
            df_equals(modin_df, pandas_df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental glob API",
)
@pytest.mark.parametrize(
    "path",
    [
        "s3://modin-test/modin-bugs/multiple_csv/test_data*.csv",
    ],
)
def test_read_multiple_csv_cloud_store(path, s3_resource, s3_storage_options):
    """A glob over S3 objects matches reading the two files individually with pandas."""

    def _pandas_read_csv_glob(path, storage_options):
        # pandas has no glob reader: expand "<prefix>*.csv" to "<prefix>0.csv", "<prefix>1.csv".
        pandas_dfs = [
            pandas.read_csv(
                f"{path.lower().split('*')[0]}{i}.csv", storage_options=storage_options
            )
            for i in range(2)
        ]
        return pandas.concat(pandas_dfs).reset_index(drop=True)

    eval_general(
        pd,
        pandas,
        lambda module, **kwargs: (
            pd.read_csv_glob(path, **kwargs).reset_index(drop=True)
            if hasattr(module, "read_csv_glob")
            else _pandas_read_csv_glob(path, **kwargs)
        ),
        storage_options=s3_storage_options,
    )


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize(
    "storage_options_extra",
    [{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}],
)
def test_read_multiple_csv_s3_storage_opts(
    s3_resource, s3_storage_options, storage_options_extra
):
    """Extra ``storage_options`` are honoured; any ``anon`` setting expects ``PermissionError``."""
    s3_path = "s3://modin-test/modin-bugs/multiple_csv/"

    def _pandas_read_csv_glob(path, storage_options):
        # NOTE: `path` is accepted for signature parity but the files are derived
        # from the enclosing `s3_path`.
        pandas_df = pandas.concat(
            [
                pandas.read_csv(
                    f"{s3_path}test_data{i}.csv",
                    storage_options=storage_options,
                )
                for i in range(2)
            ],
        ).reset_index(drop=True)
        return pandas_df

    expected_exception = None
    if "anon" in storage_options_extra:
        expected_exception = PermissionError("Forbidden")
    eval_general(
        pd,
        pandas,
        lambda module, **kwargs: (
            pd.read_csv_glob(s3_path, **kwargs)
            if hasattr(module, "read_csv_glob")
            else _pandas_read_csv_glob(s3_path, **kwargs)
        ),
        storage_options=s3_storage_options | storage_options_extra,
        expected_exception=expected_exception,
    )


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize("pathlike", [False, True])
@pytest.mark.parametrize("compression", [None, "gzip"])
@pytest.mark.parametrize(
    "filename", ["test_default_to_pickle.pkl", "test_to_pickle*.pkl"]
)
@pytest.mark.parametrize("read_func", ["read_pickle_glob"])
@pytest.mark.parametrize("to_func", ["to_pickle_glob"])
def test_distributed_pickling(
    tmp_path, filename, compression, pathlike, read_func, to_func
):
    """Round-trip a frame through ``to_pickle_glob`` / ``read_pickle_glob``.

    A default-to-pandas warning is expected only for the file name without ``*``.
    """
    data = test_data["int_data"]
    df = pd.DataFrame(data)

    # Remember the un-suffixed name: it decides whether a warning is expected.
    filename_param = filename
    if compression:
        filename = f"{filename}.gz"

    filename = Path(filename) if pathlike else filename

    with warns_that_defaulting_to_pandas_if(
        filename_param == "test_default_to_pickle.pkl"
    ):
        getattr(df.modin, to_func)(str(tmp_path / filename), compression=compression)
        pickled_df = getattr(pd, read_func)(
            str(tmp_path / filename), compression=compression
        )
    df_equals(pickled_df, df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize(
    "filename",
    ["test_parquet_glob.parquet", "test_parquet_glob*.parquet"],
)
def test_parquet_glob(tmp_path, filename):
    """Round-trip a frame through ``to_parquet_glob`` / ``read_parquet_glob``."""
    data = test_data["int_data"]
    df = pd.DataFrame(data)

    filename_param = filename

    with warns_that_defaulting_to_pandas_if(
        filename_param == "test_parquet_glob.parquet"
    ):
        df.modin.to_parquet_glob(str(tmp_path / filename))
        read_df = pd.read_parquet_glob(str(tmp_path / filename))
    df_equals(read_df, df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize(
    "filename",
    ["test_json_glob.json", "test_json_glob*.json"],
)
def test_json_glob(tmp_path, filename):
    """Round-trip a frame through ``to_json_glob`` / ``read_json_glob``."""
    data = test_data["int_data"]
    df = pd.DataFrame(data)

    filename_param = filename

    with warns_that_defaulting_to_pandas_if(filename_param == "test_json_glob.json"):
        df.modin.to_json_glob(str(tmp_path / filename))
        read_df = pd.read_json_glob(str(tmp_path / filename))
    df_equals(read_df, df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize(
    "filename",
    ["test_xml_glob.xml", "test_xml_glob*.xml"],
)
@pytest.mark.skipif(
    platform.system() == "Windows",
    reason="https://github.com/modin-project/modin/issues/7497",
)
def test_xml_glob(tmp_path, filename):
    """Round-trip a frame through ``to_xml_glob`` / ``read_xml_glob``."""
    data = test_data["int_data"]
    df = pd.DataFrame(data)

    filename_param = filename

    with warns_that_defaulting_to_pandas_if(filename_param == "test_xml_glob.xml"):
        df.modin.to_xml_glob(str(tmp_path / filename), index=False)
        read_df = pd.read_xml_glob(str(tmp_path / filename))

    # Index get messed up when concatting so we reset it.
    read_df = read_df.reset_index(drop=True)
    df_equals(read_df, df)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental read_custom_text API",
)
@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_custom_json_text(set_async_read_mode):
    """``read_custom_text`` with a JSON-lines parser matches ``read_json`` on the same columns."""

    def _generate_json(file_name, nrows, ncols):
        data = np.random.rand(nrows, ncols)
        df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
        df.to_json(file_name, lines=True, orient="records")

    # Custom parser allows us to add some specifics to reading files,
    # which is not available through the ready-made API.
    # For example, the parser allows us to reduce the amount of RAM
    # required for reading by selecting a subset of columns.
    def _custom_parser(io_input, **kwargs):
        result = {"col0": [], "col1": [], "col3": []}
        for line in io_input:
            # for example, simjson can be used here
            obj = json.loads(line)
            for key in result:
                result[key].append(obj[key])
        return pandas.DataFrame(result).rename(columns={"col0": "testID"})

    with ensure_clean() as filename:
        _generate_json(filename, 64, 8)

        df1 = pd.read_custom_text(
            filename,
            columns=["testID", "col1", "col3"],
            custom_parser=_custom_parser,
            is_quoting=False,
        )
        df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
            columns={"col0": "testID"}
        )
        if AsyncReadMode.get():
            # If read operations are asynchronous, then the dataframes
            # check should be inside `ensure_clean` context
            # because the file may be deleted before actual reading starts
            df_equals(df1, df2)
    if not AsyncReadMode.get():
        df_equals(df1, df2)


@pytest.mark.skipif(
    Engine.get() not in ("Ray", "Unidist", "Dask"),
    reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_evaluated_dict(set_async_read_mode):
    """``read_custom_text`` gives the same frame for a column list and a columns callback.

    The input file holds one ``str(dict)`` per line, a format no ready-made reader
    handles, so both the parser and the columns callback evaluate each line.
    """

    def _generate_evaluated_dict(file_name, nrows, ncols):
        result = {}
        keys = [f"col{x}" for x in range(ncols)]

        with open(file_name, mode="w") as _file:
            for _ in range(nrows):
                data = np.random.rand(ncols)
                for idx, key in enumerate(keys):
                    result[key] = data[idx]
                _file.write(str(result))
                _file.write("\n")

    # This parser allows us to read a format not supported by other reading functions
    def _custom_parser(io_input, **kwargs):
        col1_values = []
        col2_values = []
        for line in io_input:
            # NOTE: `eval` is tolerable only because the file is written by this
            # test itself; never evaluate untrusted input this way.
            obj = eval(line)
            col1_values.append(obj["col1"])
            col2_values.append(obj["col2"])
        # Each output column carries the values read from the key of the same name.
        return pandas.DataFrame({"col1": col1_values, "col2": col2_values})

    def columns_callback(io_input, **kwargs):
        columns = None
        # Only the first line is needed to discover the column names.
        for line in io_input:
            columns = list(eval(line).keys())[1:3]
            break
        return columns

    with ensure_clean() as filename:
        _generate_evaluated_dict(filename, 64, 8)

        df1 = pd.read_custom_text(
            filename,
            columns=["col1", "col2"],
            custom_parser=_custom_parser,
        )
        assert df1.shape == (64, 2)

        df2 = pd.read_custom_text(
            filename, columns=columns_callback, custom_parser=_custom_parser
        )
        if AsyncReadMode.get():
            # If read operations are asynchronous, then the dataframes
            # check should be inside `ensure_clean` context
            # because the file may be deleted before actual reading starts
            df_equals(df1, df2)
    if not AsyncReadMode.get():
        df_equals(df1, df2)


# ================================================
# FILE: modin/tests/experimental/test_pipeline.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import Engine, NPartitions from modin.core.execution.ray.common import RayWrapper from modin.distributed.dataframe.pandas.partitions import from_partitions from modin.experimental.batch.pipeline import PandasQueryPipeline from modin.tests.pandas.utils import df_equals @pytest.mark.skipif( Engine.get() != "Ray", reason="Only Ray supports the Batch Pipeline API", ) class TestPipelineRayEngine: def test_warnings(self): """Ensure that creating a Pipeline object raises the correct warnings.""" arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) # Ensure that building a pipeline warns users that it is an experimental feature with pytest.warns( UserWarning, match="The Batch Pipeline API is an experimental feature and still under development in Modin.", ): pipeline = PandasQueryPipeline(df) with pytest.warns( UserWarning, match="No outputs to compute. Returning an empty list. Please specify outputs by calling `add_query` with `is_output=True`.", ): output = pipeline.compute_batch() assert output == [], "Empty pipeline did not return an empty list." 
def test_pipeline_simple(self): """Create a simple pipeline and ensure that it runs end to end correctly.""" arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) def add_col(df): df["new_col"] = df.sum(axis=1) return df # Build pipeline pipeline = PandasQueryPipeline(df) pipeline.add_query(add_col) pipeline.add_query(lambda df: df * -30) pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}) ) def add_row_to_partition(df): return pandas.concat([df, df.iloc[[-1]]]) pipeline.add_query(add_row_to_partition, is_output=True) new_df = pipeline.compute_batch()[0] # Build df without pipelining to ensure correctness correct_df = add_col(pd.DataFrame(arr)) correct_df *= -30 correct_df = pd.DataFrame( correct_df.rename(columns={i: f"col {i}" for i in range(1000)})._to_pandas() ) correct_modin_frame = correct_df._query_compiler._modin_frame partitions = correct_modin_frame._partition_mgr_cls.row_partitions( correct_modin_frame._partitions ) partitions = [ partition.add_to_apply_calls(add_row_to_partition) for partition in partitions ] [partition.drain_call_queue() for partition in partitions] partitions = [partition.list_of_blocks for partition in partitions] correct_df = from_partitions(partitions, axis=None) # Compare pipelined and non-pipelined df df_equals(correct_df, new_df) # Ensure that setting `num_partitions` when creating a pipeline does not change `NPartitions` num_partitions = NPartitions.get() PandasQueryPipeline(df, num_partitions=(num_partitions - 1)) assert ( NPartitions.get() == num_partitions ), "Pipeline did not change NPartitions.get()" def test_update_df(self): """Ensure that `update_df` updates the df that the pipeline runs on.""" df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df + 3, is_output=True) new_df = df * -1 pipeline.update_df(new_df) output_df = pipeline.compute_batch()[0] df_equals((df * -1) + 3, output_df) def 
test_multiple_outputs(self): """Create a pipeline with multiple outputs, and check that all are computed correctly.""" arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df * -30, is_output=True) pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, ) pipeline.add_query(lambda df: df + 30, is_output=True) new_dfs = pipeline.compute_batch() assert len(new_dfs) == 3, "Pipeline did not return all outputs" correct_df = pd.DataFrame(arr) * -30 df_equals(correct_df, new_dfs[0]) # First output computed correctly correct_df = correct_df.rename(columns={i: f"col {i}" for i in range(1000)}) df_equals(correct_df, new_dfs[1]) # Second output computed correctly correct_df += 30 df_equals(correct_df, new_dfs[2]) # Third output computed correctly def test_output_id(self): """Ensure `output_id` is handled correctly when passed.""" arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) pipeline = PandasQueryPipeline(df, 0) pipeline.add_query(lambda df: df * -30, is_output=True, output_id=20) with pytest.raises( ValueError, match="Output ID must be specified for all nodes." ): pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, ) assert ( len(pipeline.query_list) == 0 and len(pipeline.outputs) == 1 ), "Invalid `add_query` incorrectly added a node to the pipeline." pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df * -30, is_output=True) with pytest.raises( ValueError, match="Output ID must be specified for all nodes." ): pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, output_id=20, ) assert ( len(pipeline.query_list) == 0 and len(pipeline.outputs) == 1 ), "Invalid `add_query` incorrectly added a node to the pipeline." 
pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df, is_output=True) with pytest.raises( ValueError, match=( "`pass_output_id` is set to True, but output ids have not been specified. " + "To pass output ids, please specify them using the `output_id` kwarg with pipeline.add_query" ), ): pipeline.compute_batch(postprocessor=lambda df: df, pass_output_id=True) with pytest.raises( ValueError, match="Output ID cannot be specified for non-output node.", ): pipeline.add_query(lambda df: df, output_id=22) assert ( len(pipeline.query_list) == 0 and len(pipeline.outputs) == 1 ), "Invalid `add_query` incorrectly added a node to the pipeline." def test_output_id_multiple_outputs(self): """Ensure `output_id` is handled correctly when multiple outputs are computed.""" arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df * -30, is_output=True, output_id=20) pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, output_id=21, ) pipeline.add_query(lambda df: df + 30, is_output=True, output_id=22) new_dfs = pipeline.compute_batch() assert isinstance( new_dfs, dict ), "Pipeline did not return a dictionary mapping output_ids to dfs" assert 20 in new_dfs, "Output ID 1 not cached correctly" assert 21 in new_dfs, "Output ID 2 not cached correctly" assert 22 in new_dfs, "Output ID 3 not cached correctly" assert len(new_dfs) == 3, "Pipeline did not return all outputs" correct_df = pd.DataFrame(arr) * -30 df_equals(correct_df, new_dfs[20]) # First output computed correctly correct_df = correct_df.rename(columns={i: f"col {i}" for i in range(1000)}) df_equals(correct_df, new_dfs[21]) # Second output computed correctly correct_df += 30 df_equals(correct_df, new_dfs[22]) # Third output computed correctly def test_postprocessing(self): """Check that the `postprocessor` argument to `_compute_batch` is handled correctly.""" arr = 
np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df * -30, is_output=True) pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, ) pipeline.add_query(lambda df: df + 30, is_output=True) def new_col_adder(df): df["new_col"] = df.iloc[:, -1] return df new_dfs = pipeline.compute_batch(postprocessor=new_col_adder) assert len(new_dfs) == 3, "Pipeline did not return all outputs" correct_df = pd.DataFrame(arr) * -30 correct_df["new_col"] = correct_df.iloc[:, -1] df_equals(correct_df, new_dfs[0]) correct_df = correct_df.drop(columns=["new_col"]) correct_df = correct_df.rename(columns={i: f"col {i}" for i in range(1000)}) correct_df["new_col"] = correct_df.iloc[:, -1] df_equals(correct_df, new_dfs[1]) correct_df = correct_df.drop(columns=["new_col"]) correct_df += 30 correct_df["new_col"] = correct_df.iloc[:, -1] df_equals(correct_df, new_dfs[2]) def test_postprocessing_with_output_id(self): """Check that the `postprocessor` argument is correctly handled when `output_id` is specified.""" def new_col_adder(df): df["new_col"] = df.iloc[:, -1] return df arr = np.random.randint(0, 1000, (1000, 1000)) df = pd.DataFrame(arr) pipeline = PandasQueryPipeline(df) pipeline.add_query(lambda df: df * -30, is_output=True, output_id=20) pipeline.add_query( lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}), is_output=True, output_id=21, ) pipeline.add_query(lambda df: df + 30, is_output=True, output_id=22) new_dfs = pipeline.compute_batch(postprocessor=new_col_adder) assert len(new_dfs) == 3, "Pipeline did not return all outputs" def test_postprocessing_with_output_id_passed(self): """Check that the `postprocessor` argument is correctly passed `output_id` when `pass_output_id` is `True`.""" arr = np.random.randint(0, 1000, (1000, 1000)) def new_col_adder(df, o_id): df["new_col"] = o_id return df df = pd.DataFrame(arr) pipeline = 
def test_postprocessing_with_partition_id(self):
    """Check that the postprocessor is handed the partition index when `pass_partition_id=True`."""
    data = np.random.randint(0, 1000, (1000, 1000))

    def new_col_adder(df, partition_id):
        df["new_col"] = partition_id
        return df

    def postprocess_per_row_partition(frame):
        # Reference implementation: queue `new_col_adder` on every row
        # partition of `frame` with the partition's position as its id,
        # execute the queued calls, and reassemble a frame from the blocks.
        modin_frame = frame._query_compiler._modin_frame
        row_parts = modin_frame._partition_mgr_cls.row_partitions(
            modin_frame._partitions
        )
        row_parts = [
            part.add_to_apply_calls(new_col_adder, position)
            for position, part in enumerate(row_parts)
        ]
        for part in row_parts:
            part.drain_call_queue()
        blocks = [part.list_of_blocks for part in row_parts]
        return from_partitions(blocks, axis=None)

    pipeline = PandasQueryPipeline(pd.DataFrame(data))
    pipeline.add_query(lambda df: df * -30, is_output=True, output_id=20)
    pipeline.add_query(
        lambda df: df.rename(columns={i: f"col {i}" for i in range(1000)}),
        is_output=True,
        output_id=21,
    )
    results = pipeline.compute_batch(
        postprocessor=new_col_adder, pass_partition_id=True
    )

    # First output: scaled data, then postprocessed partition by partition.
    expected = postprocess_per_row_partition(pd.DataFrame(data) * -30)
    df_equals(expected, results[20])

    # Second output: undo the postprocessing column, rename, then rebuild a
    # Modin frame from the pandas copy before postprocessing again.
    # NOTE(review): the pandas round-trip presumably resets the partitioning
    # to the default layout - confirm.
    renamed = expected.drop(columns=["new_col"]).rename(
        columns={i: f"col {i}" for i in range(1000)}
    )
    expected = postprocess_per_row_partition(pd.DataFrame(renamed._to_pandas()))
    df_equals(expected, results[21])
def test_repartition_after(self):
    """Check `repartition_after`: a single-partition result is repartitioned, a multi-partition frame is rejected."""

    def new_col_adder(df, partition_id):
        df["new_col"] = partition_id
        return df

    single_row_df = pd.DataFrame([list(range(1000))])
    pipeline = PandasQueryPipeline(single_row_df)
    pipeline.add_query(
        lambda df: pandas.concat([df] * 1000), repartition_after=True
    )
    pipeline.add_query(new_col_adder, is_output=True, pass_partition_id=True)
    results = pipeline.compute_batch()

    # `new_col_adder` writes the partition ID into `new_col` for every row.
    # Because the previous step repartitioned, NPartitions.get() partitions
    # are expected by the time it runs, hence that many distinct IDs.
    distinct_partition_ids = results[0]["new_col"].unique()
    assert len(distinct_partition_ids) == NPartitions.get()

    # `repartition_after=True` must raise when the result has more than one
    # partition, so build a frame out of two explicit partitions.
    partition_refs = [
        RayWrapper.put(pandas.DataFrame([row])) for row in ([0, 1, 2], [3, 4, 5])
    ]
    two_partition_df = from_partitions(partition_refs, 0)
    pipeline = PandasQueryPipeline(two_partition_df, 0)
    pipeline.add_query(lambda df: df, repartition_after=True, is_output=True)
    with pytest.raises(
        NotImplementedError,
        match="Dynamic repartitioning is currently only supported for DataFrames with 1 partition.",
    ):
        pipeline.compute_batch()
def test_pipeline_complex(self):
    """Run a pipeline combining `fan_out`, `repartition_after` and postprocessing end to end.

    The pipeline has two outputs: the reduced fan-out result (output id 20)
    and the repartitioned frame whose partitions are also written to
    ``<partition_id>.csv`` files in the current directory (output id 21).
    The CSV files are removed afterwards, whether or not the checks pass.
    """
    from os import remove
    from os.path import exists
    from time import sleep

    df = pd.DataFrame([[0, 1, 2]])

    def new_col_adder(df, partition_id):
        # NOTE(review): the 60-second sleep is kept from the original test;
        # its purpose is not visible here - confirm it is still required.
        sleep(60)
        df["new_col"] = partition_id
        return df

    def reducer(dfs):
        # Concatenate the partition ids seen by every fanned-out copy and
        # store the result on the first copy, which becomes the output.
        new_cols = "".join([str(df["new_col"].values[0]) for df in dfs])
        dfs[0]["new_col1"] = new_cols
        return dfs[0]

    def to_csv(df, partition_id):
        df = df.drop(columns=["new_col"])
        df.to_csv(f"{partition_id}.csv")
        return df

    def post_proc(df, o_id, partition_id):
        df["new_col_proc"] = f"{o_id} {partition_id}"
        return df

    desired_num_partitions = 24
    csv_paths = [f"{i}.csv" for i in range(desired_num_partitions)]

    pipeline = PandasQueryPipeline(df, num_partitions=desired_num_partitions)
    pipeline.add_query(
        new_col_adder,
        fan_out=True,
        reduce_fn=reducer,
        pass_partition_id=True,
        is_output=True,
        output_id=20,
    )
    pipeline.add_query(
        lambda df: pandas.concat([df] * 1000),
        repartition_after=True,
    )
    pipeline.add_query(to_csv, is_output=True, output_id=21, pass_partition_id=True)

    try:
        new_dfs = pipeline.compute_batch(
            postprocessor=post_proc,
            pass_partition_id=True,
            pass_output_id=True,
        )

        # Output 20: the single input row plus the fan-out bookkeeping columns.
        correct_df = pd.DataFrame([[0, 1, 2]])
        correct_df["new_col"] = 0
        correct_df["new_col1"] = "".join(
            [str(i) for i in range(desired_num_partitions)]
        )
        correct_df["new_col_proc"] = "20 0"
        df_equals(correct_df, new_dfs[20])

        # Output 21: 1000 copies of that row, split into equal row ranges.
        correct_df = pd.concat([correct_df] * 1000)
        correct_df = correct_df.drop(columns=["new_col"])
        correct_df["new_col_proc"] = "21 0"
        new_length = len(correct_df.index) // desired_num_partitions

        def partition_rows(i):
            # Row range of partition `i`; the last partition is open-ended so
            # that it absorbs the rows left over by the integer division.
            start = i * new_length
            is_last = i == desired_num_partitions - 1
            return slice(start, None if is_last else start + new_length)

        for i in range(desired_num_partitions):
            correct_df.iloc[partition_rows(i), -1] = f"21 {i}"
        df_equals(correct_df, new_dfs[21])

        # The CSV files were written before postprocessing, so they must not
        # contain the `new_col_proc` column.
        correct_df = correct_df.drop(columns=["new_col_proc"])
        for i, csv_path in enumerate(csv_paths):
            assert exists(
                csv_path
            ), f"CSV File for Partition {i} does not exist, even though dataframe should have been repartitioned."
            df_equals(
                correct_df.iloc[partition_rows(i)],
                pd.read_csv(csv_path, index_col="Unnamed: 0").rename(
                    columns={"0": 0, "1": 1, "2": 2}
                ),
            )
    finally:
        # Always clean up, so a failing assertion does not leave files behind.
        for csv_path in csv_paths:
            if exists(csv_path):
                remove(csv_path)


@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="Ray supports the Batch Pipeline API",
)
def test_pipeline_unsupported_engine():
    """Ensure that trying to use the Pipeline API with an unsupported Engine raises errors.

    The test temporarily switches the global ``Engine`` to ``"Ray"``; the
    previous value is restored even if one of the intermediate checks fails.
    """
    unsupported_message = (
        "Batch Pipeline API is only implemented for `PandasOnRay` execution."
    )

    # Check that pipeline does not allow `Engine` to not be Ray.
    df = pd.DataFrame([[1]])
    with pytest.raises(NotImplementedError, match=unsupported_message):
        PandasQueryPipeline(df)

    eng = Engine.get()
    Engine.put("Ray")
    try:
        # Check that even if Engine is Ray, if the df is not backed by Ray,
        # the Pipeline does not allow initialization.
        with pytest.raises(NotImplementedError, match=unsupported_message):
            PandasQueryPipeline(df, 0)

        df_on_ray_engine = pd.DataFrame([[1]])
        pipeline = PandasQueryPipeline(df_on_ray_engine)

        # Check that even if Engine is Ray, if the new df is not backed by
        # Ray, the Pipeline does not allow an update.
        with pytest.raises(NotImplementedError, match=unsupported_message):
            pipeline.update_df(df)
    finally:
        # Restore the engine so later tests are not affected by this one.
        Engine.put(eng)

    # Check that pipeline does not allow an update when `Engine` is not Ray.
    with pytest.raises(NotImplementedError, match=unsupported_message):
        pipeline.update_df(df)
@pytest.fixture(scope="module", autouse=True)
def ray_fix():
    """Start a single-CPU Ray instance for this module and shut it down afterwards."""
    ray.init(num_cpus=1)
    yield
    ray.shutdown()


def _load_test_dataframe(lib: ModuleType):
    """Read the USA housing CSV through ``lib.read_csv``.

    Parameters
    ----------
    lib : ModuleType
        Module providing ``read_csv``; the tests pass ``pandas`` or ``modin.pandas``.

    Returns
    -------
    DataFrame
        Whatever ``lib.read_csv`` returns for the dataset URL.
    """
    return lib.read_csv(
        "https://raw.githubusercontent.com/ponder-org/ponder-datasets/main/USA_Housing.csv"
    )


def _collect_batches(
    lib: ModuleType, sampler_cls: Type[Sampler], batch_size: int
) -> list:
    """Iterate a ``ModinDataLoader`` over the test dataset and return its batches.

    Every batch is checked to have at most `batch_size` rows and one column
    per requested feature. Both random generators are seeded before the
    loader is created so that repeated calls are comparable.

    Parameters
    ----------
    lib : ModuleType
        ``pandas`` or ``modin.pandas``; used to load the dataset.
    sampler_cls : Type[Sampler]
        Sampler class handed to the loader.
    batch_size : int
        Maximum number of rows per batch.

    Returns
    -------
    list
        The batches in iteration order.
    """
    features = [
        "AVG_AREA_INCOME",
        "AVG_AREA_HOUSE_AGE",
        "AVG_AREA_NUM_ROOMS",
        "AVG_AREA_NUM_BEDROOMS",
        "POPULATION",
        "PRICE",
    ]
    df = _load_test_dataframe(lib)
    np.random.seed(42)
    torch.manual_seed(42)

    loader = ModinDataLoader(
        df,
        batch_size=batch_size,
        features=features,
        sampler=sampler_cls,
    )

    batches = []
    for batch in loader:
        assert batch.shape[0] <= batch_size, batch.shape
        assert batch.shape[1] == len(features), batch.shape
        batches.append(batch)
    return batches


@pytest.mark.parametrize("lib", [pandas, pd])
@pytest.mark.parametrize("sampler_cls", [RandomSampler, SequentialSampler])
@pytest.mark.parametrize("batch_size", [16, 37])
def test_torch_dataloader(lib: ModuleType, sampler_cls: Type[Sampler], batch_size: int):
    """Check that the loader yields correctly shaped batches for pandas and Modin inputs."""
    # The shape checks live in the helper. Nothing is returned here because
    # pytest warns about, and newer releases fail, tests that return a value.
    _collect_batches(lib, sampler_cls, batch_size)


@pytest.mark.parametrize("sampler_cls", [RandomSampler, SequentialSampler])
@pytest.mark.parametrize("batch_size", [16, 37])
def test_compare_dataloaders(sampler_cls: Type[Sampler], batch_size: int):
    """Check that Modin-backed and pandas-backed loaders produce the same batches."""
    by_modin = _collect_batches(pd, sampler_cls, batch_size)
    by_pandas = _collect_batches(pandas, sampler_cls, batch_size)

    assert len(by_modin) == len(by_pandas)
    for tensor_by_modin, tensor_by_pandas in zip(by_modin, by_pandas):
        assert np.allclose(tensor_by_modin, tensor_by_pandas), (
            tensor_by_modin - tensor_by_pandas
        )
@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="This test doesn't make sense on Ray engine.",
)
@pytest.mark.skipif(
    Engine.get() == "Python",
    reason="This test doesn't make sense on non-distributed engine (see issue #2938).",
)
def test_engine():
    """Check that Modin's xgboost `train` rejects engines other than Ray with a ``ValueError``.

    The previous ``try/except ValueError: pass`` form passed whether or not
    the error was raised, so it could never detect a regression.
    """
    # NOTE(review): assumes `train` (or `DMatrix`) raises ValueError on every
    # engine this test is not skipped for - confirm against the implementation.
    with pytest.raises(ValueError):
        xgb.train({}, xgb.DMatrix(pd.DataFrame([0]), pd.DataFrame([0])))
def check_dmatrix(data, label=None, **kwargs):
    """Build a DMatrix with xgboost and with Modin from the same input and compare them.

    If xgboost rejects the input, Modin must reject it as well with a
    compatible exception type. Otherwise row count, column count, feature
    names and feature types of both matrices must match.

    Parameters
    ----------
    data : array-like
        Input passed to ``xgb.DMatrix`` as is and to Modin wrapped in a DataFrame.
    label : array-like, optional
        Labels; wrapped in a Modin Series for the Modin matrix when given.
    **kwargs : dict
        Extra keyword arguments forwarded to both constructors.
    """
    modin_data = pd.DataFrame(data)
    modin_label = None if label is None else pd.Series(label)

    def build_modin_dmatrix():
        return mxgb.DMatrix(modin_data, label=modin_label, **kwargs)

    try:
        expected = xgb.DMatrix(data, label=label, **kwargs)
    except Exception as xgb_exception:
        with pytest.raises(Exception) as modin_exc_info:
            build_modin_dmatrix()
        modin_exc_type = type(modin_exc_info.value)
        # xgboost throws `XGBoostError`, a descendant of `ValueError`, while
        # Modin throws a plain `ValueError`. The xgboost exception therefore
        # goes first in `isinstance` so that the check can succeed.
        assert isinstance(
            xgb_exception, modin_exc_type
        ), "Got Modin Exception type {}, but xgboost Exception type {} was expected".format(
            modin_exc_type, type(xgb_exception)
        )
        return

    actual = build_modin_dmatrix()
    for accessor in ("num_row", "num_col"):
        assert getattr(actual, accessor)() == getattr(expected, accessor)()
    assert actual.feature_names == expected.feature_names
    assert actual.feature_types == expected.feature_types


@pytest.mark.parametrize(
    "data",
    [
        np.random.randn(5, 5),
        np.array([[1, 2], [3, 4]]),
        np.array([["a", "b"], ["c", "d"]]),
        [[1, 2], [3, 4]],
        [["a", "b"], ["c", "d"]],
    ],
)
@pytest.mark.parametrize(
    "feature_names",
    [
        list("abcdef"),
        ["a", "b", "c", "d", "d"],
        ["a", "b", "c", "d", "e<1"],
        list("abcde"),
    ],
)
@pytest.mark.parametrize(
    "feature_types",
    [None, "q", list("qiqiq")],
)
def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_types):
    """Compare xgboost and Modin DMatrix construction across feature name/type combinations."""
    dmatrix_options = {
        "feature_names": feature_names,
        "feature_types": feature_types,
    }
    check_dmatrix(data, **dmatrix_options)


@pytest.mark.skipif(
    Engine.get() != "Ray",
    reason="implemented only for Ray engine.",
)
def test_feature_names():
    """Check that custom feature names are honoured by training and enforced by prediction."""
    breast_cancer = load_breast_cancer()
    X, y = breast_cancer.data, breast_cancer.target
    feature_names = [f"feat{i}" for i in range(X.shape[1])]

    check_dmatrix(
        X,
        y,
        feature_names=feature_names,
    )

    named_dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
    named_modin_dmatrix = mxgb.DMatrix(
        pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names
    )

    params = {
        "objective": "binary:logistic",
        "eval_metric": "mlogloss",
    }
    booster = xgb.train(params, named_dmatrix, num_boost_round=10)
    modin_booster = mxgb.train(params, named_modin_dmatrix, num_boost_round=10)

    # Round the predicted probabilities to class labels before scoring.
    rounded = pandas.DataFrame(booster.predict(named_dmatrix)).apply(
        np.round, axis=0
    )
    modin_rounded = modin_booster.predict(named_modin_dmatrix).apply(
        np.round, axis=0
    )
    accuracy = accuracy_score(y, rounded)
    modin_accuracy = accuracy_score(y, modin_rounded)
    np.testing.assert_allclose(accuracy, modin_accuracy, atol=0.005, rtol=0.002)

    # Different feature_names (default) must raise error in this case
    unnamed_dmatrix = xgb.DMatrix(X)
    unnamed_modin_dmatrix = mxgb.DMatrix(pd.DataFrame(X))
    with pytest.raises(ValueError):
        booster.predict(unnamed_dmatrix)
    with pytest.raises(ValueError):
        # `try_cast_to_pandas` forces materialization of the prediction.
        try_cast_to_pandas(modin_booster.predict(unnamed_modin_dmatrix))


def test_feature_weights():
    """Check that feature weights set on a Modin DMatrix match those of an xgboost DMatrix."""
    n_rows, n_cols = 10, 50
    # Draw the weights before the data, as the original does, so the shared
    # `rng` is consumed in the same order.
    weights = rng.uniform(size=n_cols)
    X = rng.randn(n_rows, n_cols)

    reference = xgb.DMatrix(X)
    candidate = mxgb.DMatrix(pd.DataFrame(X))
    matrices = (reference, candidate)

    for matrix in matrices:
        matrix.set_info(feature_weights=weights)
    np.testing.assert_allclose(
        reference.get_float_info("feature_weights"),
        candidate.get_float_info("feature_weights"),
    )

    # Handle empty
    for matrix in matrices:
        matrix.set_info(feature_weights=np.empty((0,)))
    reference_count = reference.get_float_info("feature_weights").shape[0]
    candidate_count = candidate.get_float_info("feature_weights").shape[0]
    assert reference_count == candidate_count == 0
@pytest.mark.parametrize(
    "modin_type_y",
    [pd.DataFrame, pd.Series],
)
@pytest.mark.parametrize(
    "num_actors",
    [1, num_cpus, None, modin.config.NPartitions.get() + 1],
)
@pytest.mark.parametrize(
    "data",
    [
        (
            load_breast_cancer(),
            {"objective": "binary:logistic", "eval_metric": ["logloss", "error"]},
        ),
    ],
    ids=["load_breast_cancer"],
)
def test_xgb_with_binary_classification_datasets(data, num_actors, modin_type_y):
    """Compare Modin's xgboost training with plain xgboost on a binary classification dataset.

    Parameters
    ----------
    data : tuple
        ``(dataset, params)`` pair: a dataset exposing ``data``/``target``
        and the xgboost training parameters.
    num_actors : int or None
        Value forwarded to the ``num_actors`` argument of Modin's ``train``.
    modin_type_y : type
        Modin container (``DataFrame`` or ``Series``) used to wrap the labels.
    """
    dataset, param = data
    num_round = 10

    X = dataset.data
    y = dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)

    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False
    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    for metric in param["eval_metric"]:
        xgb_history = evals_result_xgb["train"][metric]
        mxgb_history = evals_result_mxgb["train"][metric]
        # Compare against the Modin history; the previous check compared the
        # xgboost history with itself and therefore could never fail.
        assert len(xgb_history) == len(mxgb_history)
        np.testing.assert_allclose(xgb_history, mxgb_history, atol=0.011)

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Round predicted probabilities to class labels before scoring.
    preds = pd.DataFrame(predictions).apply(round)
    modin_preds = modin_predictions.apply(round)

    val = accuracy_score(y, preds)
    modin_val = accuracy_score(modin_y, modin_preds)
    np.testing.assert_allclose(val, modin_val, atol=0.002, rtol=0.002)
@pytest.mark.parametrize(
    "modin_type_y",
    [pd.DataFrame, pd.Series],
)
@pytest.mark.parametrize(
    "num_actors",
    [1, num_cpus, None, modin.config.NPartitions.get() + 1],
)
@pytest.mark.parametrize(
    "data",
    [(load_diabetes(), {"eta": 0.01})],
    ids=["load_diabetes"],
)
def test_xgb_with_regression_datasets(data, num_actors, modin_type_y):
    """Compare Modin's xgboost training with plain xgboost on a regression dataset."""
    dataset, param = data
    num_round = 10

    features = pd.DataFrame(dataset.data)
    targets = modin_type_y(dataset.target)
    X_train, X_test = train_test_split(features)
    y_train, y_test = train_test_split(targets)

    train_xgb_dmatrix = xgboost.DMatrix(X_train, label=y_train)
    test_xgb_dmatrix = xgboost.DMatrix(X_test, label=y_test)
    train_mxgb_dmatrix = xgb.DMatrix(X_train, label=y_train)
    test_mxgb_dmatrix = xgb.DMatrix(X_test, label=y_test)

    xgb_history = {}
    mxgb_history = {}
    reference_booster = xgboost.train(
        param,
        train_xgb_dmatrix,
        num_round,
        evals_result=xgb_history,
        evals=[(train_xgb_dmatrix, "train"), (test_xgb_dmatrix, "test")],
        verbose_eval=False,
    )
    modin_booster = xgb.train(
        param,
        train_mxgb_dmatrix,
        num_round,
        evals_result=mxgb_history,
        evals=[(train_mxgb_dmatrix, "train"), (test_mxgb_dmatrix, "test")],
        num_actors=num_actors,
        verbose_eval=False,
    )

    # The per-round RMSE must agree on both evaluation sets.
    for split_name in ("train", "test"):
        expected_rmse = xgb_history[split_name]["rmse"]
        actual_rmse = mxgb_history[split_name]["rmse"]
        assert len(expected_rmse) == len(actual_rmse)
        for expected_value, actual_value in zip(expected_rmse, actual_rmse):
            np.testing.assert_allclose(expected_value, actual_value, rtol=0.0007)

    reference_error = mean_squared_error(
        y_train, reference_booster.predict(train_xgb_dmatrix)
    )
    modin_error = mean_squared_error(
        y_train, modin_booster.predict(train_mxgb_dmatrix)
    )
    np.testing.assert_allclose(reference_error, modin_error, rtol=1.25e-05)


def test_invalid_input():
    """Check that `DMatrix`, `train` and `predict` assert on inputs of the wrong type."""
    raw_rows = [[1, 2.0, True], [2, 3.0, False]]
    train_params = {}
    rounds = 2

    # Check that DMatrix uses only DataFrame
    with pytest.raises(AssertionError):
        xgb.DMatrix(raw_rows, label=pd.Series([1, 2]))

    # Check that train uses only DMatrix
    with pytest.raises(AssertionError):
        xgb.train(train_params, raw_rows, rounds)

    # Train a valid booster so that `predict` can be exercised.
    frame = pd.DataFrame(
        [[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"]
    )
    booster = xgb.train(
        train_params, xgb.DMatrix(frame, label=pd.Series([1, 2])), rounds
    )

    # Check that predict uses only DMatrix
    with pytest.raises(AssertionError):
        booster.predict([[1, 2.0, 3.3], [2, 3.0, 4.4]])
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/interchange/dataframe_protocol/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/interchange/dataframe_protocol/base/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/interchange/dataframe_protocol/base/test_sanity.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Basic sanity checks for the DataFrame exchange protocol.""" import pytest import modin.pandas as pd from modin.tests.pandas.utils import default_to_pandas_ignore_string def test_sanity(): """Test that the DataFrame protocol module is valid and could be imported correctly.""" from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import ( # noqa ProtocolDataframe, ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_basic_io(get_unique_base_execution): """Test that the protocol IO functions actually reach their implementation with no errors.""" class TestPassed(BaseException): pass def dummy_io_method(*args, **kwargs): """Dummy method emulating that the code path reached the exchange protocol implementation.""" raise TestPassed query_compiler_cls = get_unique_base_execution query_compiler_cls.from_interchange_dataframe = dummy_io_method query_compiler_cls.to_interchange_dataframe = dummy_io_method from modin.pandas.io import from_dataframe with pytest.raises(TestPassed): from_dataframe(None) with pytest.raises(TestPassed): pd.DataFrame([[1]]).__dataframe__() ================================================ FILE: modin/tests/interchange/dataframe_protocol/base/test_utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language # governing permissions and limitations under the License. """Tests for common utility functions of the DataFrame exchange protocol.""" import numpy as np import pandas import pytest from modin.core.dataframe.base.interchange.dataframe_protocol.utils import ( pandas_dtype_to_arrow_c, ) # TODO: use ArrowSchema to get reference C-string. # At the time, there is no way to access ArrowSchema holding a type format string from python. # The only way to 'touch' it is to export the structure to a C-pointer: # https://github.com/apache/arrow/blob/5680d209fd870f99134e2d7299b47acd90fabb8e/python/pyarrow/types.pxi#L230-L239 @pytest.mark.parametrize( "pandas_dtype, c_string", [ (np.dtype("bool"), "b"), (np.dtype("int8"), "c"), (np.dtype("uint8"), "C"), (np.dtype("int16"), "s"), (np.dtype("uint16"), "S"), (np.dtype("int32"), "i"), (np.dtype("uint32"), "I"), (np.dtype("int64"), "l"), (np.dtype("uint64"), "L"), (np.dtype("float16"), "e"), (np.dtype("float32"), "f"), (np.dtype("float64"), "g"), (pandas.Series(["a"]).dtype, "u"), ( pandas.Series([0]).astype("datetime64[ns]").dtype, "tsn:", ), ], ) def test_dtype_to_arrow_c(pandas_dtype, c_string): # noqa PR01 """Test ``pandas_dtype_to_arrow_c`` utility function.""" assert pandas_dtype_to_arrow_c(pandas_dtype) == c_string ================================================ FILE: modin/tests/interchange/dataframe_protocol/pandas/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
"""Dataframe exchange protocol tests that are specific for pandas storage format implementation.""" import pandas import modin.pandas as pd from modin.pandas.io import from_dataframe from modin.tests.pandas.utils import df_equals, test_data from modin.tests.test_utils import ( df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) def eval_df_protocol(modin_df_producer): internal_modin_df_producer = modin_df_producer.__dataframe__() # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, this one raises a warning on `.from_dataframe` with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df_producer) ): modin_df_consumer = from_dataframe(modin_df_producer) internal_modin_df_consumer = from_dataframe(internal_modin_df_producer) # TODO: the following assertions verify that `from_dataframe` doesn't return # the same object untouched due to optimization branching, it actually should # do so but the logic is not implemented yet, so the assertions are passing # for now. It's required to replace the producer's type with a different one # to consumer when we have some other implementation of the protocol as the # assertions may start failing shortly. 
assert modin_df_producer is not modin_df_consumer assert internal_modin_df_producer is not internal_modin_df_consumer assert ( modin_df_producer._query_compiler._modin_frame is not modin_df_consumer._query_compiler._modin_frame ) df_equals(modin_df_producer, modin_df_consumer) df_equals(modin_df_producer, internal_modin_df_consumer) def test_simple_import(): modin_df = pd.DataFrame(test_data["int_data"]) eval_df_protocol(modin_df) def test_categorical_from_dataframe(): modin_df = pd.DataFrame( {"foo": pd.Series(["0", "1", "2", "3", "0", "3", "2", "3"], dtype="category")} ) eval_df_protocol(modin_df) def test_from_dataframe_with_empty_dataframe(): modin_df = pd.DataFrame({"foo_col": pd.Series([], dtype="int64")}) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): eval_df_protocol(modin_df) def test_interchange_with_pandas_string(): modin_df = pd.DataFrame({"fips": ["01001"]}) pandas_df = pandas.api.interchange.from_dataframe(modin_df.__dataframe__()) df_equals(modin_df, pandas_df) def test_interchange_with_datetime(): date_range = pd.date_range( start=pd.Timestamp("2024-01-01", unit="ns"), end=pd.Timestamp("2024-03-01", unit="ns"), freq="D", ) modin_df = pd.DataFrame( { "datetime_s": date_range.astype("datetime64[s]"), "datetime_ns": date_range.astype("datetime64[ns]"), } ) eval_df_protocol(modin_df) ================================================ FILE: modin/tests/interchange/dataframe_protocol/test_general.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """Dataframe exchange protocol tests that are common for every implementation.""" import ctypes import math import pytest import modin.pandas as pd @pytest.fixture def df_from_dict(): def maker(dct, is_categorical=False): df = pd.DataFrame(dct, dtype=("category" if is_categorical else None)) return df return maker @pytest.mark.parametrize( "test_data", [ {"a": ["foo", "bar"], "b": ["baz", "qux"]}, {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, ], ids=["str_data", "float_data", "int_data"], ) def test_only_one_dtype(test_data, df_from_dict): columns = list(test_data.keys()) df = df_from_dict(test_data) dfX = df.__dataframe__() column_size = len(test_data[columns[0]]) for column in columns: assert dfX.get_column_by_name(column).null_count == 0 assert dfX.get_column_by_name(column).size() == column_size assert dfX.get_column_by_name(column).offset == 0 def test_float_int(df_from_dict): df = df_from_dict( { "a": [1, 2, 3], "b": [3, 4, 5], "c": [1.5, 2.5, 3.5], "d": [9, 10, 11], "e": [True, False, True], "f": ["a", "", "c"], } ) dfX = df.__dataframe__() columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} for column, kind in columns.items(): colX = dfX.get_column_by_name(column) assert colX.null_count == 0 assert colX.size() == 3 assert colX.offset == 0 assert colX.dtype[0] == kind assert dfX.get_column_by_name("c").dtype[1] == 64 def test_na_float(df_from_dict): df = df_from_dict({"a": [1.0, math.nan, 2.0]}) dfX = df.__dataframe__() colX = dfX.get_column_by_name("a") assert colX.null_count == 1 def 
test_null_count(df_from_dict): df = df_from_dict({"foo": [42]}) dfX = df.__dataframe__() colX = dfX.get_column_by_name("foo") null_count = colX.null_count assert null_count == 0 and type(null_count) is int def test_noncategorical(df_from_dict): df = df_from_dict({"a": [1, 2, 3]}) dfX = df.__dataframe__() colX = dfX.get_column_by_name("a") with pytest.raises(TypeError): colX.describe_categorical def test_categorical(df_from_dict): df = df_from_dict( {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, is_categorical=True, ) colX = df.__dataframe__().get_column_by_name("weekday") is_ordered, is_dictionary, _ = colX.describe_categorical.values() assert isinstance(is_ordered, bool) assert isinstance(is_dictionary, bool) def test_dataframe(df_from_dict): df = df_from_dict( {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]} ) dfX = df.__dataframe__() assert dfX.num_columns() == 3 assert dfX.num_rows() == 3 assert dfX.num_chunks() == 1 assert list(dfX.column_names()) == ["x", "y", "z"] assert list(dfX.select_columns((0, 2)).column_names()) == list( dfX.select_columns_by_name(("x", "z")).column_names() ) @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) def test_df_get_chunks(size, n_chunks, df_from_dict): df = df_from_dict({"x": list(range(size))}) dfX = df.__dataframe__() chunks = list(dfX.get_chunks(n_chunks)) assert len(chunks) == n_chunks assert sum(chunk.num_rows() for chunk in chunks) == size @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) def test_column_get_chunks(size, n_chunks, df_from_dict): df = df_from_dict({"x": list(range(size))}) dfX = df.__dataframe__() chunks = list(dfX.get_column(0).get_chunks(n_chunks)) assert len(chunks) == n_chunks assert sum(chunk.size() for chunk in chunks) == size def test_get_columns(df_from_dict): df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) dfX = df.__dataframe__() for colX in dfX.get_columns(): assert colX.size() == 2 assert 
colX.num_chunks() == 1 assert dfX.get_column(0).dtype[0] == 0 assert dfX.get_column(1).dtype[0] == 2 def test_buffer(df_from_dict): arr = [0, 1, -1] df = df_from_dict({"a": arr}) dfX = df.__dataframe__() colX = dfX.get_column(0) bufX = colX.get_buffers() dataBuf, dataDtype = bufX["data"] assert dataBuf.bufsize > 0 assert dataBuf.ptr != 0 device, _ = dataBuf.__dlpack_device__() assert dataDtype[0] == 0 if device == 1: # CPU-only as we're going to directly read memory here bitwidth = dataDtype[1] ctype = { 8: ctypes.c_int8, 16: ctypes.c_int16, 32: ctypes.c_int32, 64: ctypes.c_int64, }[bitwidth] for idx, truth in enumerate(arr): val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value assert val == truth, f"Buffer at index {idx} mismatch" ================================================ FILE: modin/tests/numpy/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/numpy/test_array.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import warnings import numpy import pytest import modin.numpy as np from .utils import assert_scalar_or_array_equal @pytest.fixture def change_numpy_print_threshold(): prev_threshold = numpy.get_printoptions()["threshold"] numpy.set_printoptions(threshold=50) yield prev_threshold numpy.set_printoptions(threshold=prev_threshold) @pytest.mark.parametrize( "size", [ 100, (2, 100), (100, 2), (1, 100), (100, 1), (100, 100), (6, 100), (100, 6), (100, 7), (7, 100), ], ) def test_repr(size, change_numpy_print_threshold): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) assert repr(modin_arr) == repr(numpy_arr) @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_shape(size): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) assert modin_arr.shape == numpy_arr.shape def test_dtype(): numpy_arr = numpy.array([[1, "2"], [3, "4"]]) modin_arr = np.array([[1, "2"], [3, "4"]]) assert modin_arr.dtype == numpy_arr.dtype modin_arr = modin_arr == modin_arr.T numpy_arr = numpy_arr == numpy_arr.T assert modin_arr.dtype == numpy_arr.dtype def test_conversion(): import modin.pandas as pd from modin.numpy.utils import try_convert_from_interoperable_type df = pd.DataFrame(numpy.random.randint(0, 100, size=(100, 100))) series = df.iloc[0] df_converted = try_convert_from_interoperable_type(df) assert 
isinstance(df_converted, np.array) series_converted = try_convert_from_interoperable_type(series) assert isinstance(series_converted, np.array) assert_scalar_or_array_equal(df_converted, df) assert_scalar_or_array_equal(series_converted, series) pandas_df = df._to_pandas() pandas_series = series._to_pandas() pandas_converted = try_convert_from_interoperable_type(pandas_df) assert isinstance(pandas_converted, type(pandas_df)) assert pandas_converted.equals(pandas_df) pandas_converted = try_convert_from_interoperable_type(pandas_series) assert isinstance(pandas_converted, type(pandas_series)) assert pandas_converted.equals(pandas_series) def test_to_df(): import pandas import modin.pandas as pd from modin.tests.pandas.utils import df_equals modin_df = pd.DataFrame(np.array([1, 2, 3])) pandas_df = pandas.DataFrame(numpy.array([1, 2, 3])) df_equals(pandas_df, modin_df) modin_df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]])) pandas_df = pandas.DataFrame(numpy.array([[1, 2, 3], [4, 5, 6]])) df_equals(pandas_df, modin_df) for kw in [{}, {"dtype": str}]: modin_df, pandas_df = [ lib[0].DataFrame( lib[1].array([[1, 2, 3], [4, 5, 6]]), columns=["col 0", "col 1", "col 2"], index=pd.Index([4, 6]), **kw ) for lib in ((pd, np), (pandas, numpy)) ] df_equals(pandas_df, modin_df) df_equals(pandas_df, modin_df) def test_to_series(): import pandas import modin.pandas as pd from modin.tests.pandas.utils import df_equals with pytest.raises(ValueError, match="Data must be 1-dimensional"): pd.Series(np.array([[1, 2, 3], [4, 5, 6]])) modin_series = pd.Series(np.array([1, 2, 3]), index=pd.Index([-1, -2, -3])) pandas_series = pandas.Series( numpy.array([1, 2, 3]), index=pandas.Index([-1, -2, -3]) ) df_equals(modin_series, pandas_series) modin_series = pd.Series( np.array([1, 2, 3]), index=pd.Index([-1, -2, -3]), dtype=str ) pandas_series = pandas.Series( numpy.array([1, 2, 3]), index=pandas.Index([-1, -2, -3]), dtype=str ) df_equals(modin_series, pandas_series) def test_update_inplace(): 
out = np.array([1, 2, 3]) arr1 = np.array([1, 2, 3]) arr2 = np.array(out, copy=False) np.add(arr1, arr1, out=out) assert_scalar_or_array_equal(out, arr2) out = np.array([1, 2, 3]) arr2 = np.array(out, copy=False) np.add(arr1, arr1, out=out, where=False) assert_scalar_or_array_equal(out, arr2) @pytest.mark.parametrize( "data_out", [ numpy.zeros((1, 3)), numpy.zeros((2, 3)), ], ) def test_out_broadcast(data_out): if data_out.shape == (2, 3): pytest.xfail("broadcasting would require duplicating row: see GH#5819") data1 = [[1, 2, 3]] data2 = [7, 8, 9] modin_out, numpy_out = np.array(data_out), numpy.array(data_out) numpy.add(numpy.array(data1), numpy.array(data2), out=numpy_out) np.add(np.array(data1), np.array(data2), out=modin_out) assert_scalar_or_array_equal(modin_out, numpy_out) def test_out_broadcast_error(): with pytest.raises(ValueError): # Incompatible dimensions between inputs np.add(np.array([1, 2, 3]), np.array([[1, 2], [3, 4]])) with pytest.raises(ValueError): # Compatible input broadcast dimensions, but output array dimensions are wrong out = np.array([0]) np.add(np.array([[1, 2], [3, 4]]), np.array([1, 2]), out=out) with pytest.raises(ValueError): # Compatible input broadcast dimensions, but output array dimensions are wrong # (cannot broadcast a 2x2 result into a 1x2 array) out = np.array([0, 0]) np.add(np.array([[1, 2], [3, 4]]), np.array([1, 2]), out=out) with pytest.raises(ValueError): # Compatible input broadcast dimensions, but output array dimensions are wrong # (cannot broadcast 1x2 into 1D 2-element array) out = np.array([0, 0]) np.add(np.array([[1, 2]]), np.array([1, 2]), out=out) with pytest.raises(ValueError): # Compatible input broadcast dimensions, but output array dimensions are wrong # (cannot broadcast a 2x2 result into a 3x2 array) # Technically, our error message here does not match numpy's exactly, as the # numpy message will specify both input shapes, whereas we only specify the # shape of the default broadcast between the two inputs 
out = np.array([[0, 0], [0, 0], [0, 0]]) np.add(np.array([[1, 2], [3, 4]]), np.array([1, 2]), out=out) @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_array_ufunc(size): # Test ufunc.__call__ numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) modin_result = numpy.sign(modin_arr) numpy_result = numpy.sign(numpy_arr) assert_scalar_or_array_equal(modin_result, numpy_result) # Test ufunc that we have support for. modin_result = numpy.add(modin_arr, modin_arr) numpy_result = numpy.add(numpy_arr, numpy_arr) assert_scalar_or_array_equal(modin_result, numpy_result) # Test ufunc that we have support for, but method that we do not implement. modin_result = numpy.add.reduce(modin_arr) numpy_result = numpy.add.reduce(numpy_arr) assert numpy_result == modin_result # We do not test ufunc.reduce and ufunc.accumulate, since these require a binary reduce # operation that Modin does not currently support. @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_array_function(size): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) # Test from array shaping modin_result = numpy.ravel(modin_arr) numpy_result = numpy.ravel(numpy_arr) assert_scalar_or_array_equal(modin_result, numpy_result) # Test from array creation modin_result = numpy.zeros_like(modin_arr) numpy_result = numpy.zeros_like(numpy_arr) assert_scalar_or_array_equal(modin_result, numpy_result) # Test from math modin_result = numpy.sum(modin_arr) numpy_result = numpy.sum(numpy_arr) assert numpy_result == modin_result def test_array_where(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) with pytest.warns( UserWarning, match="np.where method with only condition specified" ): warnings.filterwarnings("ignore", message="Distributing") (modin_flat_arr <= 0).where() with pytest.raises(ValueError, match="np.where requires x and y"): 
(modin_flat_arr <= 0).where(x=["Should Fail."]) with pytest.warns(UserWarning, match="np.where not supported when both x and y"): warnings.filterwarnings("ignore", message="Distributing") modin_result = (modin_flat_arr <= 0).where(x=4, y=5) numpy_result = numpy.where(numpy_flat_arr <= 0, 4, 5) assert_scalar_or_array_equal(modin_result, numpy_result) modin_flat_bool_arr = modin_flat_arr <= 0 numpy_flat_bool_arr = numpy_flat_arr <= 0 modin_result = modin_flat_bool_arr.where(x=5, y=modin_flat_arr) numpy_result = numpy.where(numpy_flat_bool_arr, 5, numpy_flat_arr) assert_scalar_or_array_equal(modin_result, numpy_result) modin_result = modin_flat_bool_arr.where(x=modin_flat_arr, y=5) numpy_result = numpy.where(numpy_flat_bool_arr, numpy_flat_arr, 5) assert_scalar_or_array_equal(modin_result, numpy_result) modin_result = modin_flat_bool_arr.where(x=modin_flat_arr, y=(-1 * modin_flat_arr)) numpy_result = numpy.where( numpy_flat_bool_arr, numpy_flat_arr, (-1 * numpy_flat_arr) ) assert_scalar_or_array_equal(modin_result, numpy_result) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) modin_bool_arr = modin_arr > 0 numpy_bool_arr = numpy_arr > 0 modin_result = modin_bool_arr.where(modin_arr, 10 * modin_arr) numpy_result = numpy.where(numpy_bool_arr, numpy_arr, 10 * numpy_arr) assert_scalar_or_array_equal(modin_result, numpy_result) @pytest.mark.parametrize("method", ["argmax", "argmin"]) def test_argmax_argmin(method): numpy_arr = numpy.array([[1, 2, 3], [4, 5, np.nan]]) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal( getattr(np, method)(modin_arr, axis=1), getattr(numpy, method)(numpy_arr, axis=1), ) def test_flatten(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) assert_scalar_or_array_equal(modin_flat_arr.flatten(), numpy_flat_arr.flatten()) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal(modin_arr.flatten(), 
numpy_arr.flatten()) def test_transpose(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) assert_scalar_or_array_equal(modin_flat_arr.transpose(), numpy_flat_arr.transpose()) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal(modin_arr.transpose(), numpy_arr.transpose()) assert_scalar_or_array_equal(modin_arr.T, numpy_arr.T) def test_astype(): numpy_arr = numpy.array([[1, 2], [3, 4]]) modin_arr = np.array([[1, 2], [3, 4]]) modin_result = modin_arr.astype(numpy.float64) numpy_result = numpy_arr.astype(numpy.float64) assert modin_result.dtype == numpy_result.dtype assert_scalar_or_array_equal(modin_result, numpy_result) modin_result = modin_arr.astype(str) numpy_result = numpy_arr.astype(str) assert_scalar_or_array_equal(modin_result, numpy_result) assert_scalar_or_array_equal(modin_arr, numpy_arr) modin_result = modin_arr.astype(str, copy=False) numpy_result = numpy_arr.astype(str, copy=False) assert_scalar_or_array_equal(modin_result, numpy_result) assert_scalar_or_array_equal(modin_arr, numpy_arr) modin_result = modin_arr.astype(numpy.float64, copy=False) numpy_result = numpy_arr.astype(numpy.float64, copy=False) assert_scalar_or_array_equal(modin_result, numpy_result) assert_scalar_or_array_equal(modin_arr, numpy_arr) def test_set_shape(): numpy_arr = numpy.array([[1, 2, 3], [4, 5, 6]]) numpy_arr.shape = (6,) modin_arr = np.array([[1, 2, 3], [4, 5, 6]]) modin_arr.shape = (6,) assert_scalar_or_array_equal(modin_arr, numpy_arr) modin_arr.shape = 6 # Same as using (6,) assert_scalar_or_array_equal(modin_arr, numpy_arr) with pytest.raises(ValueError, match="cannot reshape"): modin_arr.shape = (4,) def test__array__(): numpy_arr = numpy.array([[1, 2, 3], [4, 5, 6]]) modin_arr = np.array(numpy_arr) # this implicitly calls `__array__` converted_array = numpy.array(modin_arr) assert type(converted_array) is type(numpy_arr) 
assert_scalar_or_array_equal(converted_array, numpy_arr) ================================================ FILE: modin/tests/numpy/test_array_arithmetic.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy import pytest import modin.numpy as np from .utils import assert_scalar_or_array_equal @pytest.mark.parametrize( "operand1_shape", [ 100, (1, 100), (3, 100), ], ) @pytest.mark.parametrize( "operand2_shape", [ 100, (1, 100), (3, 100), 1, ], ) @pytest.mark.parametrize( "operator", [ "__add__", "__sub__", "__truediv__", "__mul__", "__rtruediv__", "__rmul__", "__radd__", "__rsub__", "__ge__", "__gt__", "__lt__", "__le__", "__eq__", "__ne__", ], ) def test_basic_arithmetic_with_broadcast(operand1_shape, operand2_shape, operator): """Test of operators that support broadcasting.""" if operand1_shape == (1, 100) or operand2_shape == (1, 100): # For some reason, marking the param with xfail leads to [XPASS(strict)] and a reported failure pytest.xfail(reason="broadcasting is broken: see GH#5894") operand1 = numpy.random.randint(-100, 100, size=operand1_shape) operand2 = numpy.random.randint(-100, 100, size=operand2_shape) numpy_result = getattr(operand1, operator)(operand2) if operand2_shape == 1: # 
Tests binary ops with a scalar modin_result = getattr(np.array(operand1), operator)(operand2[0]) else: modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) if operator not in ["__truediv__", "__rtruediv__"]: assert_scalar_or_array_equal( modin_result, numpy_result, err_msg=f"Binary Op {operator} failed.", ) else: # Truediv can have precision issues, where thanks to floating point error, the numbers # aren't exactly the same across both, but are functionally equivalent, since the difference # is less than 1e-12. numpy.testing.assert_array_almost_equal( modin_result._to_numpy(), numpy_result, decimal=12, err_msg="Binary Op __truediv__ failed.", ) @pytest.mark.parametrize("matched_axis", [0, 1]) @pytest.mark.parametrize( "operator", [ "__add__", "__sub__", "__truediv__", "__mul__", "__rtruediv__", "__rmul__", "__radd__", "__rsub__", "__ge__", "__gt__", "__lt__", "__le__", "__eq__", "__ne__", ], ) def test_binary_bad_broadcast(matched_axis, operator): """Tests broadcasts between 2d arrays that should fail.""" if matched_axis == 0: operand1 = numpy.random.randint(-100, 100, size=(3, 100)) operand2 = numpy.random.randint(-100, 100, size=(3, 200)) else: operand1 = numpy.random.randint(-100, 100, size=(100, 3)) operand2 = numpy.random.randint(-100, 100, size=(200, 3)) with pytest.raises(ValueError): getattr(operand1, operator)(operand2) with pytest.raises(ValueError): getattr(np.array(operand1), operator)(np.array(operand2)) @pytest.mark.parametrize("operator", ["__pow__", "__floordiv__", "__mod__"]) def test_arithmetic(operator): """Test of operators that do not yet support broadcasting.""" for size, textdim in ((100, "1D"), ((10, 10), "2D")): operand1 = numpy.random.randint(-100, 100, size=size) lower_bound = -100 if operator != "__pow__" else 0 operand2 = numpy.random.randint(lower_bound, 100, size=size) modin_result = getattr(np.array(operand1), operator)(np.array(operand2)) numpy_result = getattr(operand1, operator)(operand2) 
numpy.testing.assert_array_almost_equal( modin_result._to_numpy(), numpy_result, decimal=12, err_msg=f"Binary Op {operator} failed on {textdim} arrays.", ) def test_arithmetic_nans_and_zeros(): numpy_arr1 = numpy.array([[1, 0, 3], [numpy.nan, 0, numpy.nan]]) numpy_arr2 = numpy.array([1, 0, 0]) assert_scalar_or_array_equal( (np.array(numpy_arr1) // np.array(numpy_arr2)), numpy_arr1 // numpy_arr2, ) assert_scalar_or_array_equal( (np.array([0]) // 0), numpy.array([0]) // 0, ) assert_scalar_or_array_equal( (np.array([0], dtype=numpy.float64) // 0), numpy.array([0], dtype=numpy.float64) // 0, ) @pytest.mark.parametrize("size", [100, (2, 100), (100, 2), (1, 100), (100, 1)]) def test_scalar_arithmetic(size): numpy_arr = numpy.random.randint(-100, 100, size=size) modin_arr = np.array(numpy_arr) scalar = numpy.random.randint(1, 100) assert_scalar_or_array_equal( (scalar * modin_arr), scalar * numpy_arr, err_msg="__mul__ failed." ) assert_scalar_or_array_equal( (modin_arr * scalar), scalar * numpy_arr, err_msg="__rmul__ failed.", ) assert_scalar_or_array_equal( (scalar / modin_arr), scalar / numpy_arr, err_msg="__rtruediv__ failed.", ) assert_scalar_or_array_equal( (modin_arr / scalar), numpy_arr / scalar, err_msg="__truediv__ failed.", ) assert_scalar_or_array_equal( (scalar + modin_arr), scalar + numpy_arr, err_msg="__radd__ failed.", ) assert_scalar_or_array_equal( (modin_arr + scalar), scalar + numpy_arr, err_msg="__add__ failed." ) assert_scalar_or_array_equal( (scalar - modin_arr), scalar - numpy_arr, err_msg="__rsub__ failed.", ) assert_scalar_or_array_equal( (modin_arr - scalar), numpy_arr - scalar, err_msg="__sub__ failed." 
) @pytest.mark.parametrize("op_name", ["abs", "exp", "sqrt", "tanh"]) def test_unary_arithmetic(op_name): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) assert_scalar_or_array_equal( getattr(np, op_name)(modin_flat_arr), getattr(numpy, op_name)(numpy_flat_arr), ) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal( getattr(np, op_name)(modin_arr), getattr(numpy, op_name)(numpy_arr) ) def test_invert(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) assert_scalar_or_array_equal(~modin_flat_arr, ~numpy_flat_arr) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal(~modin_arr, ~numpy_arr) numpy_flat_arr = numpy.random.randint(-100, 100, size=100) < 0 modin_flat_arr = np.array(numpy_flat_arr) assert_scalar_or_array_equal(~modin_flat_arr, ~numpy_flat_arr) numpy_arr = numpy_flat_arr.reshape((10, 10)) modin_arr = np.array(numpy_arr) assert_scalar_or_array_equal(~modin_arr, ~numpy_arr) ================================================ FILE: modin/tests/numpy/test_array_axis_functions.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
# See the License for the specific language
# governing permissions and limitations under the License.

import numpy
import pytest

import modin.numpy as np

from .utils import assert_scalar_or_array_equal


def test_max():
    """Compare ``max`` on modin arrays with numpy for 1D and 2D inputs.

    Exercises the ``axis``, ``keepdims``, ``initial``, ``where`` and ``out``
    arguments, and checks that an ``out`` array of the wrong shape raises
    ``ValueError``.
    """
    # Test 1D
    numpy_arr = numpy.random.randint(-100, 100, size=100)
    modin_arr = np.array(numpy_arr)
    assert modin_arr.max() == numpy_arr.max()
    modin_result = modin_arr.max(axis=0)
    # The reference value must come from the numpy array; computing it from
    # ``modin_arr`` would compare the modin result with itself.
    numpy_result = numpy_arr.max(axis=0)
    assert modin_result == numpy_result
    modin_result = modin_arr.max(initial=200)
    numpy_result = numpy_arr.max(initial=200)
    assert modin_result == numpy_result
    modin_result = modin_arr.max(initial=0, where=False)
    numpy_result = numpy_arr.max(initial=0, where=False)
    assert modin_result == numpy_result
    modin_result = modin_arr.max(keepdims=True)
    numpy_result = numpy_arr.max(keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_arr = numpy.array([1, 10000, 2, 3, 4, 5])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([True, False, True, True, True, True])
    modin_mask = np.array(numpy_mask)
    assert numpy_arr.max(where=numpy_mask, initial=5) == modin_arr.max(
        where=modin_mask, initial=5
    )
    # Test 2D
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    assert modin_arr.max() == numpy_arr.max()
    modin_result = modin_arr.max(axis=0)
    numpy_result = numpy_arr.max(axis=0)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.max(axis=0, keepdims=True)
    numpy_result = numpy_arr.max(axis=0, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.max(axis=1)
    numpy_result = numpy_arr.max(axis=1)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.max(axis=1, keepdims=True)
    numpy_result = numpy_arr.max(axis=1, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.max(initial=200)
    numpy_result = numpy_arr.max(initial=200)
    assert modin_result == numpy_result
    modin_result = modin_arr.max(initial=0, where=False)
    numpy_result = numpy_arr.max(initial=0, where=False)
    assert modin_result == numpy_result
    # A (20, 20) ``out`` cannot hold the (1, 1) keepdims result.
    with pytest.raises(ValueError):
        modin_arr.max(out=modin_arr, keepdims=True)
    modin_out = np.array([[1]])
    numpy_out = modin_out._to_numpy()
    modin_result = modin_arr.max(out=modin_out, keepdims=True)
    numpy_result = numpy_arr.max(out=numpy_out, keepdims=True)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    modin_result = modin_arr.max(axis=0, where=False, initial=4)
    numpy_result = numpy_arr.max(axis=0, where=False, initial=4)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.max(axis=0, where=False, initial=4, out=modin_out)
    numpy_result = numpy_arr.max(axis=0, where=False, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.max(axis=0, initial=4, out=modin_out)
    numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.max(axis=1, initial=4, out=modin_out)
    numpy_result = numpy_arr.max(axis=1, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    # Random 1D mask with exactly half of the entries set.
    numpy_where = numpy.full(20, False)
    numpy_where[:10] = True
    numpy.random.shuffle(numpy_where)
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.max(axis=0, initial=4, out=modin_out, where=modin_where)
    numpy_result = numpy_arr.max(axis=0, initial=4, out=numpy_out, where=numpy_where)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.array([[1, 10000, 2], [3, 4, 5]])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([[True, False, True], [True, True, True]])
    modin_mask = np.array(numpy_mask)
    assert_scalar_or_array_equal(
        modin_arr.max(where=modin_mask, initial=5),
        numpy_arr.max(where=numpy_mask, initial=5),
    )


def test_min():
    """Compare ``min`` on modin arrays with numpy for 1D and 2D inputs.

    Exercises the ``axis``, ``keepdims``, ``initial``, ``where`` and ``out``
    arguments, and checks that an ``out`` array of the wrong shape raises
    ``ValueError``.
    """
    # Test 1D
    numpy_arr = numpy.random.randint(-100, 100, size=100)
    modin_arr = np.array(numpy_arr)
    assert modin_arr.min() == numpy_arr.min()
    modin_result = modin_arr.min(axis=0)
    # The reference value must come from the numpy array; computing it from
    # ``modin_arr`` would compare the modin result with itself.
    numpy_result = numpy_arr.min(axis=0)
    assert modin_result == numpy_result
    modin_result = modin_arr.min(initial=-200)
    numpy_result = numpy_arr.min(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.min(initial=0, where=False)
    numpy_result = numpy_arr.min(initial=0, where=False)
    assert modin_result == numpy_result
    modin_result = modin_arr.min(keepdims=True)
    numpy_result = numpy_arr.min(keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_arr = numpy.array([1, -10000, 2, 3, 4, 5])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([True, False, True, True, True, True])
    modin_mask = np.array(numpy_mask)
    assert numpy_arr.min(where=numpy_mask, initial=5) == modin_arr.min(
        where=modin_mask, initial=5
    )
    # Test 2D
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    assert modin_arr.min() == numpy_arr.min()
    modin_result = modin_arr.min(axis=0)
    numpy_result = numpy_arr.min(axis=0)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.min(axis=0, keepdims=True)
    numpy_result = numpy_arr.min(axis=0, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.min(axis=1)
    numpy_result = numpy_arr.min(axis=1)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.min(axis=1, keepdims=True)
    numpy_result = numpy_arr.min(axis=1, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.min(initial=-200)
    numpy_result = numpy_arr.min(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.min(initial=0, where=False)
    numpy_result = numpy_arr.min(initial=0, where=False)
    assert modin_result == numpy_result
    # A (20, 20) ``out`` cannot hold the (1, 1) keepdims result.
    with pytest.raises(ValueError):
        modin_arr.min(out=modin_arr, keepdims=True)
    modin_out = np.array([[1]])
    numpy_out = modin_out._to_numpy()
    modin_result = modin_arr.min(out=modin_out, keepdims=True)
    numpy_result = numpy_arr.min(out=numpy_out, keepdims=True)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    modin_result = modin_arr.min(axis=0, where=False, initial=4)
    numpy_result = numpy_arr.min(axis=0, where=False, initial=4)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.min(axis=0, where=False, initial=4, out=modin_out)
    numpy_result = numpy_arr.min(axis=0, where=False, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.min(axis=0, initial=4, out=modin_out)
    numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.min(axis=1, initial=4, out=modin_out)
    numpy_result = numpy_arr.min(axis=1, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    # Random 1D mask with exactly half of the entries set.
    numpy_where = numpy.full(20, False)
    numpy_where[:10] = True
    numpy.random.shuffle(numpy_where)
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.min(axis=0, initial=4, out=modin_out, where=modin_where)
    numpy_result = numpy_arr.min(axis=0, initial=4, out=numpy_out, where=numpy_where)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.array([[1, -10000, 2], [3, 4, 5]])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([[True, False, True], [True, True, True]])
    modin_mask = np.array(numpy_mask)
    assert_scalar_or_array_equal(
        modin_arr.min(where=modin_mask, initial=5),
        numpy_arr.min(where=numpy_mask, initial=5),
    )


def test_sum():
    """Compare ``sum`` on modin arrays with numpy for 1D and 2D inputs.

    Exercises the ``axis``, ``keepdims``, ``initial``, ``where`` and ``out``
    arguments, checks that an ``out`` array of the wrong shape raises
    ``ValueError``, and checks the results when the input contains NaN.
    """
    # Test 1D
    numpy_arr = numpy.random.randint(-100, 100, size=100)
    modin_arr = np.array(numpy_arr)
    assert modin_arr.sum() == numpy_arr.sum()
    modin_result = modin_arr.sum(axis=0)
    # The reference value must come from the numpy array; computing it from
    # ``modin_arr`` would compare the modin result with itself.
    numpy_result = numpy_arr.sum(axis=0)
    assert modin_result == numpy_result
    modin_result = modin_arr.sum(initial=-200)
    numpy_result = numpy_arr.sum(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.sum(initial=0, where=False)
    numpy_result = numpy_arr.sum(initial=0, where=False)
    assert modin_result == numpy_result
    modin_result = modin_arr.sum(keepdims=True)
    numpy_result = numpy_arr.sum(keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_arr = numpy.array([1, 10000, 2, 3, 4, 5])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([True, False, True, True, True, True])
    modin_mask = np.array(numpy_mask)
    assert numpy_arr.sum(where=numpy_mask) == modin_arr.sum(where=modin_mask)
    # Test 2D
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    assert modin_arr.sum() == numpy_arr.sum()
    modin_result = modin_arr.sum(axis=0)
    numpy_result = numpy_arr.sum(axis=0)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.sum(axis=0, keepdims=True)
    numpy_result = numpy_arr.sum(axis=0, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.sum(axis=1)
    numpy_result = numpy_arr.sum(axis=1)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.sum(axis=1, keepdims=True)
    numpy_result = numpy_arr.sum(axis=1, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.sum(initial=-200)
    numpy_result = numpy_arr.sum(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.sum(initial=0, where=False)
    numpy_result = numpy_arr.sum(initial=0, where=False)
    assert modin_result == numpy_result
    # A (20, 20) ``out`` cannot hold the (1, 1) keepdims result.
    with pytest.raises(ValueError):
        modin_arr.sum(out=modin_arr, keepdims=True)
    modin_out = np.array([[1]])
    numpy_out = modin_out._to_numpy()
    modin_result = modin_arr.sum(out=modin_out, keepdims=True)
    numpy_result = numpy_arr.sum(out=numpy_out, keepdims=True)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    modin_result = modin_arr.sum(axis=0, where=False, initial=4)
    numpy_result = numpy_arr.sum(axis=0, where=False, initial=4)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.sum(axis=0, where=False, initial=4, out=modin_out)
    numpy_result = numpy_arr.sum(axis=0, where=False, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out)
    numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.sum(axis=1, initial=4, out=modin_out)
    numpy_result = numpy_arr.sum(axis=1, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    # Random 1D mask with exactly half of the entries set.
    numpy_where = numpy.full(20, False)
    numpy_where[:10] = True
    numpy.random.shuffle(numpy_where)
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.sum(axis=0, initial=4, out=modin_out, where=modin_where)
    numpy_result = numpy_arr.sum(axis=0, initial=4, out=numpy_out, where=numpy_where)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    # Random 2D mask with exactly half of the entries set.
    numpy_where = numpy.full(400, False)
    numpy_where[:200] = True
    numpy.random.shuffle(numpy_where)
    numpy_where = numpy_where.reshape((20, 20))
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.sum(where=modin_where)
    numpy_result = numpy_arr.sum(where=numpy_where)
    assert modin_result == numpy_result
    # Test NA propagation
    numpy_arr = numpy.array([[1, 2], [3, 4], [5, numpy.nan]])
    modin_arr = np.array([[1, 2], [3, 4], [5, np.nan]])
    assert numpy.isnan(modin_arr.sum())
    assert_scalar_or_array_equal(
        modin_arr.sum(axis=1),
        numpy_arr.sum(axis=1),
    )
    assert_scalar_or_array_equal(
        modin_arr.sum(axis=0),
        numpy_arr.sum(axis=0),
    )


def test_mean():
    """Compare ``mean`` on modin arrays with numpy for 1D and 2D inputs.

    Exercises the ``axis``, ``keepdims``, ``where`` and ``out`` arguments,
    checks that an ``out`` array of the wrong shape raises ``ValueError``,
    and checks the results when the input contains NaN.
    """
    # Test 1D
    numpy_arr = numpy.random.randint(-100, 100, size=100)
    modin_arr = np.array(numpy_arr)
    assert modin_arr.mean() == numpy_arr.mean()
    modin_result = modin_arr.mean(axis=0)
    # The reference value must come from the numpy array; computing it from
    # ``modin_arr`` would compare the modin result with itself.
    numpy_result = numpy_arr.mean(axis=0)
    assert modin_result == numpy_result
    modin_result = modin_arr.mean()
    numpy_result = numpy_arr.mean()
    assert modin_result == numpy_result
    modin_result = modin_arr.mean(keepdims=True)
    numpy_result = numpy_arr.mean(keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_arr = numpy.array([1, 10000, 2, 3, 4, 5])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([True, False, True, True, True, True])
    modin_mask = np.array(numpy_mask)
    assert numpy_arr.mean(where=numpy_mask) == modin_arr.mean(where=modin_mask)
    # Test 2D
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    assert modin_arr.mean() == numpy_arr.mean()
    modin_result = modin_arr.mean(axis=0)
    numpy_result = numpy_arr.mean(axis=0)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.mean(axis=0, keepdims=True)
    numpy_result = numpy_arr.mean(axis=0, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.mean(axis=1)
    numpy_result = numpy_arr.mean(axis=1)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.mean(axis=1, keepdims=True)
    numpy_result = numpy_arr.mean(axis=1, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.mean()
    numpy_result = numpy_arr.mean()
    assert modin_result == numpy_result
    # A (20, 20) ``out`` cannot hold the (1, 1) keepdims result.
    with pytest.raises(ValueError):
        modin_arr.mean(out=modin_arr, keepdims=True)
    modin_out = np.array([[1]])
    numpy_out = modin_out._to_numpy()
    modin_result = modin_arr.mean(out=modin_out, keepdims=True)
    numpy_result = numpy_arr.mean(out=numpy_out, keepdims=True)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.mean(axis=0, where=False, out=modin_out)
    numpy_result = numpy_arr.mean(axis=0, where=False, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.mean(axis=0, out=modin_out)
    numpy_result = numpy_arr.mean(axis=0, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.mean(axis=1, out=modin_out)
    numpy_result = numpy_arr.mean(axis=1, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    # Random 1D mask with exactly half of the entries set.
    numpy_where = numpy.full(20, False)
    numpy_where[:10] = True
    numpy.random.shuffle(numpy_where)
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.mean(axis=0, out=modin_out, where=modin_where)
    numpy_result = numpy_arr.mean(axis=0, out=numpy_out, where=numpy_where)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    # Random 2D mask with exactly half of the entries set.
    numpy_where = numpy.full(400, False)
    numpy_where[:200] = True
    numpy.random.shuffle(numpy_where)
    numpy_where = numpy_where.reshape((20, 20))
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.mean(where=modin_where)
    numpy_result = numpy_arr.mean(where=numpy_where)
    assert modin_result == numpy_result
    # Test NA propagation
    numpy_arr = numpy.array([[1, 2], [3, 4], [5, numpy.nan]])
    modin_arr = np.array([[1, 2], [3, 4], [5, np.nan]])
    assert numpy.isnan(modin_arr.mean())
    assert_scalar_or_array_equal(
        modin_arr.mean(axis=1),
        numpy_arr.mean(axis=1),
    )
    assert_scalar_or_array_equal(
        modin_arr.mean(axis=0),
        numpy_arr.mean(axis=0),
    )
    # Mask out the single NaN entry so both means are over finite values.
    numpy_where = numpy.array([[True, True], [True, True], [True, False]])
    modin_where = np.array(numpy_where)
    assert modin_arr.mean(where=modin_where) == numpy_arr.mean(where=numpy_where)


def test_prod():
    """Compare ``prod`` on modin arrays with numpy for 1D and 2D inputs.

    Exercises the ``axis``, ``keepdims``, ``initial``, ``where`` and ``out``
    arguments, checks that an ``out`` array of the wrong shape raises
    ``ValueError``, and checks the results when the input contains NaN.
    """
    # Test 1D
    numpy_arr = numpy.random.randint(-100, 100, size=100)
    modin_arr = np.array(numpy_arr)
    assert modin_arr.prod() == numpy_arr.prod()
    modin_result = modin_arr.prod(axis=0)
    # The reference value must come from the numpy array; computing it from
    # ``modin_arr`` would compare the modin result with itself.
    numpy_result = numpy_arr.prod(axis=0)
    assert modin_result == numpy_result
    modin_result = modin_arr.prod(initial=-200)
    numpy_result = numpy_arr.prod(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.prod(initial=0, where=False)
    numpy_result = numpy_arr.prod(initial=0, where=False)
    assert modin_result == numpy_result
    modin_result = modin_arr.prod(keepdims=True)
    numpy_result = numpy_arr.prod(keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_arr = numpy.array([1, 10000, 2, 3, 4, 5])
    modin_arr = np.array(numpy_arr)
    numpy_mask = numpy.array([True, False, True, True, True, True])
    modin_mask = np.array(numpy_mask)
    assert numpy_arr.prod(where=numpy_mask) == modin_arr.prod(where=modin_mask)
    # Test 2D
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    assert modin_arr.prod() == numpy_arr.prod()
    modin_result = modin_arr.prod(axis=0)
    numpy_result = numpy_arr.prod(axis=0)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.prod(axis=0, keepdims=True)
    numpy_result = numpy_arr.prod(axis=0, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.prod(axis=1)
    numpy_result = numpy_arr.prod(axis=1)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.prod(axis=1, keepdims=True)
    numpy_result = numpy_arr.prod(axis=1, keepdims=True)
    assert modin_result.shape == numpy_result.shape
    assert_scalar_or_array_equal(modin_result, numpy_result)
    modin_result = modin_arr.prod(initial=-200)
    numpy_result = numpy_arr.prod(initial=-200)
    assert modin_result == numpy_result
    modin_result = modin_arr.prod(initial=0, where=False)
    numpy_result = numpy_arr.prod(initial=0, where=False)
    assert modin_result == numpy_result
    # A (20, 20) ``out`` cannot hold the (1, 1) keepdims result.
    with pytest.raises(ValueError):
        modin_arr.prod(out=modin_arr, keepdims=True)
    modin_out = np.array([[1]])
    numpy_out = modin_out._to_numpy()
    modin_result = modin_arr.prod(out=modin_out, keepdims=True)
    numpy_result = numpy_arr.prod(out=numpy_out, keepdims=True)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    modin_result = modin_arr.prod(axis=0, where=False, initial=4)
    numpy_result = numpy_arr.prod(axis=0, where=False, initial=4)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.prod(axis=0, where=False, initial=4, out=modin_out)
    numpy_result = numpy_arr.prod(axis=0, where=False, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_arr = numpy.random.randint(-100, 100, size=(20, 20))
    modin_arr = np.array(numpy_arr)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out)
    numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    modin_result = modin_arr.prod(axis=1, initial=4, out=modin_out)
    numpy_result = numpy_arr.prod(axis=1, initial=4, out=numpy_out)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    numpy_out = numpy.ones(20)
    modin_out = np.array(numpy_out)
    # Random 1D mask with exactly half of the entries set.
    numpy_where = numpy.full(20, False)
    numpy_where[:10] = True
    numpy.random.shuffle(numpy_where)
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.prod(axis=0, initial=4, out=modin_out, where=modin_where)
    numpy_result = numpy_arr.prod(axis=0, initial=4, out=numpy_out, where=numpy_where)
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert_scalar_or_array_equal(modin_out, numpy_out)
    # Random 2D mask with exactly half of the entries set.
    numpy_where = numpy.full(400, False)
    numpy_where[:200] = True
    numpy.random.shuffle(numpy_where)
    numpy_where = numpy_where.reshape((20, 20))
    modin_where = np.array(numpy_where)
    modin_result = modin_arr.prod(where=modin_where)
    numpy_result = numpy_arr.prod(where=numpy_where)
    assert modin_result == numpy_result
    # Test NA propagation
    numpy_arr = numpy.array([[1, 2], [3, 4], [5, numpy.nan]])
    modin_arr = np.array([[1, 2], [3, 4], [5, np.nan]])
    assert numpy.isnan(modin_arr.prod())
    assert_scalar_or_array_equal(
        modin_arr.prod(axis=1),
        numpy_arr.prod(axis=1),
    )
    assert_scalar_or_array_equal(
        modin_arr.prod(axis=0),
        numpy_arr.prod(axis=0),
    )


================================================
FILE: modin/tests/numpy/test_array_creation.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy

import modin.numpy as np

from .utils import assert_scalar_or_array_equal


def test_zeros_like():
    """Check ``zeros_like`` against numpy for float and integer templates."""
    float_template = np.array([[1.0, 2.0], [3.0, 4.0]])
    float_reference = float_template._to_numpy()
    # Default call first, then the dtype and shape overrides.
    for overrides in ({}, {"dtype": numpy.int8}, {"shape": (10, 10)}):
        assert_scalar_or_array_equal(
            np.zeros_like(float_template, **overrides),
            numpy.zeros_like(float_reference, **overrides),
        )

    int_template = np.array([[1, 2], [3, 4]])
    int_reference = int_template._to_numpy()
    expected = numpy.zeros_like(int_reference)
    assert_scalar_or_array_equal(np.zeros_like(int_template), expected)


def test_ones_like():
    """Check ``ones_like`` against numpy for float and integer templates."""
    float_template = np.array([[1.0, 2.0], [3.0, 4.0]])
    float_reference = float_template._to_numpy()
    # Default call first, then the dtype and shape overrides.
    for overrides in ({}, {"dtype": numpy.int8}, {"shape": (10, 10)}):
        assert_scalar_or_array_equal(
            np.ones_like(float_template, **overrides),
            numpy.ones_like(float_reference, **overrides),
        )

    int_template = np.array([[1, 2], [3, 4]])
    int_reference = int_template._to_numpy()
    expected = numpy.ones_like(int_reference)
    assert_scalar_or_array_equal(np.ones_like(int_template), expected)


================================================
FILE: modin/tests/numpy/test_array_indexing.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy
import pytest
from pandas.core.dtypes.common import is_list_like

import modin.numpy as np

from .utils import assert_scalar_or_array_equal


def _check_getitem_result(modin_result, numpy_result):
    """Assert an indexing result matches numpy, including shape for arrays."""
    if not is_list_like(numpy_result):
        assert modin_result == numpy_result
        return
    assert_scalar_or_array_equal(modin_result, numpy_result)
    assert modin_result.shape == numpy_result.shape


@pytest.mark.parametrize(
    "index",
    (
        # Scalar indices
        0,
        1,
        -1,
        # Slices
        slice(0, 1, 1),
        slice(1, -1, 1),
        # Lists
        [0, 2],
        [1, -1],
    ),
    ids=lambda i: f"index={i}",
)
def test_getitem_1d(index):
    """Check ``__getitem__`` on a 1D array against numpy."""
    values = [1, 2, 3, 4, 5]
    expected = numpy.array(values)[index]
    actual = np.array(values)[index]
    _check_getitem_result(actual, expected)


@pytest.mark.parametrize(
    "index",
    (
        # Scalar indices
        0,
        1,
        -1,
        # Slices
        slice(0, 1, 1),
        slice(1, -1, 1),
        slice(None, None, None),
        slice(None, 1, None),
        slice(0, 1, None),
        slice(0, None, None),
        # Lists
        [0, 2],
        [2, 0],
        [1, -1],
    ),
    ids=lambda i: f"index={i}",
)
def test_getitem_2d(index):
    """Check ``__getitem__`` on a 2D array against numpy."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
    expected = numpy.array(rows)[index]
    actual = np.array(rows)[index]
    _check_getitem_result(actual, expected)


def test_getitem_nested():
    """Check chained indexing (indexing the result of a previous index)."""
    # Index into the result of slicing a 1D array
    values = [1, 2, 3, 4, 5]
    expected = numpy.array(values)[1:3][1]
    actual = np.array(values)[1:3][1]
    _check_getitem_result(actual, expected)

    # Index into the result of indexing a 2D array
    rows = [[1, 2, 3], [4, 5, 6]]
    expected = numpy.array(rows)[1][1]
    actual = np.array(rows)[1][1]
    _check_getitem_result(actual, expected)


@pytest.mark.parametrize(
    ("index", "value"),
    [
        # Scalar indices
        (0, 1),
        (1, 1),
        (-1, 1),
        # Slices
        (slice(0, 1, 1), [7]),
        (slice(1, -1, 1), [7, 8, 9]),
        # Slice with broadcast
        (slice(0, 4, 1), 7),
        # Lists
        ([0, 2], [7, 8]),
        ([1, -1], [7, 8]),
    ],
    ids=lambda i: f"{i}",
)
def test_setitem_1d(index, value):
    """Check ``__setitem__`` on a 1D array against numpy."""
    values = [1, 2, 3, 4, 5]
    modin_target = np.array(values)
    numpy_target = numpy.array(values)
    numpy_target[index] = value
    modin_target[index] = value
    assert_scalar_or_array_equal(modin_target, numpy_target)


def test_setitem_1d_error():
    """Assigning a value that cannot be broadcast must raise ``ValueError``."""
    target = np.array([1, 2, 3, 4, 5])
    with pytest.raises(ValueError, match="could not broadcast"):
        target[0:5] = [1, 2]


@pytest.mark.parametrize(
    ("index", "value"),
    [
        # Scalar indices
        (0, 1),
        (1, 1),
        (-1, 1),
        # Slices
        (slice(0, 1, 1), [13]),  # arr[0:1:1] = [13]
        (slice(1, -1, 1), [13]),  # arr[1:-1:1] = [13]
        (slice(None, None, None), [7]),  # arr[:] = [7]
        (slice(None, 1, None), [7]),  # arr[:1] = [7]
        (slice(0, 1, None), [7]),  # arr[0:1] = [7]
        (slice(0, None, None), [7]),  # arr[0:] = [7]
        # Lists
        ([0, 2], [[13, 14, 15], [16, 17, 18]]),
        ([2, 0], [[13, 14, 15], [16, 17, 18]]),
        ([1, -1], [[13, 14, 15], [16, 17, 18]]),
    ],
    ids=lambda i: f"{i}",
)
def test_setitem_2d(index, value):
    """Check ``__setitem__`` on a 2D array against numpy."""
    unsorted_list_index = index == [2, 0]
    if unsorted_list_index:
        pytest.xfail("indexing with unsorted list would fail: see GH#5886")
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
    modin_target = np.array(rows)
    numpy_target = numpy.array(rows)
    numpy_target[index] = value
    modin_target[index] = value
    assert_scalar_or_array_equal(modin_target, numpy_target)


================================================
FILE: modin/tests/numpy/test_array_linalg.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy
import numpy.linalg as NLA
import pytest

import modin.numpy as np
import modin.numpy.linalg as LA
import modin.pandas as pd

from .utils import assert_scalar_or_array_equal


def test_dot_from_pandas_reindex():
    """Check that reindexing the series operand leaves ``dot`` unchanged."""
    # Reindexing the dataframe does not change the output of dot
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dot.html
    frame = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
    series = pd.Series([1, 1, 2, 1])
    baseline = np.dot(frame, series)
    reindexed = series.reindex([1, 0, 2, 3])
    assert_scalar_or_array_equal(baseline, np.dot(frame, reindexed))


def test_dot_1d():
    """Check ``dot`` of two 1D arrays against numpy."""
    numpy_lhs = numpy.random.randint(-100, 100, size=100)
    numpy_rhs = numpy.random.randint(-100, 100, size=100)
    expected = numpy.dot(numpy_lhs, numpy_rhs)
    actual = np.dot(np.array(numpy_lhs), np.array(numpy_rhs))
    assert_scalar_or_array_equal(actual, expected)


def test_dot_2d():
    """Check ``dot`` of a (100, 3) and a (3, 50) array against numpy."""
    numpy_lhs = numpy.random.randint(-100, 100, size=(100, 3))
    numpy_rhs = numpy.random.randint(-100, 100, size=(3, 50))
    expected = numpy.dot(numpy_lhs, numpy_rhs)
    actual = np.dot(np.array(numpy_lhs), np.array(numpy_rhs))
    assert_scalar_or_array_equal(actual, expected)


def test_dot_scalar():
    """Check ``dot`` of a 2D array with a scalar against numpy."""
    numpy_matrix = numpy.random.randint(-100, 100, size=(100, 3))
    scalar = numpy.random.randint(-100, 100)
    expected = numpy.dot(numpy_matrix, scalar)
    actual = np.dot(np.array(numpy_matrix), scalar)
    assert_scalar_or_array_equal(actual, expected)


def test_matmul_scalar():
    """``@`` between an array and a scalar must raise ``ValueError``."""
    numpy_matrix = numpy.random.randint(-100, 100, size=(100, 3))
    scalar = numpy.random.randint(-100, 100)
    modin_matrix = np.array(numpy_matrix)
    # Modin error message differs from numpy for readability; the original numpy error is:
    # ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc
    # core with signature (n?,k),(k,m?)->(n?,m?) requires 1)
    with pytest.raises(ValueError):
        modin_matrix @ scalar


def test_dot_broadcast():
    """Check ``dot`` between operands of different dimensionality."""
    # 2D @ 1D
    numpy_lhs = numpy.random.randint(-100, 100, size=(100, 3))
    numpy_rhs = numpy.random.randint(-100, 100, size=(3,))
    expected = numpy.dot(numpy_lhs, numpy_rhs)
    actual = np.dot(np.array(numpy_lhs), np.array(numpy_rhs))
    assert_scalar_or_array_equal(actual, expected)

    # 1D @ 2D
    numpy_lhs = numpy.random.randint(-100, 100, size=(100,))
    numpy_rhs = numpy.random.randint(-100, 100, size=(100, 3))
    expected = numpy.dot(numpy_lhs, numpy_rhs)
    actual = np.dot(np.array(numpy_lhs), np.array(numpy_rhs))
    assert_scalar_or_array_equal(actual, expected)


@pytest.mark.parametrize("axis", [None, 0, 1], ids=["axis=None", "axis=0", "axis=1"])
def test_norm_fro_2d(axis):
    """Check the default ``norm`` of a 2D array along ``axis`` against numpy."""
    numpy_matrix = numpy.random.randint(-10, 10, size=(100, 3))
    expected = NLA.norm(numpy_matrix, axis=axis)
    actual = LA.norm(np.array(numpy_matrix), axis=axis)
    # Result may be a scalar
    if isinstance(actual, np.array):
        actual = actual._to_numpy()
    numpy.testing.assert_allclose(actual, expected, rtol=1e-12)


def test_norm_fro_1d():
    """Check the default ``norm`` of a 1D array against numpy."""
    numpy_vector = numpy.random.randint(-10, 10, size=100)
    expected = NLA.norm(numpy_vector)
    actual = LA.norm(np.array(numpy_vector))
    numpy.testing.assert_allclose(actual, expected, rtol=1e-12)

================================================
FILE: modin/tests/numpy/test_array_logic.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy
import pytest

import modin.numpy as np

from .utils import assert_scalar_or_array_equal

# Shared inputs mixing zeros, finite values, +/-inf and NaN. The ``c`` arrays also
# contain complex entries; the ``r`` arrays hold real values only.
small_arr_c_2d = numpy.array(
    [
        [1j, 1, 0, -numpy.inf, numpy.inf, 0.5],
        [1 + 1.1j, numpy.nan, 0, numpy.nan, 2, 0.3],
    ]
)
small_arr_c_1d = numpy.array([numpy.nan, 0, -numpy.inf, numpy.inf, 5, -0.1, 1 + 1.1j])
small_arr_r_2d = numpy.array(
    [[1, 0, -numpy.inf, numpy.inf, 0.5], [numpy.nan, 0, numpy.nan, 2, 0.3]]
)
small_arr_r_1d = numpy.array([numpy.nan, 0, -numpy.inf, numpy.inf, 5, -0.1])


@pytest.mark.parametrize("operand_shape", [100, (3, 100)])
@pytest.mark.parametrize("operator", ["any", "all"])
@pytest.mark.parametrize("axis", [None, 0, 1], ids=["axis=None", "axis=0", "axis=1"])
def test_unary_with_axis(operand_shape, operator, axis):
    """Compare ``np.any`` / ``np.all`` against numpy for each shape and axis."""
    if isinstance(operand_shape, int) and axis == 1:
        pytest.skip("cannot use axis=1 on 1D arrays")
    x1 = numpy.random.randint(-100, 100, size=operand_shape)
    numpy_result = getattr(numpy, operator)(x1, axis=axis)
    x1 = np.array(x1)
    modin_result = getattr(np, operator)(x1, axis=axis)
    assert_scalar_or_array_equal(
        modin_result, numpy_result, err_msg=f"Unary operator {operator} failed."
    )


def test_all_any_where():
    """Check ``all`` and ``any`` on a modin array with array and scalar ``where`` masks."""
    arr = np.array([[0, 1], [1, 0]])

    where = np.array([[False, True], [True, False]])
    result = arr.all(where=where)
    # Result should be np.bool_ True, since where mask isolates the non-zero elements
    assert result

    where = np.array([[True, False], [False, False]])
    result = arr.all(where=where, axis=1)
    assert_scalar_or_array_equal(result, numpy.array([False, True]))

    # Results should contain vacuous Trues in the relevant shape
    result = arr.all(where=False, axis=1)
    assert_scalar_or_array_equal(result, numpy.array([True, True]))
    result = arr.all(where=False, axis=0)
    assert_scalar_or_array_equal(result, numpy.array([True, True]))
    assert bool(arr.all(where=False, axis=None))

    where = np.array([[True, False], [False, True]])
    result = arr.any(where=where)
    # Result should be np.bool_ False, since mask isolates only zero elements
    assert not result

    where = np.array([[False, True], [False, False]])
    result = arr.any(where=where, axis=1)
    assert_scalar_or_array_equal(result, numpy.array([True, False]))

    # Results should contain vacuous Falses in the relevant shape
    result = arr.any(where=False, axis=1)
    assert_scalar_or_array_equal(result, numpy.array([False, False]))
    result = arr.any(where=False, axis=0)
    assert_scalar_or_array_equal(result, numpy.array([False, False]))
    assert not bool(arr.any(where=False, axis=None))


@pytest.mark.parametrize("data", [small_arr_c_2d, small_arr_c_1d], ids=["2D", "1D"])
@pytest.mark.parametrize(
    "operator", ["isfinite", "isinf", "isnan", "iscomplex", "isreal"]
)
def test_unary_with_complex(data, operator):
    """Compare element-wise predicates against numpy on the complex-valued inputs."""
    x1 = data
    numpy_result = getattr(numpy, operator)(x1)
    x1 = np.array(x1)
    modin_result = getattr(np, operator)(x1)
    assert_scalar_or_array_equal(modin_result, numpy_result)


def test_isnat():
    """Compare ``np.isnat`` against numpy on a datetime64 array containing ``NaT``."""
    x1 = numpy.array([numpy.datetime64("2016-01-01"), numpy.datetime64("NaT")])
    numpy_result = numpy.isnat(x1)
    x1 = np.array(x1)
    modin_result = np.isnat(x1)
    assert_scalar_or_array_equal(modin_result, numpy_result)


@pytest.mark.parametrize("data", [small_arr_r_2d, small_arr_r_1d], ids=["2D", "1D"])
@pytest.mark.parametrize("operator", ["isneginf", "isposinf"])
def test_unary_without_complex(data, operator):
    """Compare ``isneginf`` / ``isposinf`` against numpy on the real-valued inputs."""
    x1 = data
    numpy_result = getattr(numpy, operator)(x1)
    x1 = np.array(x1)
    modin_result = getattr(np, operator)(x1)
    assert_scalar_or_array_equal(modin_result, numpy_result)


@pytest.mark.parametrize("data", [small_arr_r_2d, small_arr_r_1d], ids=["2D", "1D"])
def test_logical_not(data):
    """Compare ``np.logical_not`` against numpy on the real-valued inputs."""
    x1 = data
    numpy_result = numpy.logical_not(x1)
    x1 = np.array(x1)
    modin_result = np.logical_not(x1)
    assert_scalar_or_array_equal(modin_result, numpy_result)


@pytest.mark.parametrize("operand1_shape", [100, (3, 100)])
@pytest.mark.parametrize("operand2_shape", [100, (3, 100)])
@pytest.mark.parametrize("operator", ["logical_and", "logical_or", "logical_xor"])
def test_logical_binops(operand1_shape, operand2_shape, operator):
    """Compare binary logical operators against numpy; mixed shapes are expected to fail."""
    if operand1_shape != operand2_shape:
        pytest.xfail("TODO fix broadcasting behavior for binary logic operators")
    x1 = numpy.random.randint(-100, 100, size=operand1_shape)
    x2 = numpy.random.randint(-100, 100, size=operand2_shape)
    numpy_result = getattr(numpy, operator)(x1, x2)
    x1, x2 = np.array(x1), np.array(x2)
    modin_result = getattr(np, operator)(x1, x2)
    assert_scalar_or_array_equal(
        modin_result, numpy_result, err_msg=f"Logic binary operator {operator} failed."
    )

================================================
FILE: modin/tests/numpy/test_array_math.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy import pytest import modin.numpy as np from .utils import assert_scalar_or_array_equal @pytest.mark.parametrize( "data", [ [3, 2, 1, 1], [-87.434, -90.908, -87.152, -84.903], [-87.434, -90.908, np.nan, -87.152, -84.903], ], ids=["ints", "floats", "floats with nan"], ) @pytest.mark.parametrize("op", ["argmin", "argmax"]) def test_argmax_argmin(data, op): numpy_result = getattr(numpy, op)(numpy.array(data)) modin_result = getattr(np, op)(np.array(data)) assert_scalar_or_array_equal(modin_result, numpy_result) def test_rem_mod(): """Tests remainder and mod, which, unlike the C/matlab equivalents, are identical in numpy.""" a = numpy.array([[2, -1], [10, -3]]) b = numpy.array(([-3, 3], [3, -7])) numpy_result = numpy.remainder(a, b) modin_result = np.remainder(np.array(a), np.array(b)) assert_scalar_or_array_equal(modin_result, numpy_result) numpy_result = numpy.mod(a, b) modin_result = np.mod(np.array(a), np.array(b)) assert_scalar_or_array_equal(modin_result, numpy_result) ================================================ FILE: modin/tests/numpy/test_array_shaping.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy import pytest import modin.numpy as np from .utils import assert_scalar_or_array_equal @pytest.mark.parametrize("operand_shape", [100, (100, 3), (3, 100)]) def test_ravel(operand_shape): x = numpy.random.randint(-100, 100, size=operand_shape) numpy_result = numpy.ravel(x) modin_result = np.ravel(np.array(x)) assert_scalar_or_array_equal(modin_result, numpy_result) @pytest.mark.parametrize("operand_shape", [100, (100, 3), (3, 100)]) def test_shape(operand_shape): x = numpy.random.randint(-100, 100, size=operand_shape) numpy_result = numpy.shape(x) modin_result = np.shape(np.array(x)) assert modin_result == numpy_result @pytest.mark.parametrize("operand_shape", [100, (100, 3), (3, 100)]) def test_transpose(operand_shape): x = numpy.random.randint(-100, 100, size=operand_shape) numpy_result = numpy.transpose(x) modin_result = np.transpose(np.array(x)) assert_scalar_or_array_equal(modin_result, numpy_result) @pytest.mark.parametrize("axis", [0, 1]) def test_split_2d(axis): x = numpy.random.randint(-100, 100, size=(6, 4)) # Integer argument: split into N equal arrays along axis numpy_result = numpy.split(x, 2, axis=axis) modin_result = np.split(np.array(x), 2, axis=axis) for modin_entry, numpy_entry in zip(modin_result, numpy_result): assert_scalar_or_array_equal(modin_entry, numpy_entry) # List argument: split at specified indices idxs = [2, 3] numpy_result = numpy.split(x, idxs, axis=axis) modin_result = np.split(np.array(x), idxs, axis=axis) for modin_entry, numpy_entry in zip(modin_result, numpy_result): assert_scalar_or_array_equal(modin_entry, numpy_entry) 
def test_split_2d_oob(): # Supplying an index out of bounds results in an empty sub-array, for which modin # would return a numpy array by default x = numpy.random.randint(-100, 100, size=(6, 4)) idxs = [2, 3, 6] numpy_result = numpy.split(x, idxs) modin_result = np.split(np.array(x), idxs) for modin_entry, numpy_entry in zip(modin_result, numpy_result): assert_scalar_or_array_equal(modin_entry, numpy_entry) def test_split_2d_uneven(): x = np.array(numpy.random.randint(-100, 100, size=(3, 2))) with pytest.raises( ValueError, match="array split does not result in an equal division" ): np.split(x, 2) def test_hstack(): # 2D arrays a = numpy.random.randint(-100, 100, size=(5, 3)) b = numpy.random.randint(-100, 100, size=(5, 2)) numpy_result = numpy.hstack((a, b)) modin_result = np.hstack((np.array(a), np.array(b))) assert_scalar_or_array_equal(modin_result, numpy_result) # 1D arrays a = numpy.random.randint(-100, 100, size=(5,)) b = numpy.random.randint(-100, 100, size=(3,)) numpy_result = numpy.hstack((a, b)) modin_result = np.hstack((np.array(a), np.array(b))) assert_scalar_or_array_equal(modin_result, numpy_result) def test_append(): # Examples taken from numpy docs xs = [[1, 2, 3], [[4, 5, 6], [7, 8, 9]]] numpy_result = numpy.append(*xs) modin_result = np.append(*[np.array(x) for x in xs]) assert_scalar_or_array_equal(modin_result, numpy_result) numpy_result = numpy.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0) modin_result = np.append(np.array([[1, 2, 3], [4, 5, 6]]), [[7, 8, 9]], axis=0) assert_scalar_or_array_equal(modin_result, numpy_result) @pytest.mark.xfail(reason="append error checking is incorrect: see GH#5896") def test_append_error(): with pytest.raises(ValueError): np.append(np.array([[1, 2, 3], [4, 5, 6]]), np.array([7, 8, 9]), axis=0) ================================================ FILE: modin/tests/numpy/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license 
agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy import modin.numpy as np def assert_scalar_or_array_equal(x1, x2, err_msg=""): """ Assert whether the result of the numpy and modin computations are the same. If either argument is a modin array object, then `_to_numpy()` is called on it. The arguments are compared with `numpy.testing.assert_array_equals`. """ if isinstance(x1, np.array): x1 = x1._to_numpy() if isinstance(x2, np.array): x2 = x2._to_numpy() numpy.testing.assert_array_equal(x1, x2, err_msg=err_msg) ================================================ FILE: modin/tests/pandas/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/pandas/conftest.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import pytest from modin.config import Engine, StorageFormat def pytest_collection_modifyitems(items): try: if ( Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base" ): for item in items: if item.name in ( "test_dataframe_dt_index[3s-both-DateCol-_NoDefault.no_default]", "test_dataframe_dt_index[3s-right-DateCol-_NoDefault.no_default]", ): item.add_marker( pytest.mark.xfail( reason="https://github.com/modin-project/modin/issues/6399" ) ) except ImportError: # No engine ... ================================================ FILE: modin/tests/pandas/data/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/pandas/data/blah.csv ================================================ ,Presidents,Presidents,Presidents,Presidents,Presidents,Presidents,Presidents,Presidents,Presidents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Subcontinents,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,
Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Theme
s,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,The
mes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,T
hemes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes
,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes,Themes ,Pure mentions,Pure mentions,Pure mentions,Pure tags,Pure tags,Pure tags,Mentions + Tags,Mentions + Tags,Mentions + Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,Subcontinent Tags,"Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS","Subcontintents, No POTUS",Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme 
Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme 
Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme 
Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme 
Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,Theme Tags,"Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No 
POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS","Themes, No POTUS" 
,IND,DEP,DEP,IND,DEP,DEP,IND,DEP,DEP,IND,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,IND,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,IND,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP
,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,IND,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP
,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP,DEP ,all,obama_mention,trump_mention,pr_tags,obama_tag,trump_tag,all,obama_m+t,trump_m+t,pr_tags,Caribbean,Southern Asia,Middle Africa,Northern Europe,Southern Europe,Western Asia,South America,Polynesia,Antarctica,Eastern Africa,Australia and New Zealand,Western Europe,Western Africa,Eastern Europe,Central America,Northern America,South-eastern Asia,Southern Africa,Eastern Asia,Northern Africa,Melanesia,Micronesia,Central Asia,pr_tags_np,Caribbean_np,Southern Asia_np,Middle Africa_np,Northern Europe_np,Southern Europe_np,Western Asia_np,South 
America_np,Polynesia_np,Antarctica_np,Eastern Africa_np,Australia and New Zealand_np,Western Europe_np,Western Africa_np,Eastern Europe_np,Central America_np,Northern America_np,South-eastern Asia_np,Southern Africa_np,Eastern Asia_np,Northern Africa_np,Melanesia_np,Micronesia_np,Central Asia_np,pr_themes,Top/News,Top/News/Sports,Top/Features/Travel/Guides/Activities and Interests/Golf,Top/News/Sports/Golf,Top/News/Business,Top/Opinion/Opinion,Top/Opinion,Top/News/Education,Top/Classifieds/Job Market/Job Categories/Education,Top/Features/Travel/Guides/Destinations/North America/United States/New York/New York City,Top/News/U.S./Mid-Atlantic,Top/Opinion/Opinion/Op-Ed,Top/Features/Travel/Guides/Destinations/North America/United States,Top/Features/Travel/Guides/Destinations/North America,Top/News/Technology,Top/News/U.S.,Top/News/New York and Region,"Top/News/U.S./U.S. States, Territories and Possessions/New York",Top/Features/Travel/Guides/Destinations/North America/United States/California,Top/Features/Books,"Top/News/U.S./U.S. States, Territories and Possessions/California",Top/Opinion/Opinion/Editorials,Top/Features/Travel/Guides/Activities and Interests/Family,Top/Opinion/Opinion/Op-Ed/Contributors,Top/Features/Travel/Guides/Destinations/Europe,Top/Features/Movies/News and Features,Top/Features/Arts/Music,Top/Features/Travel/Guides/Activities and Interests/Music,Top/Features/Arts,Top/Classifieds/Paid Death Notices,Top/Features/Movies,Top/Features/Travel/Guides/Destinations/Asia/China,"Top/Classifieds/Job Market/Job Categories/Marketing, Advertising and PR",Top/Features/Travel/Guides/Destinations/Asia,"Top/News/U.S./U.S. 
States, Territories and Possessions/Arizona",Top/Features/Travel/Guides/Destinations/North America/United States/Arizona,Top/News/U.S./Rockies,Top/Features/Travel/Guides/Destinations/North America/United States/New Jersey,Top/Features/Books/Book Reviews,Top/Features/Travel/Guides/Destinations/Asia/Pakistan,Top/News/World/Asia Pacific,Top/News/World/Countries and Territories/Pakistan,Top/News/World,Top/News/World/Countries and Territories/Afghanistan,Top/Features/Travel/Guides/Destinations/North America/United States/South Carolina,Top/Features/Travel/Guides/Destinations/Middle East/Israel,Top/Features/Travel/Guides/Destinations/Middle East,Top/News/World/Middle East,Top/Features/Travel/Guides/Destinations/Middle East/Iran,Top/News/World/Countries and Territories/Israel,Top/Features/Travel/Guides/Destinations/North America/United States/Colorado,Top/News/New York and Region/New Jersey,Top/Features/Travel/Guides/Destinations/Central and South America,Top/Features/Travel/Guides/Destinations/Central and South America/Colombia,Top/Features/Travel/Guides/Destinations/Africa/Kenya,Top/Features/Travel/Guides/Activities and Interests/Food and Wine,"Top/News/U.S./U.S. 
States, Territories and Possessions/Massachusetts",Top/News/Sports/Pro Football/National Football League/Washington Redskins,education and schools,teachers and school employees,privacy,politics and government,law and legislation,tests and testing,computers and the internet,finances,abortion,no index terms from nytimes,privatization,books and literature,motion pictures,united states politics and government,christians and christianity,religion and churches,advertising and marketing,budgets and budgeting,elections,medicine and health,presidents and presidency (us),presidential elections (us),minorities (us),recordings (audio),handicapped,homosexuality,labor,suits and litigation,colleges and universities,recordings (video),blacks,public opinion,primaries,lobbying and lobbyists,hispanic-americans,"armament, defense and military forces",appointments and executive changes,copyrights,philanthropy,mathematics,recession and depression,reading and writing skills,writing and writers,ratings and rating systems,jews,language and languages,television,computer software,police,taxation,governors (us),oil (petroleum) and gasoline,news and news media,global warming,environment,islam,presidential election of 1988,drug abuse and traffic,marijuana,women,church-state relations,editorials,gun control,election issues,immigration and refugees,sex,"awards, decorations and honors",terrorism,nazi policies toward jews and minorities,weather,electronic mail,quotation of the day,decisions and verdicts,equal educational opportunities,libraries and librarians,advertising,baseball,illegal aliens,media,crime and criminals,roads and traffic,automobiles,ethics,art,property taxes,speech,freedom of speech and expression,political advertising,reviews,sex crimes,prostitution,insurance,hurricanes and tropical storms,hurricane katrina,floods,election results,strikes,united states armament and defense,basketball,horse racing,united states international relations,international relations,firearms,health 
insurance and managed care,health insurance,discrimination,music,airlines and airplanes,drugs (pharmaceuticals),diseases and conditions,banks and banking,college athletics,football,impeachment,frauds and swindling,new year,correction stories,trees and shrubs,home repairs,olympic games,apparel,home furnishings,earthquakes,home repairs and improvements,world trade center (nyc),fish and other marine life,office buildings and commercial properties,noise,legislatures and parliaments,tuition,presidential election of 2004,mayors,soccer,restaurants,unemployment,biographical information,radio,"conventions, national (us)",computer and video games,presidential election of 2008,super bowl,demonstrations and riots,marriages,deaths (obituaries),accidents and safety,standards and standardization,referendums,exercise,children and youth,murders and attempted murders,international trade and world market,wages and salaries,coaches and managers,archaeology and anthropology,palestinians,birth control and family planning,economic conditions and trends,united states economy,telephones and telecommunications,restoration and rehabilitation,dairy products,animals,sales,"prices (fares, fees and rates)",energy and power,atomic weapons,holidays and special occasions,medicaid,medicare,christmas,"war crimes, genocide and crimes against humanity",presidential election of 2000,fires and firefighters,fires and firemen,air pollution,robberies and thefts,conventions and conferences,food,diet and nutrition,stocks and bonds,electric light and power,light,blackouts and brownouts (electrical),theater,hijacking,pentagon building,"suspensions, dismissals and resignations",scholarships and fellowships,newspapers,travel and vacations,building (construction),games,torture,cellular telephones,sentences (criminal),bridges and tunnels,affirmative action,credit,birds,space,postal service,pornography and obscenity,steroids,embargoes and economic sanctions,smoking and tobacco,social security (us),child 
care,inventions and patents,vaccination and immunization,prisons and prisoners,retirement,currency,transit systems,subways,snow and snowstorms,housing,priests,company reports,corporations,layoffs and job reductions,magazines,aged,viruses,biological and chemical warfare,opera,parades,states (us),constitutional amendments,cancer,pensions and retirement plans,child abuse and neglect,government employees,culture,blacks (in us),radiation,documentary films and programs,retail stores and trade,spanish language,"mergers, acquisitions and divestitures",small business,poetry and poets,rock music,identification devices,space shuttle,atomic energy,interest rates,police brutality and misconduct,science and technology,running,marathon running,research,weight,homeless persons,cocaine and crack cocaine,suicides and suicide attempts,bicycles and bicycling,buses,pregnancy and obstetrics,contests and prizes,vetoes (us),jewels and jewelry,academy awards (oscars),parties (social),festivals,"divorce, separations and annulments",gas (fuel),photography,comedy and humor,world series,hotels and motels,serial murders,textiles,gambling,cooking and cookbooks,recipes,beverages,tennis,shoes and boots,dogs,"hockey, ice",extradition,boxing,"indians, american",violence,chemicals,sports of the times (times column),arson,vietnam war,boycotts,toys,cruises,ships and shipping,trade shows and fairs,mental health and disorders,wines,alcoholic beverages,dancing,golf,auctions,mutual funds,swimming,historic buildings and sites,weddings and engagements,freedom and human rights,athletics and sports,draft and recruitment (sports),hospitals,genetics and heredity,foreign aid,anthrax,acquired immune deficiency syndrome,insects,consumer protection,mines and mining,blood,doctors,nursing and nurses,airports,water,death and dying,dna (deoxyribonucleic acid),third world and developing countries,food contamination and poisoning,agriculture,livestock,acquired immune deficiency syndrome (aids),no index terms,regulation 
and deregulation of industry,taxicabs and taxicab drivers,meat,babies,shortages,nasdaq composite index,government bonds,security and warning systems,grain,transplants,freedom of the press,metals and minerals,computer security,bombs and explosives,population,mortgages,customs (tariff),farmers,automobile racing,biology and biochemistry,anatomy and physiology,production,factories and industrial plants,track and field,summer games (olympics),foreign investments,stadiums and arenas,foreign service,ncaa basketball tournament,waste materials and disposal,hunting and trapping,deportation,casinos,world cup (soccer),reproduction (biological),surgery and surgeons,kidnapping,heart,alcohol abuse,domestic violence,capital punishment,hostages,world war ii (1939-45),war and revolution,civil war and guerrilla warfare,jury system,entertainment and amusements,child abuse,sexual harassment,war crimes and criminals,censorship,railroads,asylum (political),legal profession,courts,political prisoners,prisoners of war,men,dow jones stock average,probation and parole,drunken and reckless driving,organized crime,futures and options trading,securities and commodities violations,assaults,physics,bribery,debating,recalls and bans of products,credit and money cards,drought,fines (penalties),perjury,bars,trades (sports),skiing,attacks on police,race,bankruptcies,bridge (card game),renting and leasing,condominiums,commuting,stations and terminals (passenger),shutdowns (institutional),beaches,families and family life,brain,book trade,futures trading,federal taxes (us),tax credits,assassinations and attempted assassinations,recycling of waste materials,automobile insurance and liability,delays (transportation),transportation,flowers and plants,steel and iron,chemistry,anti-semitism,soft drinks,consumer behavior,parks and other recreation areas,leisure,gardens and gardening,figure skating,ice skating,"fishing, sport",executives and management,coups d'etat and attempted coups 
d'etat,supermarkets,rescues,accounting and accountants,rain,judges,water pollution,satellites,trucks and trucking,playoff games,"fishing, commercial",antitrust actions and laws,royal family,personal finances,parking,utility vehicles and other light trucks,racketeering and racketeers,interscholastic athletics,chess,jazz,breast,explosions,foster care,classical music,intelligence,navies,architecture,organized labor,bakeries and baked products,espionage,local government,interior design,guards,computer chips,boats and boating,forests and forestry,zoning,hiring and promotion,area planning and renewal,marketing and merchandising,embezzlement,boards of directors,smuggling,land use policies,monuments and memorials,endangered and extinct species,"age, chronological",welfare (us),child custody and support,wiretapping and other eavesdropping devices and methods,ferries,history,collectors and collections,missiles and missile defense systems,arms control and limitation and disarmament,constitutions,shopping centers,tax evasion,design,free agents (sports),genetic engineering,pilots,military aircraft,liability for products,real estate,veterans,antiques,missiles,furniture,drug traffic,discount selling,savings,fruit,deaths,united states open (tennis),english language,records and achievements,united states foreign service,recording equipment,nightclubs and cabarets,beer,brokers and brokerage firms,buildings (structures),"arbitration, conciliation and mediation",hazardous and toxic substances,defense contracts,arms sales abroad,military personnel,missing persons,concerts and recitals,drug addiction and abuse,geographic profiles,geography,fast food industry,labeling and labels,military bases and installations,vice presidents and vice presidency (us),volunteers,layoffs (labor),income,gifts,treaties,shootings,city councils,social conditions and trends,urban areas,shows (exhibits),data processing (computers),stock prices and trading volume,office buildings,immigration and 
emigration,kurds,income tax,software products,personal computers,summit conferences,prices,bombs and bomb plots,racial relations,search and seizure,museums,"health, personal",contracts,industry profiles,refugees and expatriates,trials,disclosure of information,unemployment and job market,suburbs,special sections,reform and reorganization,cooperatives,federal aid (us),relocation of business,cable television,attorneys general,electronics,book reviews,"names, organizational","minorities (ethnic, racial, religious)","new models, design and products",terms not available,account changes,surveys and series,military action,whitewater case,company and organization profiles,savings and loan associations,art shows,independence movements,life styles,suits and claims against government,presidential election of 1996,forecasts,threats and threatening messages,persian gulf war,pr_themes_np,Top/News_np,Top/News/Sports_np,Top/Features/Travel/Guides/Activities and Interests/Golf_np,Top/News/Sports/Golf_np,Top/News/Business_np,Top/Opinion/Opinion_np,Top/Opinion_np,Top/News/Education_np,Top/Classifieds/Job Market/Job Categories/Education_np,Top/Features/Travel/Guides/Destinations/North America/United States/New York/New York City_np,Top/News/U.S./Mid-Atlantic_np,Top/Opinion/Opinion/Op-Ed_np,Top/Features/Travel/Guides/Destinations/North America/United States_np,Top/Features/Travel/Guides/Destinations/North America_np,Top/News/Technology_np,Top/News/U.S._np,Top/News/New York and Region_np,"Top/News/U.S./U.S. States, Territories and Possessions/New York_np",Top/Features/Travel/Guides/Destinations/North America/United States/California_np,Top/Features/Books_np,"Top/News/U.S./U.S. 
States, Territories and Possessions/California_np",Top/Opinion/Opinion/Editorials_np,Top/Features/Travel/Guides/Activities and Interests/Family_np,Top/Opinion/Opinion/Op-Ed/Contributors_np,Top/Features/Travel/Guides/Destinations/Europe_np,Top/Features/Movies/News and Features_np,Top/Features/Arts/Music_np,Top/Features/Travel/Guides/Activities and Interests/Music_np,Top/Features/Arts_np,Top/Classifieds/Paid Death Notices_np,Top/Features/Movies_np,Top/Features/Travel/Guides/Destinations/Asia/China_np,"Top/Classifieds/Job Market/Job Categories/Marketing, Advertising and PR_np",Top/Features/Travel/Guides/Destinations/Asia_np,"Top/News/U.S./U.S. States, Territories and Possessions/Arizona_np",Top/Features/Travel/Guides/Destinations/North America/United States/Arizona_np,Top/News/U.S./Rockies_np,Top/Features/Travel/Guides/Destinations/North America/United States/New Jersey_np,Top/Features/Books/Book Reviews_np,Top/Features/Travel/Guides/Destinations/Asia/Pakistan_np,Top/News/World/Asia Pacific_np,Top/News/World/Countries and Territories/Pakistan_np,Top/News/World_np,Top/News/World/Countries and Territories/Afghanistan_np,Top/Features/Travel/Guides/Destinations/North America/United States/South Carolina_np,Top/Features/Travel/Guides/Destinations/Middle East/Israel_np,Top/Features/Travel/Guides/Destinations/Middle East_np,Top/News/World/Middle East_np,Top/Features/Travel/Guides/Destinations/Middle East/Iran_np,Top/News/World/Countries and Territories/Israel_np,Top/Features/Travel/Guides/Destinations/North America/United States/Colorado_np,Top/News/New York and Region/New Jersey_np,Top/Features/Travel/Guides/Destinations/Central and South America_np,Top/Features/Travel/Guides/Destinations/Central and South America/Colombia_np,Top/Features/Travel/Guides/Destinations/Africa/Kenya_np,Top/Features/Travel/Guides/Activities and Interests/Food and Wine_np,"Top/News/U.S./U.S. 
States, Territories and Possessions/Massachusetts_np",Top/News/Sports/Pro Football/National Football League/Washington Redskins_np,education and schools_np,teachers and school employees_np,privacy_np,politics and government_np,law and legislation_np,tests and testing_np,computers and the internet_np,finances_np,abortion_np,no index terms from nytimes_np,privatization_np,books and literature_np,motion pictures_np,united states politics and government_np,christians and christianity_np,religion and churches_np,advertising and marketing_np,budgets and budgeting_np,elections_np,medicine and health_np,presidents and presidency (us)_np,presidential elections (us)_np,minorities (us)_np,recordings (audio)_np,handicapped_np,homosexuality_np,labor_np,suits and litigation_np,colleges and universities_np,recordings (video)_np,blacks_np,public opinion_np,primaries_np,lobbying and lobbyists_np,hispanic-americans_np,"armament, defense and military forces_np",appointments and executive changes_np,copyrights_np,philanthropy_np,mathematics_np,recession and depression_np,reading and writing skills_np,writing and writers_np,ratings and rating systems_np,jews_np,language and languages_np,television_np,computer software_np,police_np,taxation_np,governors (us)_np,oil (petroleum) and gasoline_np,news and news media_np,global warming_np,environment_np,islam_np,presidential election of 1988_np,drug abuse and traffic_np,marijuana_np,women_np,church-state relations_np,editorials_np,gun control_np,election issues_np,immigration and refugees_np,sex_np,"awards, decorations and honors_np",terrorism_np,nazi policies toward jews and minorities_np,weather_np,electronic mail_np,quotation of the day_np,decisions and verdicts_np,equal educational opportunities_np,libraries and librarians_np,advertising_np,baseball_np,illegal aliens_np,media_np,crime and criminals_np,roads and traffic_np,automobiles_np,ethics_np,art_np,property taxes_np,speech_np,freedom of speech and expression_np,political 
advertising_np,reviews_np,sex crimes_np,prostitution_np,insurance_np,hurricanes and tropical storms_np,hurricane katrina_np,floods_np,election results_np,strikes_np,united states armament and defense_np,basketball_np,horse racing_np,united states international relations_np,international relations_np,firearms_np,health insurance and managed care_np,health insurance_np,discrimination_np,music_np,airlines and airplanes_np,drugs (pharmaceuticals)_np,diseases and conditions_np,banks and banking_np,college athletics_np,football_np,impeachment_np,frauds and swindling_np,new year_np,correction stories_np,trees and shrubs_np,home repairs_np,olympic games_np,apparel_np,home furnishings_np,earthquakes_np,home repairs and improvements_np,world trade center (nyc)_np,fish and other marine life_np,office buildings and commercial properties_np,noise_np,legislatures and parliaments_np,tuition_np,presidential election of 2004_np,mayors_np,soccer_np,restaurants_np,unemployment_np,biographical information_np,radio_np,"conventions, national (us)_np",computer and video games_np,presidential election of 2008_np,super bowl_np,demonstrations and riots_np,marriages_np,deaths (obituaries)_np,accidents and safety_np,standards and standardization_np,referendums_np,exercise_np,children and youth_np,murders and attempted murders_np,international trade and world market_np,wages and salaries_np,coaches and managers_np,archaeology and anthropology_np,palestinians_np,birth control and family planning_np,economic conditions and trends_np,united states economy_np,telephones and telecommunications_np,restoration and rehabilitation_np,dairy products_np,animals_np,sales_np,"prices (fares, fees and rates)_np",energy and power_np,atomic weapons_np,holidays and special occasions_np,medicaid_np,medicare_np,christmas_np,"war crimes, genocide and crimes against humanity_np",presidential election of 2000_np,fires and firefighters_np,fires and firemen_np,air pollution_np,robberies and thefts_np,conventions and 
conferences_np,food_np,diet and nutrition_np,stocks and bonds_np,electric light and power_np,light_np,blackouts and brownouts (electrical)_np,theater_np,hijacking_np,pentagon building_np,"suspensions, dismissals and resignations_np",scholarships and fellowships_np,newspapers_np,travel and vacations_np,building (construction)_np,games_np,torture_np,cellular telephones_np,sentences (criminal)_np,bridges and tunnels_np,affirmative action_np,credit_np,birds_np,space_np,postal service_np,pornography and obscenity_np,steroids_np,embargoes and economic sanctions_np,smoking and tobacco_np,social security (us)_np,child care_np,inventions and patents_np,vaccination and immunization_np,prisons and prisoners_np,retirement_np,currency_np,transit systems_np,subways_np,snow and snowstorms_np,housing_np,priests_np,company reports_np,corporations_np,layoffs and job reductions_np,magazines_np,aged_np,viruses_np,biological and chemical warfare_np,opera_np,parades_np,states (us)_np,constitutional amendments_np,cancer_np,pensions and retirement plans_np,child abuse and neglect_np,government employees_np,culture_np,blacks (in us)_np,radiation_np,documentary films and programs_np,retail stores and trade_np,spanish language_np,"mergers, acquisitions and divestitures_np",small business_np,poetry and poets_np,rock music_np,identification devices_np,space shuttle_np,atomic energy_np,interest rates_np,police brutality and misconduct_np,science and technology_np,running_np,marathon running_np,research_np,weight_np,homeless persons_np,cocaine and crack cocaine_np,suicides and suicide attempts_np,bicycles and bicycling_np,buses_np,pregnancy and obstetrics_np,contests and prizes_np,vetoes (us)_np,jewels and jewelry_np,academy awards (oscars)_np,parties (social)_np,festivals_np,"divorce, separations and annulments_np",gas (fuel)_np,photography_np,comedy and humor_np,world series_np,hotels and motels_np,serial murders_np,textiles_np,gambling_np,cooking and 
cookbooks_np,recipes_np,beverages_np,tennis_np,shoes and boots_np,dogs_np,"hockey, ice_np",extradition_np,boxing_np,"indians, american_np",violence_np,chemicals_np,sports of the times (times column)_np,arson_np,vietnam war_np,boycotts_np,toys_np,cruises_np,ships and shipping_np,trade shows and fairs_np,mental health and disorders_np,wines_np,alcoholic beverages_np,dancing_np,golf_np,auctions_np,mutual funds_np,swimming_np,historic buildings and sites_np,weddings and engagements_np,freedom and human rights_np,athletics and sports_np,draft and recruitment (sports)_np,hospitals_np,genetics and heredity_np,foreign aid_np,anthrax_np,acquired immune deficiency syndrome_np,insects_np,consumer protection_np,mines and mining_np,blood_np,doctors_np,nursing and nurses_np,airports_np,water_np,death and dying_np,dna (deoxyribonucleic acid)_np,third world and developing countries_np,food contamination and poisoning_np,agriculture_np,livestock_np,acquired immune deficiency syndrome (aids)_np,no index terms_np,regulation and deregulation of industry_np,taxicabs and taxicab drivers_np,meat_np,babies_np,shortages_np,nasdaq composite index_np,government bonds_np,security and warning systems_np,grain_np,transplants_np,freedom of the press_np,metals and minerals_np,computer security_np,bombs and explosives_np,population_np,mortgages_np,customs (tariff)_np,farmers_np,automobile racing_np,biology and biochemistry_np,anatomy and physiology_np,production_np,factories and industrial plants_np,track and field_np,summer games (olympics)_np,foreign investments_np,stadiums and arenas_np,foreign service_np,ncaa basketball tournament_np,waste materials and disposal_np,hunting and trapping_np,deportation_np,casinos_np,world cup (soccer)_np,reproduction (biological)_np,surgery and surgeons_np,kidnapping_np,heart_np,alcohol abuse_np,domestic violence_np,capital punishment_np,hostages_np,world war ii (1939-45)_np,war and revolution_np,civil war and guerrilla warfare_np,jury system_np,entertainment 
and amusements_np,child abuse_np,sexual harassment_np,war crimes and criminals_np,censorship_np,railroads_np,asylum (political)_np,legal profession_np,courts_np,political prisoners_np,prisoners of war_np,men_np,dow jones stock average_np,probation and parole_np,drunken and reckless driving_np,organized crime_np,futures and options trading_np,securities and commodities violations_np,assaults_np,physics_np,bribery_np,debating_np,recalls and bans of products_np,credit and money cards_np,drought_np,fines (penalties)_np,perjury_np,bars_np,trades (sports)_np,skiing_np,attacks on police_np,race_np,bankruptcies_np,bridge (card game)_np,renting and leasing_np,condominiums_np,commuting_np,stations and terminals (passenger)_np,shutdowns (institutional)_np,beaches_np,families and family life_np,brain_np,book trade_np,futures trading_np,federal taxes (us)_np,tax credits_np,assassinations and attempted assassinations_np,recycling of waste materials_np,automobile insurance and liability_np,delays (transportation)_np,transportation_np,flowers and plants_np,steel and iron_np,chemistry_np,anti-semitism_np,soft drinks_np,consumer behavior_np,parks and other recreation areas_np,leisure_np,gardens and gardening_np,figure skating_np,ice skating_np,"fishing, sport_np",executives and management_np,coups d'etat and attempted coups d'etat_np,supermarkets_np,rescues_np,accounting and accountants_np,rain_np,judges_np,water pollution_np,satellites_np,trucks and trucking_np,playoff games_np,"fishing, commercial_np",antitrust actions and laws_np,royal family_np,personal finances_np,parking_np,utility vehicles and other light trucks_np,racketeering and racketeers_np,interscholastic athletics_np,chess_np,jazz_np,breast_np,explosions_np,foster care_np,classical music_np,intelligence_np,navies_np,architecture_np,organized labor_np,bakeries and baked products_np,espionage_np,local government_np,interior design_np,guards_np,computer chips_np,boats and boating_np,forests and 
forestry_np,zoning_np,hiring and promotion_np,area planning and renewal_np,marketing and merchandising_np,embezzlement_np,boards of directors_np,smuggling_np,land use policies_np,monuments and memorials_np,endangered and extinct species_np,"age, chronological_np",welfare (us)_np,child custody and support_np,wiretapping and other eavesdropping devices and methods_np,ferries_np,history_np,collectors and collections_np,missiles and missile defense systems_np,arms control and limitation and disarmament_np,constitutions_np,shopping centers_np,tax evasion_np,design_np,free agents (sports)_np,genetic engineering_np,pilots_np,military aircraft_np,liability for products_np,real estate_np,veterans_np,antiques_np,missiles_np,furniture_np,drug traffic_np,discount selling_np,savings_np,fruit_np,deaths_np,united states open (tennis)_np,english language_np,records and achievements_np,united states foreign service_np,recording equipment_np,nightclubs and cabarets_np,beer_np,brokers and brokerage firms_np,buildings (structures)_np,"arbitration, conciliation and mediation_np",hazardous and toxic substances_np,defense contracts_np,arms sales abroad_np,military personnel_np,missing persons_np,concerts and recitals_np,drug addiction and abuse_np,geographic profiles_np,geography_np,fast food industry_np,labeling and labels_np,military bases and installations_np,vice presidents and vice presidency (us)_np,volunteers_np,layoffs (labor)_np,income_np,gifts_np,treaties_np,shootings_np,city councils_np,social conditions and trends_np,urban areas_np,shows (exhibits)_np,data processing (computers)_np,stock prices and trading volume_np,office buildings_np,immigration and emigration_np,kurds_np,income tax_np,software products_np,personal computers_np,summit conferences_np,prices_np,bombs and bomb plots_np,racial relations_np,search and seizure_np,museums_np,"health, personal_np",contracts_np,industry profiles_np,refugees and expatriates_np,trials_np,disclosure of information_np,unemployment and 
job market_np,suburbs_np,special sections_np,reform and reorganization_np,cooperatives_np,federal aid (us)_np,relocation of business_np,cable television_np,attorneys general_np,electronics_np,book reviews_np,"names, organizational_np","minorities (ethnic, racial, religious)_np","new models, design and products_np",terms not available_np,account changes_np,surveys and series_np,military action_np,whitewater case_np,company and organization profiles_np,savings and loan associations_np,art shows_np,independence movements_np,life styles_np,suits and claims against government_np,presidential election of 1996_np,forecasts_np,threats and threatening messages_np,persian gulf war_np 1,blah,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, ================================================ FILE: modin/tests/pandas/data/issue_1930.csv ================================================ ,col1,col2,col3,col4,col5 0,0,4,8,12,0 1,1,5,9,13,0 2,2,6,10,14,0 3,3,7,11,15,0 ================================================ FILE: modin/tests/pandas/data/issue_2074.csv ================================================ one,two, three, five, six, seven, eight three,three, five, six, seven, eight, nine one,four, three, five, six, seven, eight one,two, three, five, six, seven, eight one,two, three, five, six, seven, eight one,two, three, five, six, seven, eight three,four, five, six, 
seven, eight, nine one,two, three, five, six, seven, eight three,four, five, six, seven, eight, nine three,four, five, six, seven, eight, nine three,four, five, six, seven, eight, nine three,four, five, six, seven, eight, nine ================================================ FILE: modin/tests/pandas/data/issue_2239.csv ================================================ 1585542839.000000, 1585542839.000000, 1585542839.000000 32.000000, 32.000000, 32.000000 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 -38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 -38,-14,51 -38,-13,51 -38,-14,51 -38,-14,50 
-38,-13,51 -38,-14,50 -38,-14,51 -38,-13,51 ================================================ FILE: modin/tests/pandas/data/issue_3119.csv ================================================ ,a,b,c i1,0,1,2 i2,3,4,5 i3,6,7,8 i4,9,10,11 ================================================ FILE: modin/tests/pandas/data/issue_4543.csv ================================================ str_data,float_data,country fanta,3.14,usa cocacola,9.8,france sprite,89.2,china ================================================ FILE: modin/tests/pandas/data/issue_976.csv ================================================ 1;11800000560005;11800000560005; ;;-;. ;. i; ; ;105.6000 1;10200007400477;10200007400477; ;;-;. ;;³ ; ;696.6400 1;11100008540930;11100008540930; ;2;9;. ;.; ; ;124.4800 1;12300000051493;12300000051493; ;;50;. ;.;'- ; ;-0.4700 1;12300000117460;12300000117460; ³ ;;60;. ;;'- ; ;221.0400 ================================================ FILE: modin/tests/pandas/data/multiple_csv/test_data0.csv ================================================ a,b,c 0,True,x 1,False,y 2,True,z 3,False,w ================================================ FILE: modin/tests/pandas/data/multiple_csv/test_data1.csv ================================================ a,b,c 4,True,m 5,False,n 6,True,t 7,True,l ================================================ FILE: modin/tests/pandas/data/newlines.csv ================================================ col1,col2,col3,col4 "This is a very long string with several newline characters that will probably cause some problem for Modin and I suspect that we will hopefully reproduce the issue",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 "And there is another string with several newline characters that will probably cause some problem for Modin and I suspect that we will hopefully reproduce the issue",2,3,4 "I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 "And there is another string with several newline characters that will probably cause some problem for Modin and 
I suspect that we will hopefully reproduce the issue",2,"And there is another string with several newline characters that will probably cause some problem for Modin and I suspect that we will hopefully reproduce the issue",4 "I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 "And there is another string with several newline characters that will probably cause some problem for Modin and I suspect that we will hopefully reproduce the issue",2,3,4 ================================================ FILE: modin/tests/pandas/data/test_categories.csv ================================================ 111,AAA 222,BBB 333,CCC ================================================ FILE: modin/tests/pandas/data/test_categories.json ================================================ {"one":{"0":111,"1":222,"2":333},"two":{"0":"AAA","1":"BBB","2":"CCC"}} ================================================ FILE: modin/tests/pandas/data/test_data.fwf ================================================ ACW000116041961TAVG -142 k 183 k 419 k 720 k 1075 k 1546 k 1517 k 1428 k 1360 k 1121 k 457 k -92 k ACW000116041962TAVG 60 k 32 k -207 k 582 k 855 k 1328 k 1457 k 1340 k 1110 k 941 k 270 k -179 k ACW000116041963TAVG -766 k -606 k -152 k 488 k 1171 k 1574 k 1567 k 1543 k 1279 k 887 k 513 k -161 k ACW000116041964TAVG 9 k -138 k 2 k 685 k 1166 k 1389 k 1453 k 1504 k 1168 k 735 k 493 k 59 k ACW000116041965TAVG -9 k -158 k -15 k 537 k 934 k 1447 k 1434 k 1424 k 1324 k 921 k -22 k -231 k ACW000116041966TAVG -490 k -614 k 108 k 246 k 1082 k 1642 k 1620 k 1471 k 1195 k 803 k 329 k 2 k ACW000116041967TAVG -270 k 36 k 397 k 481 k 1052 k 1373 k 1655 k 1598 k 1318 k 997 k 559 k -96 k ACW000116041968TAVG -306 k -183 k 220 k 714 k 935 k 1635 k 1572 k 1718 k 1331 k 781 k 180 k -56 k ACW000116041969TAVG -134 k -494 k -185 k 497 k 962 k 1634 k 1687 k 1773 k 1379 k 932 k 321 k -275 k ACW000116041970TAVG -483 k -704 k -75 k 261 k 1093 k 1724 k 1470 k 1609 k 1163 k 836 k 300 k 73 k ACW000116041971TAVG 
-6 k 83 k -40 k 472 k 1180 k 1411 k 1700 k 1600 k 1165 k 908 k 361 k 383 k ACW000116041972TAVG -377 k -4 k 250 k 556 k 1117 k 1444 k 1778 k 1545 k 1073 k 797 k 481 k 404 k ACW000116041973TAVG 61 k 169 k 453 k 472 k 1075 k 1545 k 1866 k 1579 k 1199 k 563 k 154 k 11 k ACW000116041974TAVG 191 k 209 k 339 k 748 k 1094 k 1463 k 1498 k 1541 k 1319 k 585 k 428 k 335 k ACW000116041975TAVG 346 k 88 k 198 k 488 k 1165 k 1483 k 1756 k 1906 k 1374 k 845 k 406 k 387 k ACW000116041976TAVG -163 k -62 k -135 k 502 k 1128 k 1461 k 1822 k 1759 k 1136 k 715 k 458 k -205 k ACW000116041977TAVG -192 k -279 k 234 k 332 k 1128 k 1566 k 1565 k 1556 k 1126 k 949 k 421 k 162 k ACW000116041978TAVG 55 k -354 k 66 k 493 k 1155 k 1552 k 1564 k 1555 k 1061 k 932 k 688 k -464 k ACW000116041979TAVG -618 k -632 k 35 k 474 k 993 k 1566 k 1484 k 1483 k 1229 k 647 k 412 k -40 k ACW000116041980TAVG -340 k -500 k -35 k 524 k 1071 k 1534 k 1655 k 1502 k 1269 k 660 k 138 k 125 k ================================================ FILE: modin/tests/pandas/data/test_data.json ================================================ {"Duration":60,"Pulse":110,"Maxpulse":130,"Calories":409} {"Duration":60,"Pulse":117,"Maxpulse":145,"Calories":479} {"Duration":60,"Pulse":103,"Maxpulse":135,"Calories":340} {"Duration":45,"Pulse":109,"Maxpulse":175,"Calories":282} {"Duration":45,"Pulse":117,"Maxpulse":148,"Calories":406} {"Duration":60,"Pulse":102,"Maxpulse":127,"Calories":300} ================================================ FILE: modin/tests/pandas/data/test_delim.csv ================================================ a|b|c|d|e 1|2|3|4|5 2|3|4|5|6 3|4|5|6|7 4|5|6|7|8 5|6|7|8|9 6|7|8|9|0 ================================================ FILE: modin/tests/pandas/data/test_different_columns_in_rows.json ================================================ {"a1": 1} {"a2": 1} {"a3": 2} {"a4": 1} {"a5": 2} {"a6": 1} {"a7": 2} {"a8": 1} {"a9": 2} {"a10": 1} {"a11": 2} {"a12": 1} {"a13": 2} {"a14": 1} {"a15": 2} {"a16": 2} 
================================================ FILE: modin/tests/pandas/data/test_null_col.csv ================================================ a,b,c 1,1, 2,2, 3,3, ================================================ FILE: modin/tests/pandas/data/test_time_parsing.csv ================================================ timestamp,year,month,date,symbol,high,low,open,close,spread,volume 2010-04-01 00:00:00,2010,04,01,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049 2010-04-01 00:30:00,2010,04,01,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251 2010-04-01 01:00:00,2010,04,01,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577 ================================================ FILE: modin/tests/pandas/data/test_usecols.csv ================================================ a,b,c,d,e 1,2,3,4,5 2,3,4,5,6 3,4,5,6,7 4,5,6,7,8 5,6,7,8,9 6,7,8,9,0 ================================================ FILE: modin/tests/pandas/dataframe/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
================================================ FILE: modin/tests/pandas/dataframe/test_binary.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions, StorageFormat from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) from modin.core.storage_formats.pandas.query_compiler_caster import ( _assert_casting_functions_wrap_same_implementation, ) from modin.tests.pandas.utils import ( CustomIntegerForAddition, NonCommutativeMultiplyInteger, create_test_dfs, default_to_pandas_ignore_string, df_equals, eval_general, test_data, test_data_keys, test_data_values, ) from modin.tests.test_utils import ( df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from modin.utils import get_current_execution NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) @pytest.mark.parametrize( "other", [ lambda df, axis: 4, lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]), lambda df, axis: { label: idx + 1 for idx, label in enumerate(df.axes[0 if axis == "rows" else 1]) }, lambda df, axis: { label if idx % 2 else f"random_key{idx}": idx + 1 for idx, label in enumerate(df.axes[0 if axis == "rows" else 1][::-1]) }, ], ids=[ "scalar", "series_or_list", "dictionary_keys_equal_columns", "dictionary_keys_unequal_columns", ], ) @pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize( "op", [ *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_math_functions(other, axis, op, backend): data = test_data["float_nan_data"] if (op == "floordiv" or op == "rfloordiv") and axis == "rows": # lambda == "series_or_list" pytest.xfail(reason="different behavior") if op == "rmod" and axis == "rows": # lambda == "series_or_list" pytest.xfail(reason="different behavior") if op in ("mod", "rmod") and backend == "pyarrow": pytest.skip(reason="These functions are not implemented in pandas itself") eval_general( *create_test_dfs(data, backend=backend), lambda df: getattr(df, op)(other(df, axis), axis=axis), ) @pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) def test___divmod__(other): data = test_data["float_nan_data"] eval_general(*create_test_dfs(data), lambda df: divmod(df, other(df))) def test___rdivmod__(): data = test_data["float_nan_data"] eval_general(*create_test_dfs(data), lambda df: divmod(2, df)) @pytest.mark.parametrize( "other", [lambda df: df[: -(2**4)], lambda df: df[df.columns[0]].reset_index(drop=True)], ids=["check_missing_value", "check_different_index"], ) @pytest.mark.parametrize("fill_value", [None, 3.0]) @pytest.mark.parametrize( "op", [ *("add", 
"radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) def test_math_functions_fill_value(other, fill_value, op, request): data = test_data["int_data"] modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) expected_exception = None if "check_different_index" in request.node.callspec.id and fill_value == 3.0: expected_exception = NotImplementedError("fill_value 3.0 not supported.") eval_general( modin_df, pandas_df, lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value), expected_exception=expected_exception, # This test causes an empty slice to be generated thus triggering: # https://github.com/modin-project/modin/issues/5974 comparator_kwargs={"check_dtypes": get_current_execution() != "BaseOnPython"}, ) @pytest.mark.parametrize( "op", [ *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) def test_math_functions_level(op): modin_df = pd.DataFrame(test_data["int_data"]) modin_df.index = pandas.MultiIndex.from_tuples( [(i // 4, i // 2, i) for i in modin_df.index] ) # Defaults to pandas with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): # Operation against self for sanity check getattr(modin_df, op)(modin_df, axis=0, level=1) @pytest.mark.parametrize( "math_op, alias", [ ("truediv", "divide"), ("truediv", "div"), ("rtruediv", "rdiv"), ("mul", "multiply"), ("sub", "subtract"), ("add", "__add__"), ("radd", "__radd__"), ("truediv", "__truediv__"), ("rtruediv", "__rtruediv__"), ("floordiv", "__floordiv__"), ("rfloordiv", "__rfloordiv__"), ("mod", "__mod__"), ("rmod", "__rmod__"), ("mul", "__mul__"), ("rmul", "__rmul__"), ("pow", "__pow__"), ("rpow", "__rpow__"), ("sub", "__sub__"), ("rsub", "__rsub__"), ], ) def test_math_alias(math_op, alias): _assert_casting_functions_wrap_same_implementation( getattr(pd.DataFrame, math_op), getattr(pd.DataFrame, alias) 
) @pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_comparison(data, op, other, request): def operation(df): return getattr(df, op)(df if other == "as_left" else other) expected_exception = None if "int_data" in request.node.callspec.id and other == "a": pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019") elif "float_nan_data" in request.node.callspec.id and other == "a": expected_exception = TypeError( "Invalid comparison between dtype=float64 and str" ) eval_general( *create_test_dfs(data), operation=operation, expected_exception=expected_exception, ) @pytest.mark.skipif( StorageFormat.get() != "Pandas", reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] ) def test_virtual_partitions(left_virtual: bool, right_virtual: bool): # This test covers https://github.com/modin-project/modin/issues/4691 n: int = 1000 pd_df = pandas.DataFrame(list(range(n))) def modin_df(is_virtual): if not is_virtual: return pd.DataFrame(pd_df) result = pd.concat([pd.DataFrame([i]) for i in range(n)], ignore_index=True) # Modin should rebalance the partitions after the concat, producing virtual partitions. 
assert isinstance( result._query_compiler._modin_frame._partitions[0][0], PandasDataframeAxisPartition, ) return result df_equals(modin_df(left_virtual) + modin_df(right_virtual), pd_df + pd_df) @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_multi_level_comparison(data, op): modin_df_multi_level = pd.DataFrame(data) new_idx = pandas.MultiIndex.from_tuples( [(i // 4, i // 2, i) for i in modin_df_multi_level.index] ) modin_df_multi_level.index = new_idx # Defaults to pandas with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df_multi_level) ): # Operation against self for sanity check getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) @pytest.mark.parametrize( "frame1_data,frame2_data,expected_pandas_equals", [ pytest.param({}, {}, True, id="two_empty_dataframes"), pytest.param([[1]], [[0]], False, id="single_unequal_values"), pytest.param([[None]], [[None]], True, id="single_none_values"), pytest.param([[np.nan]], [[np.nan]], True, id="single_nan_values"), pytest.param({1: [10]}, {1.0: [10]}, True, id="different_column_types"), pytest.param({1: [10]}, {2: [10]}, False, id="different_columns"), pytest.param( pandas.DataFrame({1: [10]}, index=[1]), pandas.DataFrame({1: [10]}, index=[1.0]), True, id="different_index_types", ), pytest.param( pandas.DataFrame({1: [10]}, index=[1]), pandas.DataFrame({1: [10]}, index=[2]), False, id="different_indexes", ), pytest.param({1: [10]}, {1: [10.0]}, False, id="different_value_types"), pytest.param( [[1, 2], [3, 4]], [[1, 2], [3, 4]], True, id="equal_two_by_two_dataframes", ), pytest.param( [[1, 2], [3, 4]], [[5, 2], [3, 4]], False, id="unequal_two_by_two_dataframes", ), pytest.param( [[1, 1]], [[1]], False, id="different_row_lengths", ), pytest.param( [[1], [1]], [[1]], False, id="different_column_lengths", ), ], ) def test_equals(frame1_data, frame2_data, 
expected_pandas_equals): modin_df1 = pd.DataFrame(frame1_data) pandas_df1 = pandas.DataFrame(frame1_data) modin_df2 = pd.DataFrame(frame2_data) pandas_df2 = pandas.DataFrame(frame2_data) pandas_equals = pandas_df1.equals(pandas_df2) assert pandas_equals == expected_pandas_equals, ( "Test expected pandas to say the dataframes were" + f"{'' if expected_pandas_equals else ' not'} equal, but they were" + f"{' not' if expected_pandas_equals else ''} equal." ) assert modin_df1.equals(modin_df2) == pandas_equals assert modin_df1.equals(pandas_df2) == pandas_equals def test_equals_several_partitions(): modin_series1 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([None, 1])]) modin_series2 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([1, None])]) assert not modin_series1.equals(modin_series2) def test_equals_with_nans(): df1 = pd.DataFrame([0, 1, None], dtype="uint8[pyarrow]") df2 = pd.DataFrame([None, None, None], dtype="uint8[pyarrow]") assert not df1.equals(df2) @pytest.mark.parametrize("is_more_other_partitions", [True, False]) @pytest.mark.parametrize( "op_type", ["df_ser", "df_df", "ser_ser_same_name", "ser_ser_different_name"] ) @pytest.mark.parametrize( "is_idx_aligned", [True, False], ids=["idx_aligned", "idx_not_aligned"] ) def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partitions): data = [0, 1, 2, 3, 4, 5] modin_df1, pandas_df1 = create_test_dfs({"a": data, "b": data}) modin_df, pandas_df = modin_df1.loc[:2], pandas_df1.loc[:2] modin_df2 = pd.concat((modin_df, modin_df)) pandas_df2 = pandas.concat((pandas_df, pandas_df)) if is_more_other_partitions: modin_df2, modin_df1 = modin_df1, modin_df2 pandas_df2, pandas_df1 = pandas_df1, pandas_df2 if is_idx_aligned: if is_more_other_partitions: modin_df1.index = pandas_df1.index = pandas_df2.index else: modin_df2.index = pandas_df2.index = pandas_df1.index # Pandas don't support this case because result will contain duplicate values by col axis. 
if op_type == "df_ser" and not is_idx_aligned and is_more_other_partitions: eval_general( modin_df2, pandas_df2, lambda df: ( df / modin_df1.a if isinstance(df, pd.DataFrame) else df / pandas_df1.a ), expected_exception=ValueError( "cannot reindex on an axis with duplicate labels" ), ) return if op_type == "df_ser": modin_res = modin_df2 / modin_df1.a pandas_res = pandas_df2 / pandas_df1.a elif op_type == "df_df": modin_res = modin_df2 / modin_df1 pandas_res = pandas_df2 / pandas_df1 elif op_type == "ser_ser_same_name": modin_res = modin_df2.a / modin_df1.a pandas_res = pandas_df2.a / pandas_df1.a elif op_type == "ser_ser_different_name": modin_res = modin_df2.a / modin_df1.b pandas_res = pandas_df2.a / pandas_df1.b else: raise Exception(f"op_type: {op_type} not supported in test") df_equals(modin_res, pandas_res) def test_duplicate_indexes(): data = [0, 1, 2, 3, 4, 5] modin_df1, pandas_df1 = create_test_dfs( {"a": data, "b": data}, index=[0, 1, 2, 0, 1, 2] ) modin_df2, pandas_df2 = create_test_dfs({"a": data, "b": data}) df_equals(modin_df1 / modin_df2, pandas_df1 / pandas_df2) df_equals(modin_df1 / modin_df1, pandas_df1 / pandas_df1) @pytest.mark.parametrize("subset_operand", ["left", "right"]) def test_mismatched_col_partitions(subset_operand): data = [0, 1, 2, 3] modin_df1, pandas_df1 = create_test_dfs({"a": data, "b": data}) modin_df_tmp, pandas_df_tmp = create_test_dfs({"c": data}) modin_df2 = pd.concat([modin_df1, modin_df_tmp], axis=1) pandas_df2 = pandas.concat([pandas_df1, pandas_df_tmp], axis=1) if subset_operand == "right": modin_res = modin_df2 + modin_df1 pandas_res = pandas_df2 + pandas_df1 else: modin_res = modin_df1 + modin_df2 pandas_res = pandas_df1 + pandas_df2 df_equals(modin_res, pandas_res) @pytest.mark.parametrize("empty_operand", ["right", "left", "both"]) def test_empty_df(empty_operand): modin_df, pandas_df = create_test_dfs([0, 1, 2, 0, 1, 2]) modin_df_empty, pandas_df_empty = create_test_dfs() if empty_operand == "right": modin_res = 
modin_df + modin_df_empty pandas_res = pandas_df + pandas_df_empty elif empty_operand == "left": modin_res = modin_df_empty + modin_df pandas_res = pandas_df_empty + pandas_df else: modin_res = modin_df_empty + modin_df_empty pandas_res = pandas_df_empty + pandas_df_empty df_equals(modin_res, pandas_res) def test_add_string_to_df(): modin_df, pandas_df = create_test_dfs(["a", "b"]) eval_general(modin_df, pandas_df, lambda df: "string" + df) eval_general(modin_df, pandas_df, lambda df: df + "string") def test_add_custom_class(): # see https://github.com/modin-project/modin/issues/5236 # Test that we can add any object that is addable to pandas object data # via "+". eval_general( *create_test_dfs(test_data["int_data"]), lambda df: df + CustomIntegerForAddition(4), ) def test_non_commutative_multiply_pandas(): # The non commutative integer class implementation is tricky. Check that # multiplying such an integer with a pandas dataframe is really not # commutative. pandas_df = pandas.DataFrame([[1]], dtype=int) integer = NonCommutativeMultiplyInteger(2) assert not (integer * pandas_df).equals(pandas_df * integer) def test_non_commutative_multiply(): # This test checks that mul and rmul do different things when # multiplication is not commutative, e.g. for adding a string to a string. 
# For context see https://github.com/modin-project/modin/issues/5238 modin_df, pandas_df = create_test_dfs([1], dtype=int) integer = NonCommutativeMultiplyInteger(2) eval_general(modin_df, pandas_df, lambda s: integer * s) eval_general(modin_df, pandas_df, lambda s: s * integer) @pytest.mark.parametrize( "op", [ *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), ], ) @pytest.mark.parametrize( "val1", [ pytest.param([10, 20], id="int"), pytest.param([10, True], id="obj"), pytest.param([True, True], id="bool"), pytest.param([3.5, 4.5], id="float"), ], ) @pytest.mark.parametrize( "val2", [ pytest.param([10, 20], id="int"), pytest.param([10, True], id="obj"), pytest.param([True, True], id="bool"), pytest.param([3.5, 4.5], id="float"), pytest.param(2, id="int scalar"), pytest.param(True, id="bool scalar"), pytest.param(3.5, id="float scalar"), ], ) def test_arithmetic_with_tricky_dtypes(val1, val2, op, request): modin_df1, pandas_df1 = create_test_dfs(val1) modin_df2, pandas_df2 = ( create_test_dfs(val2) if isinstance(val2, list) else (val2, val2) ) expected_exception = None if ( "bool-bool" in request.node.callspec.id or "bool scalar-bool" in request.node.callspec.id ) and op in [ "pow", "rpow", "truediv", "rtruediv", "floordiv", "rfloordiv", ]: op_name = op[1:] if op.startswith("r") else op expected_exception = NotImplementedError( f"operator '{op_name}' not implemented for bool dtypes" ) elif ( "bool-bool" in request.node.callspec.id or "bool scalar-bool" in request.node.callspec.id ) and op in ["sub", "rsub"]: expected_exception = TypeError( "numpy boolean subtract, the `-` operator, is not supported, " + "use the bitwise_xor, the `^` operator, or the logical_xor function instead." 
) eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda dfs: getattr(dfs[0], op)(dfs[1]), expected_exception=expected_exception, ) @pytest.mark.parametrize( "data, other_data", [ ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), ], ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("match_index", [True, False]) def test_bin_op_mismatched_columns(data, other_data, axis, match_index): modin_df, pandas_df = create_test_dfs(data) other_modin_df, other_pandas_df = create_test_dfs(other_data) if axis == 0: if not match_index: modin_df.index = pandas_df.index = ["1", "2", "3"] other_modin_df.index = other_pandas_df.index = ["2", "1", "3"] eval_general( modin_df, pandas_df, lambda df: ( df.add(other_modin_df, axis=axis) if isinstance(df, pd.DataFrame) else df.add(other_pandas_df, axis=axis) ), ) ================================================ FILE: modin/tests/pandas/dataframe/test_default.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import io import warnings import matplotlib import numpy as np import pandas import pandas._libs.lib as lib import pyarrow as pa import pytest from numpy.testing import assert_array_equal from packaging.version import Version import modin.pandas as pd from modin.config import Backend, Engine, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, axis_values, create_test_dfs, create_test_series, default_to_pandas_ignore_string, df_equals, eval_general, generate_multiindex, modin_df_almost_equals_pandas, name_contains, numeric_dfs, test_data, test_data_diff_dtype, test_data_keys, test_data_large_categorical_dataframe, test_data_resample, test_data_values, ) from modin.tests.test_utils import ( current_execution_is_native, df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from modin.utils import get_current_execution NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = [ pytest.mark.filterwarnings(default_to_pandas_ignore_string), # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT pytest.mark.filterwarnings( "ignore:.*bool is now deprecated and will be removed:FutureWarning" ), pytest.mark.filterwarnings( "ignore:first is deprecated and will be removed:FutureWarning" ), pytest.mark.filterwarnings( "ignore:last is deprecated and will be removed:FutureWarning" ), ] @pytest.mark.parametrize( "op, make_args", [ ("align", lambda df: {"other": df}), ("corrwith", lambda df: {"other": df}), ("ewm", lambda df: {"com": 0.5}), ("from_dict", lambda df: {"data": None}), ("from_records", lambda df: {"data": to_pandas(df)}), ("hist", lambda df: {"column": "int_col"}), ("interpolate", None), ("mask", lambda df: {"cond": df != 0}), ("pct_change", None), ("to_xarray", None), ("flags", None), ("set_flags", lambda df: {"allows_duplicate_labels": False}), ], ) def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): operation = getattr(modin_df, op) if make_args is not None: operation(**make_args(modin_df)) else: try: operation() # `except` for non callable attributes except TypeError: pass def test_style(): data = test_data_values[0] with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.DataFrame(data).style def test_to_timestamp(): idx = pd.date_range("1/1/2012", periods=5, freq="M") df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(df) ): df.to_period().to_timestamp() @pytest.mark.parametrize( "data", test_data_values + [test_data_large_categorical_dataframe], ids=test_data_keys + ["categorical_ints"], ) def test_to_numpy(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) assert_array_equal(modin_df.values, 
pandas_df.values) @pytest.mark.skipif( StorageFormat.get() != "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): frame = pd.DataFrame(data) for partition in frame._query_compiler._modin_frame._partitions.flatten().tolist(): assert_array_equal(partition.to_pandas().values, partition.to_numpy()) def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="min") series = pd.Series([0.0, None, 2.0, 3.0], index=index) df = pd.DataFrame({"s": series}) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(df) ): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S") def test_assign(): data = test_data_values[0] modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = modin_df.assign(new_column=pd.Series(modin_df.iloc[:, 0])) pandas_result = pandas_df.assign(new_column=pandas.Series(pandas_df.iloc[:, 0])) df_equals(modin_result, pandas_result) modin_result = modin_df.assign( new_column=pd.Series(modin_df.iloc[:, 0]), new_column2=pd.Series(modin_df.iloc[:, 1]), ) pandas_result = pandas_df.assign( new_column=pandas.Series(pandas_df.iloc[:, 0]), new_column2=pandas.Series(pandas_df.iloc[:, 1]), ) df_equals(modin_result, pandas_result) def test_at_time(): i = pd.date_range("2008-01-01", periods=1000, freq="12H") modin_df = pd.DataFrame({"A": list(range(1000)), "B": list(range(1000))}, index=i) pandas_df = pandas.DataFrame( {"A": list(range(1000)), "B": list(range(1000))}, index=i ) df_equals(modin_df.at_time("12:00"), pandas_df.at_time("12:00")) df_equals(modin_df.at_time("3:00"), pandas_df.at_time("3:00")) df_equals(modin_df.T.at_time("12:00", axis=1), pandas_df.T.at_time("12:00", axis=1)) def test_between_time(): i = pd.date_range("2008-01-01", periods=1000, freq="12H") modin_df = pd.DataFrame({"A": list(range(1000)), "B": 
list(range(1000))}, index=i) pandas_df = pandas.DataFrame( {"A": list(range(1000)), "B": list(range(1000))}, index=i ) df_equals( modin_df.between_time("12:00", "17:00"), pandas_df.between_time("12:00", "17:00"), ) df_equals( modin_df.between_time("3:00", "4:00"), pandas_df.between_time("3:00", "4:00"), ) df_equals( modin_df.T.between_time("12:00", "17:00", axis=1), pandas_df.T.between_time("12:00", "17:00", axis=1), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_bfill(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) df_equals(modin_df.bfill(), pandas_df.bfill()) @pytest.mark.parametrize("limit_area", [None, "inside", "outside"]) @pytest.mark.parametrize("method", ["ffill", "bfill"]) def test_ffill_bfill_limit_area(method, limit_area): modin_df, pandas_df = create_test_dfs([1, None, 2, None]) eval_general( modin_df, pandas_df, lambda df: getattr(df, method)(limit_area=limit_area) ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_bool(data): modin_df = pd.DataFrame(data) with pytest.warns( FutureWarning, match="bool is now deprecated and will be removed" ): with pytest.raises(ValueError): modin_df.bool() modin_df.__bool__() single_bool_pandas_df = pandas.DataFrame([True]) single_bool_modin_df = pd.DataFrame([True]) assert single_bool_pandas_df.bool() == single_bool_modin_df.bool() with pytest.raises(ValueError): # __bool__ always raises this error for DataFrames single_bool_modin_df.__bool__() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_boxplot(data): modin_df = pd.DataFrame(data) assert modin_df.boxplot() == to_pandas(modin_df).boxplot() def test_combine_first(): data1 = {"A": [None, 0], "B": [None, 4]} modin_df1 = pd.DataFrame(data1) pandas_df1 = pandas.DataFrame(data1) data2 = {"A": [1, 1], "B": [3, 3]} modin_df2 = pd.DataFrame(data2) pandas_df2 = pandas.DataFrame(data2) df_equals( modin_df1.combine_first(modin_df2), 
pandas_df1.combine_first(pandas_df2), # https://github.com/modin-project/modin/issues/5959 check_dtypes=False, ) class TestCorr: @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_corr(self, method, backend): eval_general( *create_test_dfs(test_data["int_data"], backend=backend), lambda df: df.corr(method=method), ) # Modin result may slightly differ from pandas result # due to floating pointing arithmetic. eval_general( *create_test_dfs(test_data["float_nan_data"], backend=backend), lambda df: df.corr(method=method), comparator=modin_df_almost_equals_pandas, ) @pytest.mark.parametrize("min_periods", [1, 3, 5, 6]) def test_corr_min_periods(self, min_periods): # only 3 valid values (a valid value is considered a row with no NaNs) eval_general( *create_test_dfs({"a": [1, 2, 3], "b": [3, 1, 5]}), lambda df: df.corr(min_periods=min_periods), ) # only 5 valid values (a valid value is considered a row with no NaNs) eval_general( *create_test_dfs( {"a": [1, 2, 3, 4, 5, np.nan], "b": [1, 2, 1, 4, 5, np.nan]} ), lambda df: df.corr(min_periods=min_periods), ) # only 4 valid values (a valid value is considered a row with no NaNs) eval_general( *create_test_dfs( {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ), lambda df: df.corr(min_periods=min_periods), ) if StorageFormat.get() == "Pandas": # only 4 valid values located in different partitions (a valid value is considered a row with no NaNs) modin_df, pandas_df = create_test_dfs( {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) ) @pytest.mark.parametrize("numeric_only", [True, False]) def test_corr_non_numeric(self, numeric_only): if not numeric_only: 
pytest.xfail(reason="https://github.com/modin-project/modin/issues/7023") eval_general( *create_test_dfs({"a": [1, 2, 3], "b": [3, 2, 5], "c": ["a", "b", "c"]}), lambda df: df.corr(numeric_only=numeric_only), ) @pytest.mark.skipif( StorageFormat.get() != "Pandas", reason="doesn't make sense for non-partitioned executions", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition modin_df, pandas_df = create_test_dfs( {"a": [np.nan, 2, 3, 4, 5, 6], "b": [3, 4, 2, 0, 7, 8]} ) modin_df = pd.concat([modin_df.iloc[:2], modin_df.iloc[2:4], modin_df.iloc[4:]]) assert modin_df._query_compiler._modin_frame._partitions.shape == (3, 1) eval_general(modin_df, pandas_df, lambda df: df.corr()) # NaN in the last partition modin_df, pandas_df = create_test_dfs( {"a": [1, 2, 3, 4, 5, np.nan], "b": [3, 4, 2, 0, 7, 8]} ) modin_df = pd.concat([modin_df.iloc[:2], modin_df.iloc[2:4], modin_df.iloc[4:]]) assert modin_df._query_compiler._modin_frame._partitions.shape == (3, 1) eval_general(modin_df, pandas_df, lambda df: df.corr()) # NaN in two partitions modin_df, pandas_df = create_test_dfs( {"a": [np.nan, 2, 3, 4, 5, 6], "b": [3, 4, 2, 0, 7, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:2], modin_df.iloc[2:4], modin_df.iloc[4:]]) assert modin_df._query_compiler._modin_frame._partitions.shape == (3, 1) eval_general(modin_df, pandas_df, lambda df: df.corr()) # NaN in all partitions modin_df, pandas_df = create_test_dfs( {"a": [np.nan, 2, 3, np.nan, 5, 6], "b": [3, 4, 2, 0, 7, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:2], modin_df.iloc[2:4], modin_df.iloc[4:]]) assert modin_df._query_compiler._modin_frame._partitions.shape == (3, 1) eval_general(modin_df, pandas_df, lambda df: df.corr()) @pytest.mark.parametrize("min_periods", [1, 3, 5], ids=lambda x: f"min_periods={x}") @pytest.mark.parametrize("ddof", [1, 2, 4], ids=lambda x: f"ddof={x}") @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_cov(min_periods, ddof, backend): eval_general( 
*create_test_dfs(test_data["int_data"], backend=backend), lambda df: df.cov(min_periods=min_periods, ddof=ddof), comparator=df_equals, ) # Modin result may slightly differ from pandas result # due to floating pointing arithmetic. That's why we use `modin_df_almost_equals_pandas`. eval_general( *create_test_dfs(test_data["float_nan_data"], backend=backend), lambda df: df.cov(min_periods=min_periods), comparator=modin_df_almost_equals_pandas, ) @pytest.mark.parametrize("numeric_only", [True, False]) def test_cov_numeric_only(numeric_only): if not numeric_only: pytest.xfail(reason="https://github.com/modin-project/modin/issues/7023") eval_general( *create_test_dfs({"a": [1, 2, 3], "b": [3, 2, 5], "c": ["a", "b", "c"]}), lambda df: df.cov(numeric_only=numeric_only), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dot(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) col_len = len(modin_df.columns) # Test list input arr = np.arange(col_len) modin_result = modin_df.dot(arr) pandas_result = pandas_df.dot(arr) df_equals(modin_result, pandas_result) # Test bad dimensions with pytest.raises(ValueError): modin_df.dot(np.arange(col_len + 10)) # Test series input modin_series = pd.Series(np.arange(col_len), index=modin_df.columns) pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) modin_result = modin_df.dot(modin_series) pandas_result = pandas_df.dot(pandas_series) df_equals(modin_result, pandas_result) # Test dataframe input modin_result = modin_df.dot(modin_df.T) pandas_result = pandas_df.dot(pandas_df.T) df_equals(modin_result, pandas_result) # Test when input series index doesn't line up with columns with pytest.raises(ValueError): modin_df.dot(pd.Series(np.arange(col_len))) # Test case when left dataframe has size (n x 1) # and right dataframe has size (1 x n) modin_df = pd.DataFrame(modin_series) pandas_df = pandas.DataFrame(pandas_series) modin_result = modin_df.dot(modin_df.T) pandas_result 
= pandas_df.dot(pandas_df.T) df_equals(modin_result, pandas_result) # Test case when left dataframe has size (1 x 1) # and right dataframe has size (1 x n) modin_result = pd.DataFrame([1]).dot(modin_df.T) pandas_result = pandas.DataFrame([1]).dot(pandas_df.T) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_matmul(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) col_len = len(modin_df.columns) # Test list input arr = np.arange(col_len) modin_result = modin_df @ arr pandas_result = pandas_df @ arr df_equals(modin_result, pandas_result) # Test bad dimensions with pytest.raises(ValueError): modin_df @ np.arange(col_len + 10) # Test series input modin_series = pd.Series(np.arange(col_len), index=modin_df.columns) pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) modin_result = modin_df @ modin_series pandas_result = pandas_df @ pandas_series df_equals(modin_result, pandas_result) # Test dataframe input modin_result = modin_df @ modin_df.T pandas_result = pandas_df @ pandas_df.T df_equals(modin_result, pandas_result) # Test when input series index doesn't line up with columns with pytest.raises(ValueError): modin_df @ pd.Series(np.arange(col_len)) def test_first(): i = pd.date_range("2010-04-09", periods=400, freq="2D") modin_df = pd.DataFrame({"A": list(range(400)), "B": list(range(400))}, index=i) pandas_df = pandas.DataFrame( {"A": list(range(400)), "B": list(range(400))}, index=i ) with pytest.warns(FutureWarning, match="first is deprecated and will be removed"): modin_result = modin_df.first("3D") df_equals(modin_result, pandas_df.first("3D")) df_equals(modin_df.first("20D"), pandas_df.first("20D")) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_info_default_param(data): with io.StringIO() as first, io.StringIO() as second: eval_general( pd.DataFrame(data), pandas.DataFrame(data), verbose=None, max_cols=None, 
memory_usage=None, operation=lambda df, **kwargs: df.info(**kwargs), buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, ) modin_info = first.getvalue().splitlines() pandas_info = second.getvalue().splitlines() assert modin_info[0] == str(pd.DataFrame) assert pandas_info[0] == str(pandas.DataFrame) assert modin_info[1:] == pandas_info[1:] # randint data covers https://github.com/modin-project/modin/issues/5137 @pytest.mark.parametrize( "data", [test_data_values[0], np.random.randint(0, 100, (10, 10))] ) @pytest.mark.parametrize("verbose", [True, False]) @pytest.mark.parametrize("max_cols", [10, 99999999]) @pytest.mark.parametrize("memory_usage", [True, False, "deep"]) @pytest.mark.parametrize("show_counts", [True, False]) def test_info(data, verbose, max_cols, memory_usage, show_counts): with io.StringIO() as first, io.StringIO() as second: eval_general( pd.DataFrame(data), pandas.DataFrame(data), operation=lambda df, **kwargs: df.info(**kwargs), verbose=verbose, max_cols=max_cols, memory_usage=memory_usage, show_counts=show_counts, buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, ) modin_info = first.getvalue().splitlines() pandas_info = second.getvalue().splitlines() assert modin_info[0] == str(pd.DataFrame) assert pandas_info[0] == str(pandas.DataFrame) assert modin_info[1:] == pandas_info[1:] @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) @pytest.mark.parametrize("method", ["kurtosis", "kurt"]) def test_kurt_kurtosis(axis, skipna, numeric_only, method): data = test_data["float_nan_data"] eval_general( *create_test_dfs(data), lambda df: getattr(df, method)( axis=axis, skipna=skipna, numeric_only=numeric_only ), ) def test_last(): modin_index = pd.date_range("2010-04-09", periods=400, freq="2D") pandas_index = pandas.date_range("2010-04-09", periods=400, freq="2D") modin_df = pd.DataFrame( {"A": 
list(range(400)), "B": list(range(400))}, index=modin_index ) pandas_df = pandas.DataFrame( {"A": list(range(400)), "B": list(range(400))}, index=pandas_index ) with pytest.warns(FutureWarning, match="last is deprecated and will be removed"): modin_result = modin_df.last("3D") df_equals(modin_result, pandas_df.last("3D")) df_equals(modin_df.last("20D"), pandas_df.last("20D")) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "id_vars", [lambda df: df.columns[0], lambda df: df.columns[:4], None] ) @pytest.mark.parametrize( "value_vars", [lambda df: df.columns[-1], lambda df: df.columns[-4:], None] ) def test_melt(data, id_vars, value_vars): def melt(df, *args, **kwargs): return df.melt(*args, **kwargs).sort_values(["variable", "value"]) eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: melt(df, *args, **kwargs).reset_index(drop=True), id_vars=id_vars, value_vars=value_vars, ) # Functional test for BUG:7206 def test_melt_duplicate_col_names(): data = {"data": [[1, 2], [3, 4]], "columns": ["dupe", "dupe"]} def melt(df, *args, **kwargs): return df.melt(*args, **kwargs).sort_values(["variable", "value"]) eval_general( *create_test_dfs(**data), lambda df, *args, **kwargs: melt(df, *args, **kwargs).reset_index(drop=True), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "index", [lambda df: df.columns[0], lambda df: df.columns[:2], lib.no_default], ids=["one_column_index", "several_columns_index", "default"], ) @pytest.mark.parametrize( "columns", [lambda df: df.columns[len(df.columns) // 2]], ids=["one_column"] ) @pytest.mark.parametrize( "values", [lambda df: df.columns[-1], lambda df: df.columns[-2:], lib.no_default], ids=["one_column_values", "several_columns_values", "default"], ) def test_pivot(data, index, columns, values, request): current_execution = get_current_execution() if ( "one_column_values-one_column-default-float_nan_data" in 
request.node.callspec.id or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( (current_execution == "BaseOnPython" or current_execution_is_native()) and index is lib.no_default ) ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7010") expected_exception = None if index is not lib.no_default: expected_exception = ValueError( "Index contains duplicate entries, cannot reshape" ) eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: df.pivot(*args, **kwargs), index=index, columns=columns, values=values, expected_exception=expected_exception, ) @pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) @pytest.mark.parametrize( "index", [ pytest.param(lambda df: df.columns[0], id="single_index_col"), pytest.param( lambda df: [*df.columns[0:2], *df.columns[-7:-4]], id="multiple_index_cols" ), pytest.param(None, id="default_index"), ], ) @pytest.mark.parametrize( "columns", [ pytest.param(lambda df: df.columns[len(df.columns) // 2], id="single_col"), pytest.param( lambda df: [ *df.columns[(len(df.columns) // 2) : (len(df.columns) // 2 + 4)], df.columns[-7], ], id="multiple_cols", ), pytest.param(None, id="default_columns"), ], ) @pytest.mark.parametrize( "values", [ pytest.param(lambda df: df.columns[-1], id="single_value_col"), pytest.param(lambda df: df.columns[-4:-1], id="multiple_value_cols"), pytest.param(None, id="default_values"), ], ) @pytest.mark.parametrize( "aggfunc", [ pytest.param(np.mean, id="callable_tree_reduce_func"), pytest.param("mean", id="tree_reduce_func"), pytest.param("nunique", id="full_axis_func"), ], ) def test_pivot_table_data(data, index, columns, values, aggfunc, request): if ( "callable_tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "callable_tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id or 
"tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id or "full_axis_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "full_axis_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7011") md_df, pd_df = create_test_dfs(data) # when values is None the output will be huge-dimensional, # so reducing dimension of testing data at that case if values is None: md_df, pd_df = md_df.iloc[:42, :42], pd_df.iloc[:42, :42] expected_exception = None if "default_columns-default_index" in request.node.callspec.id: expected_exception = ValueError("No group keys passed!") elif ( "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): expected_exception = TypeError("'numpy.float64' object is not callable") eval_general( md_df, pd_df, operation=lambda df, *args, **kwargs: df.pivot_table( *args, **kwargs ).sort_index(axis=int(index is not None)), index=index, columns=columns, values=values, aggfunc=aggfunc, expected_exception=expected_exception, ) @pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) @pytest.mark.parametrize( "index", [ pytest.param([], id="no_index_cols"), pytest.param(lambda df: df.columns[0], id="single_index_column"), pytest.param( lambda df: [df.columns[0], df.columns[len(df.columns) // 2 - 1]], id="multiple_index_cols", ), ], ) @pytest.mark.parametrize( "columns", [ pytest.param(lambda df: df.columns[len(df.columns) // 2], id="single_column"), pytest.param( lambda df: [ *df.columns[(len(df.columns) // 2) : (len(df.columns) // 2 + 4)], df.columns[-7], ], id="multiple_cols", ), ], ) @pytest.mark.parametrize( "values", [ pytest.param(lambda df: df.columns[-1], id="single_value"), pytest.param(lambda df: 
df.columns[-4:-1], id="multiple_values"), ], ) @pytest.mark.parametrize( "aggfunc", [ pytest.param(["mean", "sum"], id="list_func"), pytest.param( lambda df: {df.columns[5]: "mean", df.columns[-5]: "sum"}, id="dict_func" ), ], ) @pytest.mark.parametrize( "margins_name", [pytest.param("Custom name", id="str_name")], ) @pytest.mark.parametrize("fill_value", [None, 0]) @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_pivot_table_margins( data, index, columns, values, aggfunc, margins_name, fill_value, backend, request, ): expected_exception = None if "dict_func" in request.node.callspec.id: expected_exception = KeyError("Column(s) ['col28', 'col38'] do not exist") eval_general( *create_test_dfs(data, backend=backend), operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs), index=index, columns=columns, values=values, aggfunc=aggfunc, margins=True, margins_name=margins_name, fill_value=fill_value, expected_exception=expected_exception, ) @pytest.mark.parametrize( "aggfunc", [ pytest.param("sum", id="MapReduce_func"), pytest.param("nunique", id="FullAxis_func"), ], ) @pytest.mark.parametrize("margins", [True, False]) def test_pivot_table_fill_value(aggfunc, margins): md_df, pd_df = create_test_dfs(test_data["int_data"]) eval_general( md_df, pd_df, operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs), index=md_df.columns[0], columns=md_df.columns[1], values=md_df.columns[2], aggfunc=aggfunc, margins=margins, fill_value=10, ) @pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) def test_pivot_table_dropna(data): eval_general( *create_test_dfs(data), operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs), index=lambda df: df.columns[0], columns=lambda df: df.columns[1], values=lambda df: df.columns[-1], dropna=False, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_plot(request, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) if 
name_contains(request.node.name, numeric_dfs): # We have to test this way because equality in plots means same object. zipped_plot_lines = zip(modin_df.plot().lines, pandas_df.plot().lines) for left, right in zipped_plot_lines: if isinstance(left.get_xdata(), np.ma.core.MaskedArray) and isinstance( right.get_xdata(), np.ma.core.MaskedArray ): assert all((left.get_xdata() == right.get_xdata()).data) else: assert np.array_equal(left.get_xdata(), right.get_xdata()) if isinstance(left.get_ydata(), np.ma.core.MaskedArray) and isinstance( right.get_ydata(), np.ma.core.MaskedArray ): assert all((left.get_ydata() == right.get_ydata()).data) else: assert np.array_equal(left.get_xdata(), right.get_xdata()) def test_replace(): modin_df = pd.DataFrame( {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} ) pandas_df = pandas.DataFrame( {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} ) modin_result = modin_df.replace({"A": 0, "B": 5}, 100) pandas_result = pandas_df.replace({"A": 0, "B": 5}, 100) df_equals(modin_result, pandas_result) modin_result = modin_df.replace({"A": {0: 100, 4: 400}}) pandas_result = pandas_df.replace({"A": {0: 100, 4: 400}}) df_equals(modin_result, pandas_result) modin_df = pd.DataFrame({"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]}) pandas_df = pandas.DataFrame( {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} ) modin_result = modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) pandas_result = pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) df_equals(modin_result, pandas_result) modin_result = modin_df.replace(regex=[r"^ba.$", "foo"], value="new") pandas_result = pandas_df.replace(regex=[r"^ba.$", "foo"], value="new") df_equals(modin_result, pandas_result) modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) df_equals(modin_df, pandas_df) @pytest.mark.parametrize("rule", ["5min", 
pandas.offsets.Hour()]) @pytest.mark.parametrize("axis", [0]) def test_resampler(rule, axis): data, index = ( test_data_resample["data"], test_data_resample["index"], ) modin_resampler = pd.DataFrame(data, index=index).resample(rule, axis=axis) pandas_resampler = pandas.DataFrame(data, index=index).resample(rule, axis=axis) assert pandas_resampler.indices == modin_resampler.indices assert pandas_resampler.groups == modin_resampler.groups df_equals( modin_resampler.get_group(name=list(modin_resampler.groups)[0]), pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]), ) @pytest.mark.parametrize("rule", ["5min"]) @pytest.mark.parametrize("axis", ["index", "columns"]) @pytest.mark.parametrize( "method", [ *("count", "sum", "std", "sem", "size", "prod", "ohlc", "quantile"), *("min", "median", "mean", "max", "last", "first", "nunique", "var"), *("interpolate", "asfreq", "nearest", "bfill", "ffill"), ], ) def test_resampler_functions(rule, axis, method): data, index = ( test_data_resample["data"], test_data_resample["index"], ) modin_df = pd.DataFrame(data, index=index) pandas_df = pandas.DataFrame(data, index=index) if axis == "columns": columns = pandas.date_range( "31/12/2000", periods=len(pandas_df.columns), freq="min" ) modin_df.columns = columns pandas_df.columns = columns expected_exception = None if method in ("interpolate", "asfreq", "nearest", "bfill", "ffill"): # It looks like pandas is preparing to completely # remove `axis` parameter for `resample` function. 
expected_exception = AssertionError("axis must be 0") eval_general( modin_df, pandas_df, lambda df: getattr(df.resample(rule, axis=axis), method)(), expected_exception=expected_exception, ) @pytest.mark.parametrize("rule", ["5min"]) @pytest.mark.parametrize("axis", ["index", "columns"]) @pytest.mark.parametrize( "method_arg", [ ("pipe", lambda x: x.max() - x.min()), ("transform", lambda x: (x - x.mean()) / x.std()), ("apply", ["sum", "mean", "max"]), ("aggregate", ["sum", "mean", "max"]), ], ) def test_resampler_functions_with_arg(rule, axis, method_arg): data, index = ( test_data_resample["data"], test_data_resample["index"], ) modin_df = pd.DataFrame(data, index=index) pandas_df = pandas.DataFrame(data, index=index) if axis == "columns": columns = pandas.date_range( "31/12/2000", periods=len(pandas_df.columns), freq="min" ) modin_df.columns = columns pandas_df.columns = columns method, arg = method_arg[0], method_arg[1] expected_exception = None if method in ("apply", "aggregate"): expected_exception = NotImplementedError("axis other than 0 is not supported") eval_general( modin_df, pandas_df, lambda df: getattr(df.resample(rule, axis=axis), method)(arg), expected_exception=expected_exception, ) @pytest.mark.parametrize("rule", ["5min"]) @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("label", ["right", "left"]) @pytest.mark.parametrize( "on", [ None, pytest.param( "DateColumn", marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base", reason="https://github.com/modin-project/modin/issues/6399", ), ), ], ) @pytest.mark.parametrize("level", [None, 1]) def test_resample_specific(rule, closed, label, on, level): data, index = ( test_data_resample["data"], test_data_resample["index"], ) modin_df = pd.DataFrame(data, index=index) pandas_df = pandas.DataFrame(data, index=index) if on is None and level is not None: index = pandas.MultiIndex.from_product( [ ["a", "b", "c", 
"d"], pandas.date_range("31/12/2000", periods=len(pandas_df) // 4, freq="h"), ] ) pandas_df.index = index modin_df.index = index else: level = None if on is not None: pandas_df[on] = pandas.date_range( "22/06/1941", periods=len(pandas_df), freq="min" ) modin_df[on] = pandas.date_range( "22/06/1941", periods=len(modin_df), freq="min" ) pandas_resampler = pandas_df.resample( rule, closed=closed, label=label, on=on, level=level, ) modin_resampler = modin_df.resample( rule, closed=closed, label=label, on=on, level=level, ) df_equals(modin_resampler.var(0), pandas_resampler.var(0)) if on is None and level is None: df_equals( modin_resampler.fillna(method="nearest"), pandas_resampler.fillna(method="nearest"), ) @pytest.mark.parametrize( "columns", [ "volume", "date", ["volume"], ("volume",), pandas.Series(["volume"]), pandas.Index(["volume"]), ["volume", "volume", "volume"], ["volume", "price", "date"], ], ids=[ "column", "only_missed_column", "list", "tuple", "series", "index", "duplicate_column", "missed_column", ], ) def test_resample_getitem(columns, request): index = pandas.date_range("1/1/2013", periods=9, freq="min") data = { "price": range(9), "volume": range(10, 19), } expected_exception = None if "only_missed_column" in request.node.callspec.id: expected_exception = KeyError("Column not found: date") elif "missed_column" in request.node.callspec.id: expected_exception = KeyError("Columns not found: 'date'") eval_general( *create_test_dfs(data, index=index), lambda df: df.resample("3min")[columns].mean(), expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray", "has_duplicates"]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) def test_shift(data, index, axis, periods): modin_df, pandas_df = create_test_dfs(data) if index == "ndarray": data_column_length = 
len(data[next(iter(data))]) modin_df.index = pandas_df.index = np.arange(2, data_column_length + 2) elif index == "has_duplicates": modin_df.index = pandas_df.index = list(modin_df.index[:-3]) + [0, 1, 2] df_equals( modin_df.shift(periods=periods, axis=axis), pandas_df.shift(periods=periods, axis=axis), ) df_equals( modin_df.shift(periods=periods, axis=axis, fill_value=777), pandas_df.shift(periods=periods, axis=axis, fill_value=777), ) @pytest.mark.parametrize("is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]) @pytest.mark.parametrize("is_multi_col", [True, False], ids=["col_multi", "col_index"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_stack(data, is_multi_idx, is_multi_col): pandas_df = pandas.DataFrame(data) modin_df = pd.DataFrame(data) if is_multi_idx: if len(pandas_df.index) == 256: index = pd.MultiIndex.from_product( [ ["a", "b", "c", "d"], ["x", "y", "z", "last"], ["i", "j", "k", "index"], [1, 2, 3, 4], ] ) elif len(pandas_df.index) == 100: index = pd.MultiIndex.from_product( [ ["x", "y", "z", "last"], ["a", "b", "c", "d", "f"], ["i", "j", "k", "l", "index"], ] ) else: index = pd.MultiIndex.from_tuples( [(i, i * 2, i * 3) for i in range(len(pandas_df.index))] ) else: index = pandas_df.index if is_multi_col: if len(pandas_df.columns) == 64: columns = pd.MultiIndex.from_product( [["A", "B", "C", "D"], ["xx", "yy", "zz", "LAST"], [10, 20, 30, 40]] ) elif len(pandas_df.columns) == 100: columns = pd.MultiIndex.from_product( [ ["xx", "yy", "zz", "LAST"], ["A", "B", "C", "D", "F"], ["I", "J", "K", "L", "INDEX"], ] ) else: columns = pd.MultiIndex.from_tuples( [(i, i * 2, i * 3) for i in range(len(pandas_df.columns))] ) else: columns = pandas_df.columns pandas_df.columns = columns pandas_df.index = index modin_df.columns = columns modin_df.index = index df_equals(modin_df.stack(), pandas_df.stack()) if is_multi_col: df_equals(modin_df.stack(level=0), pandas_df.stack(level=0)) df_equals(modin_df.stack(level=[0, 1]), 
pandas_df.stack(level=[0, 1])) df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])) @pytest.mark.parametrize("sort", [True, False]) def test_stack_sort(sort): # Example frame slightly modified from pandas docs to be unsorted cols = pd.MultiIndex.from_tuples([("weight", "pounds"), ("weight", "kg")]) modin_df, pandas_df = create_test_dfs( [[1, 2], [2, 4]], index=["cat", "dog"], columns=cols ) df_equals(modin_df.stack(sort=sort), pandas_df.stack(sort=sort)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1]) @pytest.mark.parametrize("axis2", [0, 1]) def test_swapaxes(data, axis1, axis2): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_result = pandas_df.swapaxes(axis1, axis2) modin_result = modin_df.swapaxes(axis1, axis2) df_equals(modin_result, pandas_result) def test_swapaxes_axes_names(): modin_df = pd.DataFrame(test_data_values[0]) modin_result1 = modin_df.swapaxes(0, 1) modin_result2 = modin_df.swapaxes("columns", "index") df_equals(modin_result1, modin_result2) def test_swaplevel(): data = np.random.randint(1, 100, 12) modin_df = pd.DataFrame( data, index=pd.MultiIndex.from_tuples( [ (num, letter, color) for num in range(1, 3) for letter in ["a", "b", "c"] for color in ["Red", "Green"] ], names=["Number", "Letter", "Color"], ), ) pandas_df = pandas.DataFrame( data, index=pandas.MultiIndex.from_tuples( [ (num, letter, color) for num in range(1, 3) for letter in ["a", "b", "c"] for color in ["Red", "Green"] ], names=["Number", "Letter", "Color"], ), ) df_equals( modin_df.swaplevel("Number", "Color"), pandas_df.swaplevel("Number", "Color"), ) df_equals(modin_df.swaplevel(), pandas_df.swaplevel()) df_equals(modin_df.swaplevel(0, 1), pandas_df.swaplevel(0, 1)) def test_take(): modin_df = pd.DataFrame( [ ("falcon", "bird", 389.0), ("parrot", "bird", 24.0), ("lion", "mammal", 80.5), ("monkey", "mammal", np.nan), ], columns=["name", "class", "max_speed"], 
index=[0, 2, 3, 1], ) pandas_df = pandas.DataFrame( [ ("falcon", "bird", 389.0), ("parrot", "bird", 24.0), ("lion", "mammal", 80.5), ("monkey", "mammal", np.nan), ], columns=["name", "class", "max_speed"], index=[0, 2, 3, 1], ) df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3])) df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_records(data): # `to_records` doesn't work when `index` is among column names eval_general( *create_test_dfs(data), lambda df: ( df.dropna().drop("index", axis=1) if "index" in df.columns else df.dropna() ).to_records(), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_string(data): eval_general( *create_test_dfs(data), lambda df: df.to_string(), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_truncate(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) before = 1 after = len(modin_df - 3) df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) before = 1 after = 3 df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) before = modin_df.columns[1] after = modin_df.columns[-3] try: pandas_result = pandas_df.truncate(before, after, axis=1) except Exception as err: with pytest.raises(type(err)): modin_df.truncate(before, after, axis=1) else: modin_result = modin_df.truncate(before, after, axis=1) df_equals(modin_result, pandas_result) before = modin_df.columns[1] after = modin_df.columns[3] try: pandas_result = pandas_df.truncate(before, after, axis=1) except Exception as err: with pytest.raises(type(err)): modin_df.truncate(before, after, axis=1) else: modin_result = modin_df.truncate(before, after, axis=1) df_equals(modin_result, pandas_result) before = None after = None df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) try: pandas_result = pandas_df.truncate(before, after, 
axis=1) except Exception as err: with pytest.raises(type(err)): modin_df.truncate(before, after, axis=1) else: modin_result = modin_df.truncate(before, after, axis=1) df_equals(modin_result, pandas_result) def test_truncate_before_greater_than_after(): df = pd.DataFrame([[1, 2, 3]]) with pytest.raises(ValueError, match="Truncate: 1 must be after 2"): df.truncate(before=2, after=1) def test_tz_convert(): modin_idx = pd.date_range( "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" ) pandas_idx = pandas.date_range( "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" ) data = np.random.randint(0, 100, size=(len(modin_idx), 4)) modin_df = pd.DataFrame(data, index=modin_idx) pandas_df = pandas.DataFrame(data, index=pandas_idx) modin_result = modin_df.tz_convert("UTC", axis=0) pandas_result = pandas_df.tz_convert("UTC", axis=0) df_equals(modin_result, pandas_result) modin_multi = pd.MultiIndex.from_arrays([modin_idx, range(len(modin_idx))]) pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, range(len(modin_idx))]) modin_series = pd.DataFrame(data, index=modin_multi) pandas_series = pandas.DataFrame(data, index=pandas_multi) df_equals( modin_series.tz_convert("UTC", axis=0, level=0), pandas_series.tz_convert("UTC", axis=0, level=0), ) def test_tz_localize(): idx = pd.date_range("1/1/2012", periods=400, freq="2D") data = np.random.randint(0, 100, size=(len(idx), 4)) modin_df = pd.DataFrame(data, index=idx) pandas_df = pandas.DataFrame(data, index=idx) df_equals(modin_df.tz_localize("UTC", axis=0), pandas_df.tz_localize("UTC", axis=0)) df_equals( modin_df.tz_localize("America/Los_Angeles", axis=0), pandas_df.tz_localize("America/Los_Angeles", axis=0), ) @pytest.mark.parametrize("is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]) @pytest.mark.parametrize("is_multi_col", [True, False], ids=["col_multi", "col_index"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_unstack(data, is_multi_idx, is_multi_col): 
modin_df, pandas_df = create_test_dfs(data) if is_multi_idx: index = generate_multiindex(len(pandas_df), nlevels=4, is_tree_like=True) else: index = pandas_df.index if is_multi_col: columns = generate_multiindex( len(pandas_df.columns), nlevels=3, is_tree_like=True ) else: columns = pandas_df.columns pandas_df.columns = modin_df.columns = columns pandas_df.index = modin_df.index = index df_equals(modin_df.unstack(), pandas_df.unstack()) df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) if is_multi_idx: df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) df_equals(modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])) df_equals( modin_df.unstack(level=[0, 1, 2, 3]), pandas_df.unstack(level=[0, 1, 2, 3]) ) @pytest.mark.parametrize( "multi_col", ["col_multi_tree", "col_multi_not_tree", "col_index"] ) @pytest.mark.parametrize( "multi_idx", ["idx_multi_tree", "idx_multi_not_tree", "idx_index"] ) def test_unstack_multiindex_types(multi_col, multi_idx): MAX_NROWS = MAX_NCOLS = 36 pandas_df = pandas.DataFrame(test_data["int_data"]).iloc[:MAX_NROWS, :MAX_NCOLS] modin_df = pd.DataFrame(test_data["int_data"]).iloc[:MAX_NROWS, :MAX_NCOLS] def get_new_index(index, cond): if cond == "col_multi_tree" or cond == "idx_multi_tree": return generate_multiindex(len(index), nlevels=3, is_tree_like=True) elif cond == "col_multi_not_tree" or cond == "idx_multi_not_tree": return generate_multiindex(len(index), nlevels=3) else: return index pandas_df.columns = modin_df.columns = get_new_index(pandas_df.columns, multi_col) pandas_df.index = modin_df.index = get_new_index(pandas_df.index, multi_idx) df_equals(modin_df.unstack(), pandas_df.unstack()) df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) if multi_idx != "idx_index": df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) df_equals(modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])) @pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) @pytest.mark.parametrize("copy_kwargs", ({"copy": True}, {"copy": None}, {})) @pytest.mark.parametrize( "get_array, get_array_name", ( (lambda df, copy_kwargs: df.__array__(**copy_kwargs), "__array__"), (lambda df, copy_kwargs: np.array(df, **copy_kwargs), "np.array"), ), ) def test___array__(data, copy_kwargs, get_array, get_array_name): if ( get_array_name == "np.array" and Version(np.__version__) < Version("2") and "copy" in copy_kwargs and copy_kwargs["copy"] is None ): pytest.skip(reason="np.array does not support copy=None before numpy 2.0") assert_array_equal(*(get_array(df, copy_kwargs) for df in create_test_dfs(data))) @pytest.mark.xfail( condition=Backend.get() != "Pandas", raises=AssertionError, reason="https://github.com/modin-project/modin/issues/4650", ) def test___array__copy_false_creates_view(): def do_in_place_update_via_copy(df): array = np.array(df, copy=False) array[0, 0] += 1 eval_general( *create_test_dfs([[11]]), do_in_place_update_via_copy, __inplace__=True ) @pytest.mark.parametrize("data", [[False], [True], [1, 2]]) def test___bool__(data): eval_general( *create_test_dfs(data), lambda df: df.__bool__(), expected_exception=ValueError( "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." ), ) @pytest.mark.parametrize( "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] ) def test_hasattr_sparse(is_sparse_data): modin_df, pandas_df = ( create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values())) if is_sparse_data else create_test_dfs(test_data["float_nan_data"]) ) eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) def test_setattr_axes(): # Test that setting .index or .columns does not warn df = pd.DataFrame([[1, 2], [3, 4]]) with warnings.catch_warnings(): if get_current_execution() != "BaseOnPython": # In BaseOnPython, setting columns raises a warning because get_axis # defaults to pandas. 
warnings.simplefilter("error") df.index = ["foo", "bar"] # Check that ensure_index was called pd.testing.assert_index_equal(df.index, pandas.Index(["foo", "bar"])) df.columns = [9, 10] pd.testing.assert_index_equal(df.columns, pandas.Index([9, 10])) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_attrs(data): modin_df, pandas_df = create_test_dfs(data) eval_general(modin_df, pandas_df, lambda df: df.attrs) def test_df_from_series_with_tuple_name(): # Tests that creating a DataFrame from a series with a tuple name results in # a DataFrame with MultiIndex columns. pandas_result = pandas.DataFrame(pandas.Series(name=("a", 1))) # 1. Creating a Modin DF from native pandas Series df_equals(pd.DataFrame(pandas.Series(name=("a", 1))), pandas_result) # 2. Creating a Modin DF from Modin Series df_equals(pd.DataFrame(pd.Series(name=("a", 1))), pandas_result) def test_large_df_warns_distributing_takes_time(): # https://github.com/modin-project/modin/issues/6574 regex = r"Distributing (.*) object\. This may take some time\." with pytest.warns(UserWarning, match=regex): pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 10))) def test_large_series_warns_distributing_takes_time(): # https://github.com/modin-project/modin/issues/6574 regex = r"Distributing (.*) object\. This may take some time\." with pytest.warns(UserWarning, match=regex): pd.Series(np.random.randint(1_000_000, size=(2_500_000))) def test_df_does_not_warn_distributing_takes_time(): # https://github.com/modin-project/modin/issues/6574 regex = r"Distributing (.*) object\. This may take some time\." with warnings.catch_warnings(): warnings.filterwarnings("error", regex, UserWarning) pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 9))) def test_series_does_not_warn_distributing_takes_time(): # https://github.com/modin-project/modin/issues/6574 regex = r"Distributing (.*) object\. This may take some time\." 
with warnings.catch_warnings(): warnings.filterwarnings("error", regex, UserWarning) pd.Series(np.random.randint(1_000_000, size=(2_400_000))) @pytest.mark.parametrize("dtype", [np.int64, pd.ArrowDtype(pa.int64())]) def test_empty_df_dtypes(dtype): df = pd.DataFrame({"A": []}, dtype=dtype) assert df.dtypes["A"] == dtype def test_array_ufunc(): modin_df, pandas_df = create_test_dfs([[1, 2], [3, 4]]) eval_general(modin_df, pandas_df, np.sqrt) modin_ser, pandas_ser = create_test_series([1, 2, 3, 4, 9]) eval_general(modin_ser, pandas_ser, np.sqrt) ================================================ FILE: modin/tests/pandas/dataframe/test_indexing.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import os import sys import matplotlib import numpy as np import pandas import pytest from pandas._testing import ensure_clean import modin.pandas as pd from modin.config import MinRowPartitionSize, NPartitions from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( NROWS, RAND_HIGH, RAND_LOW, arg_keys, assert_dtypes_equal, axis_keys, axis_values, create_test_dfs, default_to_pandas_ignore_string, df_equals, eval_general, generate_multiindex, int_arg_keys, int_arg_values, name_contains, test_data, test_data_keys, test_data_values, ) from modin.utils import get_current_execution NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. # TODO(https://github.com/modin-project/modin/issues/3655): catch all instances # of defaulting to pandas. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=None): if loc is not None: col = pd_df.columns[loc] value_getter = value if callable(value) else (lambda *args, **kwargs: value) eval_general( md_df, pd_df, lambda df: df.__setitem__(col, value_getter(df)), __inplace__=True, expected_exception=expected_exception, ) def eval_loc(md_df, pd_df, value, key): if isinstance(value, tuple): assert len(value) == 2 # case when value for pandas different md_value, pd_value = value else: md_value, pd_value = value, value eval_general( md_df, pd_df, lambda df: df.loc.__setitem__( key, pd_value if isinstance(df, pandas.DataFrame) else md_value ), __inplace__=True, ) @pytest.mark.parametrize( "dates", [ ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], ], ) @pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) def test_asof_with_nan(dates, subset): data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]} index = pd.DatetimeIndex( [ "2018-02-27 09:01:00", "2018-02-27 09:02:00", "2018-02-27 09:03:00", "2018-02-27 09:04:00", "2018-02-27 09:05:00", ] ) modin_where = pd.DatetimeIndex(dates) pandas_where = pandas.DatetimeIndex(dates) compare_asof(data, index, modin_where, pandas_where, subset) @pytest.mark.parametrize( "dates", [ ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], ], ) @pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) def test_asof_without_nan(dates, subset): data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]} index = pd.DatetimeIndex( [ "2018-02-27 09:01:00", "2018-02-27 09:02:00", "2018-02-27 09:03:00", "2018-02-27 09:04:00", "2018-02-27 09:05:00", ] ) modin_where = pd.DatetimeIndex(dates) pandas_where = pandas.DatetimeIndex(dates) compare_asof(data, index, modin_where, pandas_where, subset) @pytest.mark.parametrize( "lookup", 
# Values of the "lookup" parametrization of `test_asof_large`, opened on the previous line.
    [[60, 70, 90], [60.5, 70.5, 100]],
)
@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None])
def test_asof_large(lookup, subset):
    """Run ``compare_asof`` on ``test_data["float_nan_data"]`` with a ``range(NROWS)`` index."""
    data = test_data["float_nan_data"]
    index = list(range(NROWS))
    modin_where = pd.Index(lookup)
    pandas_where = pandas.Index(lookup)
    compare_asof(data, index, modin_where, pandas_where, subset)


def compare_asof(
    data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset
):
    """
    Compare ``DataFrame.asof`` between Modin and pandas for several `where` forms.

    Frames are built from `data` and `index`; ``asof`` is then called with the
    index object, its ``.values``, a list of those values and the first value.
    """
    modin_df = pd.DataFrame(data, index=index)
    pandas_df = pandas.DataFrame(data, index=index)
    df_equals(
        modin_df.asof(modin_where, subset=subset),
        pandas_df.asof(pandas_where, subset=subset),
    )
    df_equals(
        modin_df.asof(modin_where.values, subset=subset),
        pandas_df.asof(pandas_where.values, subset=subset),
    )
    df_equals(
        modin_df.asof(list(modin_where.values), subset=subset),
        pandas_df.asof(list(pandas_where.values), subset=subset),
    )
    df_equals(
        modin_df.asof(modin_where.values[0], subset=subset),
        pandas_df.asof(pandas_where.values[0], subset=subset),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_first_valid_index(data):
    """``first_valid_index()`` must agree between Modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    assert modin_df.first_valid_index() == (pandas_df.first_valid_index())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys))
def test_head(data, n):
    """Compare ``head`` on the full frame and on a ``.loc`` column selection."""
    # Test normal dataframe head
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.head(n), pandas_df.head(n))
    # Asking for one more row than the frame holds.
    df_equals(modin_df.head(len(modin_df) + 1), pandas_df.head(len(pandas_df) + 1))

    # Test head when we call it from a QueryCompilerView
    modin_result = modin_df.loc[:, ["col1", "col3", "col3"]].head(n)
    pandas_result = pandas_df.loc[:, ["col1", "col3", "col3"]].head(n)
    df_equals(modin_result, pandas_result)


@pytest.mark.skip(reason="Defaulting to Pandas")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iat(data):
    """Skipped: expects calling ``modin_df.iat()`` to raise ``NotImplementedError``."""
    modin_df = pd.DataFrame(data)

    with pytest.raises(NotImplementedError):
        modin_df.iat()


@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iloc(request, data):
    """Compare ``iloc`` reads and writes; for the empty dataset expect ``IndexError``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    if not name_contains(request.node.name, ["empty_data"]):
        # Scalar
        np.testing.assert_equal(modin_df.iloc[0, 1], pandas_df.iloc[0, 1])

        # Series
        df_equals(modin_df.iloc[0], pandas_df.iloc[0])
        df_equals(modin_df.iloc[1:, 0], pandas_df.iloc[1:, 0])
        df_equals(modin_df.iloc[1:2, 0], pandas_df.iloc[1:2, 0])

        # DataFrame
        df_equals(modin_df.iloc[[1, 2]], pandas_df.iloc[[1, 2]])
        # See issue #80
        # df_equals(modin_df.iloc[[1, 2], [1, 0]], pandas_df.iloc[[1, 2], [1, 0]])
        df_equals(modin_df.iloc[1:2, 0:2], pandas_df.iloc[1:2, 0:2])

        # Issue #43
        modin_df.iloc[0:3, :]

        # Write Item
        modin_df.iloc[[1, 2]] = 42
        pandas_df.iloc[[1, 2]] = 42
        df_equals(modin_df, pandas_df)

        # Fresh frames: copy row 1 into row 0.
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)
        modin_df.iloc[0] = modin_df.iloc[1]
        pandas_df.iloc[0] = pandas_df.iloc[1]
        df_equals(modin_df, pandas_df)

        # Fresh frames: copy column 1 into column 0.
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)
        modin_df.iloc[:, 0] = modin_df.iloc[:, 1]
        pandas_df.iloc[:, 0] = pandas_df.iloc[:, 1]
        df_equals(modin_df, pandas_df)

        # From issue #1775
        df_equals(
            modin_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])],
            pandas_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])],
        )

        # Read values, selecting rows with callable and a column with a scalar.
# Continuation of `test_iloc` (non-empty-data branch): the read announced by the
# comment that closes the previous line.
        df_equals(
            pandas_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5]), 0],
            modin_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5]), 0],
        )
    else:
        # Test ids containing "empty_data": positional access must fail.
        with pytest.raises(IndexError):
            modin_df.iloc[0, 1]


@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_index(data):
    """Compare ``.index`` before and after replacing it with stringified labels."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    df_equals(modin_df.index, pandas_df.index)
    modin_df_cp = modin_df.copy()
    pandas_df_cp = pandas_df.copy()

    modin_df_cp.index = [str(i) for i in modin_df_cp.index]
    pandas_df_cp.index = [str(i) for i in pandas_df_cp.index]

    df_equals(modin_df_cp.index, pandas_df_cp.index)


@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_indexing_duplicate_axis(data):
    """Compare ``loc``/``iloc`` reads on frames whose index contains duplicates."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    # `i // 3` gives every label up to three times.
    modin_df.index = pandas_df.index = [i // 3 for i in range(len(modin_df))]
    assert any(modin_df.index.duplicated())
    assert any(pandas_df.index.duplicated())

    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4])
    df_equals(
        modin_df.loc[0, modin_df.columns[0:4]],
        pandas_df.loc[0, pandas_df.columns[0:4]],
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "key_func",
    [
        # test for the case from https://github.com/modin-project/modin/issues/4308
        lambda df: "non_existing_column",
        lambda df: df.columns[0],
        lambda df: df.index,
        lambda df: [df.index, df.columns[0]],
        lambda df: (
            pandas.Series(list(range(len(df.index))))
            if isinstance(df, pandas.DataFrame)
            else pd.Series(list(range(len(df))))
        ),
    ],
    ids=[
        "non_existing_column",
        "first_column_name",
        "original_index",
        "list_of_index_and_first_column_name",
        "series_of_integers",
    ],
)
@pytest.mark.parametrize(
    "drop_kwargs",
    [{"drop": True}, {"drop": False}, {}],
    ids=["drop_True", "drop_False", "no_drop_param"],
)
def test_set_index(data, key_func, drop_kwargs,
# Last parameter of `test_set_index`, whose signature opens on the previous line.
request):
    """Compare ``set_index(key_func(df), **drop_kwargs)`` between Modin and pandas."""
    if (
        "list_of_index_and_first_column_name" in request.node.name
        and "drop_False" in request.node.name
    ):
        pytest.xfail(
            reason="KeyError: https://github.com/modin-project/modin/issues/5636"
        )
    expected_exception = None
    # Only the "non_existing_column" parametrization is expected to raise.
    if "non_existing_column" in request.node.callspec.id:
        expected_exception = KeyError(
            "None of ['non_existing_column'] are in the columns"
        )
    eval_general(
        *create_test_dfs(data),
        lambda df: df.set_index(key_func(df), **drop_kwargs),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("index", ["a", ["a", ("b", "")]])
def test_set_index_with_multiindex(index):
    """Compare ``set_index`` on frames whose columns are built from two label lists."""
    # see #5186 for details
    kwargs = {"columns": [["a", "b", "c", "d"], ["", "", "x", "y"]]}
    modin_df, pandas_df = create_test_dfs(np.random.rand(2, 4), **kwargs)
    eval_general(modin_df, pandas_df, lambda df: df.set_index(index))


@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_keys(data):
    """``keys()`` must agree between Modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.keys(), pandas_df.keys())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc(data):
    """Compare a broad set of ``.loc`` reads and writes between Modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    key1 = modin_df.columns[0]
    key2 = modin_df.columns[1]
    # Scalar
    df_equals(modin_df.loc[0, key1], pandas_df.loc[0, key1])

    # Series
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1])
    df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1])
    df_equals(modin_df.loc[:, key1], pandas_df.loc[:, key1])

    # DataFrame
    df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]])

    # Boolean masks: every third row and every fifth column.
    indices = [i % 3 == 0 for i in range(len(modin_df.index))]
    columns = [i % 5 == 0 for i in range(len(modin_df.columns))]

    # Key is a list of booleans
    modin_result = modin_df.loc[indices, columns]
    pandas_result = pandas_df.loc[indices, columns]
    df_equals(modin_result, pandas_result)

    # Key is a Modin or pandas series of booleans
    df_equals(
        modin_df.loc[pd.Series(indices),
# Second half of the boolean-Series `.loc` comparison in `test_loc`, opened on
# the previous line.
        pd.Series(columns, index=modin_df.columns)],
        pandas_df.loc[
            pandas.Series(indices), pandas.Series(columns, index=modin_df.columns)
        ],
    )

    modin_result = modin_df.loc[:, columns]
    pandas_result = pandas_df.loc[:, columns]
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.loc[indices]
    pandas_result = pandas_df.loc[indices]
    df_equals(modin_result, pandas_result)

    # See issue #80
    # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']])
    df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2])

    # From issue #421
    df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]])
    df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :])

    # From issue #1023
    key1 = modin_df.columns[0]
    key2 = modin_df.columns[-2]
    df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2])

    # Write Item
    # Writes go to copies so the original frames stay available for later reads.
    modin_df_copy = modin_df.copy()
    pandas_df_copy = pandas_df.copy()
    modin_df_copy.loc[[1, 2]] = 42
    pandas_df_copy.loc[[1, 2]] = 42
    df_equals(modin_df_copy, pandas_df_copy)

    # Write an item, selecting rows with a callable.
    modin_df_copy2 = modin_df.copy()
    pandas_df_copy2 = pandas_df.copy()
    modin_df_copy2.loc[lambda df: df[key1].isin(list(range(1000)))] = 42
    pandas_df_copy2.loc[lambda df: df[key1].isin(list(range(1000)))] = 42
    df_equals(modin_df_copy2, pandas_df_copy2)

    # Write an item, selecting rows with a callable and a column with a scalar.
# Continuation of `test_loc`: the write announced by the comment that closes the
# previous line.
    modin_df_copy3 = modin_df.copy()
    pandas_df_copy3 = pandas_df.copy()
    modin_df_copy3.loc[lambda df: df[key1].isin(list(range(1000))), key1] = 42
    pandas_df_copy3.loc[lambda df: df[key1].isin(list(range(1000))), key1] = 42
    df_equals(modin_df_copy3, pandas_df_copy3)

    # Disabled for `BaseOnPython` because of the issue with `getitem_array`:
    # https://github.com/modin-project/modin/issues/3701
    # NOTE(review): only the "issue #1775" comparison is assumed to be nested
    # under this `if`; the source formatting was lost -- TODO confirm.
    if get_current_execution() != "BaseOnPython":
        # From issue #1775
        df_equals(
            modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
            pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
        )

    # Read values, selecting rows with a callable and a column with a scalar.
    df_equals(
        pandas_df.loc[lambda df: df[key1].isin(list(range(1000))), key1],
        modin_df.loc[lambda df: df[key1].isin(list(range(1000))), key1],
    )

    # From issue #1374
    with pytest.raises(KeyError):
        modin_df.loc["NO_EXIST"]


@pytest.mark.parametrize(
    "key_getter, value_getter",
    [
        # Each case pairs a two-label key along `axis` with a value that is
        # 1, 3 or 2 labels long along the same axis.
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :1] if axis else df.iloc[:1, :],
            id="len(key)_>_len(value)",
        ),
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :3] if axis else df.iloc[:3, :],
            id="len(key)_<_len(value)",
        ),
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :2] if axis else df.iloc[:2, :],
            id="len(key)_==_len(value)",
        ),
    ],
)
@pytest.mark.parametrize("key_axis", [0, 1])
@pytest.mark.parametrize("reverse_value_index", [True, False])
@pytest.mark.parametrize("reverse_value_columns", [True, False])
def test_loc_4456(
    key_getter, value_getter, key_axis, reverse_value_index, reverse_value_columns
):
    """Assign a frame through ``.loc`` where key and value lengths may differ."""
    data = test_data["float_nan_data"]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    key = key_getter(pandas_df, key_axis)

    # `df.loc`
# doesn't work right for range-like indexers. Converting them to a list.
    # (The line above completes the "`df.loc`" comment that closes the previous
    # line; this is still the body of `test_loc_4456`.)
    # https://github.com/modin-project/modin/issues/4497
    if is_range_like(key[0]):
        key = (list(key[0]), key[1])
    if is_range_like(key[1]):
        key = (key[0], list(key[1]))

    # Random integer frame with the same shape and labels as the target.
    value = pandas.DataFrame(
        np.random.randint(0, 100, size=pandas_df.shape),
        index=pandas_df.index,
        columns=pandas_df.columns,
    )
    pdf_value = value_getter(value, key_axis)
    mdf_value = value_getter(pd.DataFrame(value), key_axis)

    if reverse_value_index:
        pdf_value = pdf_value.reindex(index=pdf_value.index[::-1])
        mdf_value = mdf_value.reindex(index=mdf_value.index[::-1])
    if reverse_value_columns:
        pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1])
        mdf_value = mdf_value.reindex(columns=mdf_value.columns[::-1])

    # First with a pandas value for both frames, then with a Modin value for
    # the Modin frame.
    eval_loc(modin_df, pandas_df, pdf_value, key)
    eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key)


def test_loc_6774():
    """Use ``.loc`` to add new columns from lists and from a 2-D array."""
    modin_df, pandas_df = create_test_dfs(
        {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
    )
    pandas_df.loc[:, "c"] = [10, 20, 30, 40, 51]
    modin_df.loc[:, "c"] = [10, 20, 30, 40, 51]
    df_equals(modin_df, pandas_df)

    # New column "y" assigned only for row labels 2 and above.
    pandas_df.loc[2:, "y"] = [30, 40, 51]
    modin_df.loc[2:, "y"] = [30, 40, 51]
    df_equals(modin_df, pandas_df)

    # Existing columns "b"/"c" plus new column "d"; the source rows have
    # unequal lengths. Both sides receive the `.values` array of a Modin frame.
    pandas_df.loc[:, ["b", "c", "d"]] = (
        pd.DataFrame([[10, 20, 30, 40, 50], [10, 20, 30, 40], [10, 20, 30]])
        .transpose()
        .values
    )
    modin_df.loc[:, ["b", "c", "d"]] = (
        pd.DataFrame([[10, 20, 30, 40, 50], [10, 20, 30, 40], [10, 20, 30]])
        .transpose()
        .values
    )
    df_equals(modin_df, pandas_df)


def test_loc_5829():
    """Assign a 2x3 array to three new columns of an ``object``-dtype frame."""
    data = {"a": [1, 2, 3, 4, 5], "b": [11, 12, 13, 14, 15]}
    modin_df = pd.DataFrame(data, dtype=object)
    pandas_df = pandas.DataFrame(data, dtype=object)
    eval_loc(
        modin_df,
        pandas_df,
        value=np.array([[24, 34, 44], [25, 35, 45]]),
        key=([3, 4], ["c", "d", "e"]),
    )


def test_loc_7135():
    """Assign a list to the row label ``len(df)`` of a (2**16, 2**8) frame."""
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    modin_df, pandas_df = create_test_dfs(data)
    key = len(pandas_df)
    eval_loc(
        modin_df,
        pandas_df,
        value=list(range(2**8)),
        key=key,
    )


# This tests the bug from
# https://github.com/modin-project/modin/issues/3736
# (The URL above completes the "This tests the bug from" comment that closes
# the previous line.)
def test_loc_setting_single_categorical_column():
    """Assign a scalar through ``.loc`` into a column created with ``dtype="category"``."""
    modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
    pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
    modin_df.loc[1:3, "status"] = "a"
    pandas_df.loc[1:3, "status"] = "a"
    df_equals(modin_df, pandas_df)


def test_loc_multi_index():
    """Compare ``.loc`` reads on a CSV with a 4-row header and on a MultiIndex frame."""
    modin_df = pd.read_csv(
        "modin/tests/pandas/data/blah.csv", header=[0, 1, 2, 3], index_col=0
    )
    pandas_df = pandas.read_csv(
        "modin/tests/pandas/data/blah.csv", header=[0, 1, 2, 3], index_col=0
    )

    df_equals(modin_df.loc[1], pandas_df.loc[1])
    df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"])
    df_equals(
        modin_df.loc[1, ("Presidents", "Pure mentions")],
        pandas_df.loc[1, ("Presidents", "Pure mentions")],
    )
    # A full four-level column key is compared with `==` rather than `df_equals`.
    assert (
        modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")]
        == pandas_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")]
    )
    df_equals(modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2), "Presidents"])

    tuples = [
        ("bar", "one"),
        ("bar", "two"),
        ("bar", "three"),
        ("bar", "four"),
        ("baz", "one"),
        ("baz", "two"),
        ("baz", "three"),
        ("baz", "four"),
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("foo", "four"),
        ("qux", "one"),
        ("qux", "two"),
        ("qux", "three"),
        ("qux", "four"),
    ]

    modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
    pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"])
    frame_data = np.random.randint(0, 100, size=(16, 100))
    modin_df = pd.DataFrame(
        frame_data,
        index=modin_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    pandas_df = pandas.DataFrame(
        frame_data,
        index=pandas_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
    assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"]
    df_equals(
        modin_df.loc["bar", ("col1", "col2")],
        pandas_df.loc["bar", ("col1", "col2")],
    )

    # From issue
#1456 transposed_modin = modin_df.T transposed_pandas = pandas_df.T df_equals( transposed_modin.loc[transposed_modin.index[:-2], :], transposed_pandas.loc[transposed_pandas.index[:-2], :], ) # From issue #1610 df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index]) df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]]) def test_loc_multi_index_with_tuples(): arrays = [ ["bar", "bar", "baz", "baz"], ["one", "two", "one", "two"], ] nrows = 5 columns = pd.MultiIndex.from_tuples(zip(*arrays), names=["a", "b"]) data = np.arange(0, nrows * len(columns)).reshape(nrows, len(columns)) modin_df, pandas_df = create_test_dfs(data, columns=columns) eval_general(modin_df, pandas_df, lambda df: df.loc[:, ("bar", "two")]) def test_loc_multi_index_rows_with_tuples_5721(): arrays = [ ["bar", "bar", "baz", "baz"], ["one", "two", "one", "two"], ] ncols = 5 index = pd.MultiIndex.from_tuples(zip(*arrays), names=["a", "b"]) data = np.arange(0, ncols * len(index)).reshape(len(index), ncols) modin_df, pandas_df = create_test_dfs(data, index=index) eval_general(modin_df, pandas_df, lambda df: df.loc[("bar",)]) eval_general(modin_df, pandas_df, lambda df: df.loc[("bar", "two")]) def test_loc_multi_index_level_two_has_same_name_as_column(): eval_general( *create_test_dfs( pandas.DataFrame( [[0]], index=[pd.Index(["foo"]), pd.Index(["bar"])], columns=["bar"] ) ), lambda df: df.loc[("foo", "bar")], ) def test_loc_multi_index_duplicate_keys(): modin_df, pandas_df = create_test_dfs([1, 2], index=[["a", "a"], ["b", "b"]]) eval_general(modin_df, pandas_df, lambda df: df.loc[("a", "b"), 0]) eval_general(modin_df, pandas_df, lambda df: df.loc[("a", "b"), :]) def test_loc_multi_index_both_axes(): multi_index = pd.MultiIndex.from_tuples( [("r0", "rA"), ("r1", "rB")], names=["Courses", "Fee"] ) cols = pd.MultiIndex.from_tuples( [ ("Gasoline", "Toyota"), ("Gasoline", "Ford"), ("Electric", "Tesla"), ("Electric", "Nio"), ] ) data = [[100, 300, 900, 400], [200, 500, 
# Remaining elements of the `data` literal of `test_loc_multi_index_both_axes`,
# opened on the previous line.
    300, 600]]
    modin_df, pandas_df = create_test_dfs(data, columns=cols, index=multi_index)
    eval_general(modin_df, pandas_df, lambda df: df.loc[("r0", "rA"), :])
    eval_general(modin_df, pandas_df, lambda df: df.loc[:, ("Gasoline", "Toyota")])


def test_loc_empty():
    """Read and write row label 1 of a frame that has an index but no columns."""
    pandas_df = pandas.DataFrame(index=range(5))
    modin_df = pd.DataFrame(index=range(5))

    df_equals(pandas_df.loc[1], modin_df.loc[1])
    pandas_df.loc[1] = 3
    modin_df.loc[1] = 3
    df_equals(pandas_df, modin_df)


@pytest.mark.parametrize("locator_name", ["iloc", "loc"])
def test_loc_iloc_2064(locator_name):
    """Setting ``[1]`` on a frame with columns but no rows must raise."""
    modin_df, pandas_df = create_test_dfs(columns=["col1", "col2"])

    if locator_name == "iloc":
        expected_exception = IndexError(
            "index 1 is out of bounds for axis 0 with size 0"
        )
    else:
        # The dtype named in the expected message depends on `os.name`.
        _type = "int32" if os.name == "nt" else "int64"
        expected_exception = KeyError(
            f"None of [Index([1], dtype='{_type}')] are in the [index]"
        )
    eval_general(
        modin_df,
        pandas_df,
        lambda df: getattr(df, locator_name).__setitem__([1], [11, 22]),
        __inplace__=True,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("index", [["row1", "row2", "row3"]])
@pytest.mark.parametrize("columns", [["col1", "col2"]])
def test_loc_assignment(index, columns):
    """Cell-by-cell chained assignment ``df.loc[row][col] = value`` on both frames."""
    md_df, pd_df = create_test_dfs(index=index, columns=columns)
    for i, ind in enumerate(index):
        for j, col in enumerate(columns):
            # Value is the row and column positions concatenated as digits.
            value_to_assign = int(str(i) + str(j))
            md_df.loc[ind][col] = value_to_assign
            pd_df.loc[ind][col] = value_to_assign
    df_equals(md_df, pd_df)


@pytest.mark.parametrize("left, right", [(2, 1), (6, 1), (lambda df: 70, 1), (90, 70)])
def test_loc_insert_row(left, right):
    """Assign ``df.loc[right]`` to ``df.loc[left]``, where `left` may be a new label."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3764
    pandas_df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
    modin_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])

    def _test_loc_rows(df):
        df.loc[left] = df.loc[right]
        return df

    expected_exception = None
    if right == 70:
        pytest.xfail(reason="https://github.com/modin-project/modin/issues/7024")

    eval_general(
        modin_df, pandas_df,
# Remaining arguments of the `eval_general(` call that closes
# `test_loc_insert_row`, opened on the previous line.
        _test_loc_rows, expected_exception=expected_exception
    )


@pytest.mark.parametrize(
    "columns", [10, (100, 102), (2, 6), [10, 11, 12], "a", ["b", "c", "d"]]
)
def test_loc_insert_col(columns):
    """Assign ``1`` through ``df.loc[:, columns]`` for labels the frame may not have."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3764
    pandas_df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
    modin_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])

    # A 2-tuple parameter is used as the bounds of a label slice.
    if isinstance(columns, tuple) and len(columns) == 2:

        def _test_loc_cols(df):
            df.loc[:, columns[0] : columns[1]] = 1

    else:

        def _test_loc_cols(df):
            df.loc[:, columns] = 1

    eval_general(modin_df, pandas_df, _test_loc_cols)


@pytest.fixture
def loc_iter_dfs():
    """Return the ``create_test_dfs`` pair for a 3x3 frame filled with the column position."""
    columns = ["col1", "col2", "col3"]
    index = ["row1", "row2", "row3"]
    return create_test_dfs(
        {col: ([idx] * len(index)) for idx, col in enumerate(columns)},
        columns=columns,
        index=index,
    )


@pytest.mark.parametrize("reverse_order", [False, True])
@pytest.mark.parametrize("axis", [0, 1])
def test_loc_iter_assignment(loc_iter_dfs, reverse_order, axis):
    """Double a ``.loc`` selection in place, with sorted or reverse-sorted labels."""
    if reverse_order and axis:
        pytest.xfail(
            "Due to internal sorting of lookup values assignment order is lost, see GH-#2552"
        )

    md_df, pd_df = loc_iter_dfs

    # Select everything on the other axis and all but the last label on `axis`.
    select = [slice(None), slice(None)]
    select[axis] = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
    select = tuple(select)

    pd_df.loc[select] = pd_df.loc[select] + pd_df.loc[select]
    md_df.loc[select] = md_df.loc[select] + md_df.loc[select]
    df_equals(md_df, pd_df)


@pytest.mark.parametrize("reverse_order", [False, True])
@pytest.mark.parametrize("axis", [0, 1])
def test_loc_order(loc_iter_dfs, reverse_order, axis):
    """Compare ``.loc`` reads for a sorted or reverse-sorted list of labels."""
    md_df, pd_df = loc_iter_dfs

    # Same selection as in `test_loc_iter_assignment`, read-only here.
    select = [slice(None), slice(None)]
    select[axis] = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
    select = tuple(select)

    df_equals(pd_df.loc[select], md_df.loc[select])


@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc_nested_assignment(data):
    """Chained assignment ``df[col].loc[0] = value`` on both frames."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    key1 = modin_df.columns[0]
    key2 = modin_df.columns[1]
# Continuation of `test_loc_nested_assignment`, whose header and key setup are
# on the previous line.
    modin_df[key1].loc[0] = 500
    pandas_df[key1].loc[0] = 500
    df_equals(modin_df, pandas_df)

    modin_df[key2].loc[0] = None
    pandas_df[key2].loc[0] = None
    df_equals(modin_df, pandas_df)


def test_iloc_assignment():
    """Chained ``df.iloc[row][col] = value`` with scalar and callable keys."""
    modin_df = pd.DataFrame(index=["row1", "row2", "row3"], columns=["col1", "col2"])
    pandas_df = pandas.DataFrame(
        index=["row1", "row2", "row3"], columns=["col1", "col2"]
    )
    modin_df.iloc[0]["col1"] = 11
    modin_df.iloc[1]["col1"] = 21
    modin_df.iloc[2]["col1"] = 31
    modin_df.iloc[lambda df: 0]["col2"] = 12
    modin_df.iloc[1][lambda df: ["col2"]] = 22
    modin_df.iloc[lambda df: 2][lambda df: ["col2"]] = 32
    pandas_df.iloc[0]["col1"] = 11
    pandas_df.iloc[1]["col1"] = 21
    pandas_df.iloc[2]["col1"] = 31
    pandas_df.iloc[lambda df: 0]["col2"] = 12
    pandas_df.iloc[1][lambda df: ["col2"]] = 22
    pandas_df.iloc[lambda df: 2][lambda df: ["col2"]] = 32

    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iloc_nested_assignment(data):
    """Chained assignment ``df[col].iloc[0] = value`` on both frames."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    key1 = modin_df.columns[0]
    key2 = modin_df.columns[1]

    modin_df[key1].iloc[0] = 500
    pandas_df[key1].iloc[0] = 500
    df_equals(modin_df, pandas_df)

    modin_df[key2].iloc[0] = None
    pandas_df[key2].iloc[0] = None
    df_equals(modin_df, pandas_df)


def test_iloc_empty():
    """Read and write position 1 of a frame that has an index but no columns."""
    pandas_df = pandas.DataFrame(index=range(5))
    modin_df = pd.DataFrame(index=range(5))

    df_equals(pandas_df.iloc[1], modin_df.iloc[1])
    pandas_df.iloc[1] = 3
    modin_df.iloc[1] = 3
    df_equals(pandas_df, modin_df)


def test_iloc_loc_key_length_except():
    """A two-element key on a Series must raise ``IndexingError`` for ``iloc`` and ``loc``."""
    modin_ser, pandas_ser = pd.Series(0), pandas.Series(0)
    eval_general(
        modin_ser,
        pandas_ser,
        lambda ser: ser.iloc[0, 0],
        expected_exception=pandas.errors.IndexingError("Too many indexers"),
    )
    eval_general(
        modin_ser,
        pandas_ser,
        lambda ser: ser.loc[0, 0],
        expected_exception=pandas.errors.IndexingError("Too many indexers"),
    )


def test_loc_series():
    """Assign a Series through ``.loc`` using a boolean row mask and a column label."""
    md_df, pd_df = create_test_dfs({"a": [1, 2], "b": [3, 4]})

    pd_df.loc[pd_df["a"] > 1, "b"] = np.log(pd_df["b"])
    md_df.loc[md_df["a"] > 1, "b"] = np.log(md_df["b"])

    df_equals(pd_df, md_df)


@pytest.mark.parametrize("locator_name", ["loc", "iloc"])
@pytest.mark.parametrize(
    "slice_indexer",
    [
        slice(None, None, -2),
        slice(1, 10, None),
        slice(None, 10, None),
        slice(10, None, None),
        slice(10, None, -2),
        slice(-10, None, -2),
        slice(None, 1_000_000_000, None),
    ],
)
def test_loc_iloc_slice_indexer(locator_name, slice_indexer):
    """Compare slice reads through ``loc``/``iloc`` on a frame with a shifted index."""
    md_df, pd_df = create_test_dfs(test_data_values[0])
    # Shifting the index, so labels won't match its position
    shifted_index = pandas.RangeIndex(1, len(md_df) + 1)
    md_df.index = shifted_index
    pd_df.index = shifted_index

    eval_general(md_df, pd_df, lambda df: getattr(df, locator_name)[slice_indexer])


@pytest.mark.parametrize(
    "indexer_size",
    [
        1,
        2,
        NROWS,
        # One past NROWS is a strict xfail.
        pytest.param(
            NROWS + 1,
            marks=pytest.mark.xfail(
                reason="https://github.com/modin-project/modin/issues/5739", strict=True
            ),
        ),
    ],
)
class TestLocRangeLikeIndexer:
    """Test cases related to https://github.com/modin-project/modin/issues/5702"""

    # Each method reads `test_data["int_data"]` through `.loc` with a
    # range-like row indexer of `indexer_size` elements; the variants differ
    # in the indexer type (`pd.RangeIndex` vs `range`) and in whether a
    # column slice is also passed.

    def test_range_index_getitem_single_value(self, indexer_size):
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: df.loc[pd.RangeIndex(indexer_size)],
        )

    def test_range_index_getitem_two_values(self, indexer_size):
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: df.loc[pd.RangeIndex(indexer_size), :],
        )

    def test_range_getitem_single_value(self, indexer_size):
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: df.loc[range(indexer_size)],
        )

    def test_range_getitem_two_values_5702(self, indexer_size):
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: df.loc[range(indexer_size), :],
        )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_pop(request, data):
    """Pop the first column from copies of both frames and compare the results."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Nothing is checked for test ids containing "empty_data".
    if "empty_data" not in request.node.name:
        key = modin_df.columns[0]
        temp_modin_df = modin_df.copy()
        temp_pandas_df = pandas_df.copy()
        modin_popped = temp_modin_df.pop(key)
        pandas_popped = temp_pandas_df.pop(key)
        df_equals(modin_popped, pandas_popped)
        df_equals(temp_modin_df, temp_pandas_df)


def test_reindex():
    """Compare ``reindex`` along rows, columns and both, including absent labels."""
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [0, 0, 0, 0],
    }
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    df_equals(modin_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1]))
    df_equals(modin_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2]))
    df_equals(
        modin_df.reindex(["col1", "col3", "col4", "col2"], axis=1),
        pandas_df.reindex(["col1", "col3", "col4", "col2"], axis=1),
    )
    df_equals(
        modin_df.reindex(["col1", "col7", "col4", "col8"], axis=1),
        pandas_df.reindex(["col1", "col7", "col4", "col8"], axis=1),
    )
    df_equals(
        modin_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
        pandas_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
    )
    df_equals(
        modin_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0),
        pandas_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0),
    )


def test_reindex_4438():
    """Compare ``reindex`` with reversed datetime and MultiIndex labels on either axis."""
    index = pd.date_range(end="1/1/2018", periods=3, freq="h", name="some meta")
    new_index = list(reversed(index))

    # index case
    modin_df = pd.DataFrame([1, 2, 3], index=index)
    pandas_df = pandas.DataFrame([1, 2, 3], index=index)
    new_modin_df = modin_df.reindex(new_index)
    new_pandas_df = pandas_df.reindex(new_index)
    df_equals(new_modin_df, new_pandas_df)

    # column case
    modin_df = pd.DataFrame(np.array([[1], [2], [3]]).T, columns=index)
    pandas_df = pandas.DataFrame(np.array([[1], [2], [3]]).T, columns=index)
    new_modin_df = modin_df.reindex(columns=new_index)
    new_pandas_df = pandas_df.reindex(columns=new_index)
    df_equals(new_modin_df, new_pandas_df)

    # multiindex case
    multi_index = pandas.MultiIndex.from_arrays(
        [("a", "b", "c"), ("a", "b", "c")], names=["first", "second"]
    )
    new_multi_index = list(reversed(multi_index))

    modin_df = pd.DataFrame([1, 2, 3], index=multi_index)
# Continuation of the "multiindex case" of `test_reindex_4438`; `modin_df` was
# built on the previous line.
    pandas_df = pandas.DataFrame([1, 2, 3], index=multi_index)
    new_modin_df = modin_df.reindex(new_multi_index)
    new_pandas_df = pandas_df.reindex(new_multi_index)
    df_equals(new_modin_df, new_pandas_df)

    # multicolumn case
    modin_df = pd.DataFrame(np.array([[1], [2], [3]]).T, columns=multi_index)
    pandas_df = pandas.DataFrame(np.array([[1], [2], [3]]).T, columns=multi_index)
    new_modin_df = modin_df.reindex(columns=new_multi_index)
    new_pandas_df = pandas_df.reindex(columns=new_multi_index)
    df_equals(new_modin_df, new_pandas_df)

    # index + multiindex case
    modin_df = pd.DataFrame([1, 2, 3], index=index)
    pandas_df = pandas.DataFrame([1, 2, 3], index=index)
    new_modin_df = modin_df.reindex(new_multi_index)
    new_pandas_df = pandas_df.reindex(new_multi_index)
    df_equals(new_modin_df, new_pandas_df)


def test_reindex_like():
    """Compare ``reindex_like`` where the target has an extra column and an extra date."""
    o_data = [
        [24.3, 75.7, "high"],
        [31, 87.8, "high"],
        [22, 71.6, "medium"],
        [35, 95, "medium"],
    ]
    o_columns = ["temp_celsius", "temp_fahrenheit", "windspeed"]
    o_index = pd.date_range(start="2014-02-12", end="2014-02-15", freq="D")
    new_data = [[28, "low"], [30, "low"], [35.1, "medium"]]
    new_columns = ["temp_celsius", "windspeed"]
    new_index = pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"])
    modin_df1 = pd.DataFrame(o_data, columns=o_columns, index=o_index)
    modin_df2 = pd.DataFrame(new_data, columns=new_columns, index=new_index)
    modin_result = modin_df2.reindex_like(modin_df1)

    pandas_df1 = pandas.DataFrame(o_data, columns=o_columns, index=o_index)
    pandas_df2 = pandas.DataFrame(new_data, columns=new_columns, index=new_index)
    pandas_result = pandas_df2.reindex_like(pandas_df1)
    df_equals(modin_result, pandas_result)


def test_rename_sanity():
    """Compare ``rename`` with dict and callable mappers on columns and index."""
    source_df = pandas.DataFrame(test_data["int_data"])[
        ["col1", "index", "col3", "col4"]
    ]
    mapping = {"col1": "a", "index": "b", "col3": "c", "col4": "d"}

    modin_df = pd.DataFrame(source_df)
    df_equals(modin_df.rename(columns=mapping), source_df.rename(columns=mapping))

    renamed2 = source_df.rename(columns=str.lower)
df_equals(modin_df.rename(columns=str.lower), renamed2) modin_df = pd.DataFrame(renamed2) df_equals(modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper)) # index data = {"A": {"foo": 0, "bar": 1}} # gets sorted alphabetical df = pandas.DataFrame(data) modin_df = pd.DataFrame(data) assert_index_equal( modin_df.rename(index={"foo": "bar", "bar": "foo"}).index, df.rename(index={"foo": "bar", "bar": "foo"}).index, ) assert_index_equal( modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index ) # Using the `mapper` functionality with `axis` assert_index_equal( modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index ) assert_index_equal( modin_df.rename(str.upper, axis=1).columns, df.rename(str.upper, axis=1).columns, ) assert_index_equal(modin_df.rename(str.upper).index, df.rename(str.upper).index) # have to pass something with pytest.raises(TypeError): modin_df.rename() # partial columns source_df.rename(columns={"col3": "foo", "col4": "bar"}) modin_df = pd.DataFrame(source_df) assert_index_equal( modin_df.rename(columns={"col3": "foo", "col4": "bar"}).index, source_df.rename(columns={"col3": "foo", "col4": "bar"}).index, ) # other axis source_df.T.rename(index={"col3": "foo", "col4": "bar"}) assert_index_equal( source_df.T.rename(index={"col3": "foo", "col4": "bar"}).index, modin_df.T.rename(index={"col3": "foo", "col4": "bar"}).index, ) # index with name index = pandas.Index(["foo", "bar"], name="name") renamer = pandas.DataFrame(data, index=index) modin_df = pd.DataFrame(data, index=index) renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"}) assert_index_equal(renamed.index, modin_renamed.index) assert renamed.index.name == modin_renamed.index.name def test_rename_multiindex(): tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] index = pandas.MultiIndex.from_tuples(tuples_index, 
names=["foo", "bar"])
    columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])

    # The same index/columns objects are used for the pandas and the Modin frame.
    frame_data = [(0, 0), (1, 1)]
    df = pandas.DataFrame(frame_data, index=index, columns=columns)
    modin_df = pd.DataFrame(frame_data, index=index, columns=columns)

    #
    # without specifying level -> across all levels
    renamed = df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    modin_renamed = modin_df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    assert_index_equal(renamed.index, modin_renamed.index)

    # NOTE(review): this recomputes `renamed` with exactly the same arguments as
    # above; it looks redundant but is kept as is.
    renamed = df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    assert_index_equal(renamed.columns, modin_renamed.columns)
    assert renamed.index.names == modin_renamed.index.names
    assert renamed.columns.names == modin_renamed.columns.names

    #
    # with specifying a level

    # dict
    # each column level is addressed once by position and once by name
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0
    )
    assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz"
    )
    assert_index_equal(renamed.columns, modin_renamed.columns)

    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1
    )
    assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz"
    )
    assert_index_equal(renamed.columns, modin_renamed.columns)

    # function
    func = str.upper
    renamed = df.rename(columns=func, level=0)
    modin_renamed = modin_df.rename(columns=func, level=0)
    assert_index_equal(renamed.columns,
modin_renamed.columns) renamed = df.rename(columns=func, level="fizz") modin_renamed = modin_df.rename(columns=func, level="fizz") assert_index_equal(renamed.columns, modin_renamed.columns) renamed = df.rename(columns=func, level=1) modin_renamed = modin_df.rename(columns=func, level=1) assert_index_equal(renamed.columns, modin_renamed.columns) renamed = df.rename(columns=func, level="buzz") modin_renamed = modin_df.rename(columns=func, level="buzz") assert_index_equal(renamed.columns, modin_renamed.columns) # index renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) modin_renamed = modin_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) assert_index_equal(modin_renamed.index, renamed.index) @pytest.mark.xfail(reason="Pandas does not pass this test") def test_rename_nocopy(): source_df = pandas.DataFrame(test_data["int_data"])[ ["col1", "index", "col3", "col4"] ] modin_df = pd.DataFrame(source_df) modin_renamed = modin_df.rename(columns={"col3": "foo"}, copy=False) modin_renamed["foo"] = 1 assert (modin_df["col3"] == 1).all() def test_rename_inplace(): source_df = pandas.DataFrame(test_data["int_data"])[ ["col1", "index", "col3", "col4"] ] modin_df = pd.DataFrame(source_df) df_equals( modin_df.rename(columns={"col3": "foo"}), source_df.rename(columns={"col3": "foo"}), ) frame = source_df.copy() modin_frame = modin_df.copy() frame.rename(columns={"col3": "foo"}, inplace=True) modin_frame.rename(columns={"col3": "foo"}, inplace=True) df_equals(modin_frame, frame) def test_rename_bug(): # rename set ref_locs, and set_index was not resetting frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]} df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) df = df.rename(columns={0: "a"}) df = df.rename(columns={1: "b"}) df = df.set_index(["a", "b"]) df.columns = ["2001-01-01"] modin_df = modin_df.rename(columns={0: "a"}) modin_df = modin_df.rename(columns={1: "b"}) modin_df = modin_df.set_index(["a", "b"]) modin_df.columns = 
["2001-01-01"] df_equals(modin_df, df) def test_index_to_datetime_using_set_index(): data = {"YEAR": ["1992", "1993", "1994"], "ALIENS": [1, 99, 1]} modin_df_years = pd.DataFrame(data=data) df_years = pandas.DataFrame(data=data) modin_df_years = modin_df_years.set_index("YEAR") df_years = df_years.set_index("YEAR") modin_datetime_index = pd.to_datetime(modin_df_years.index, format="%Y") pandas_datetime_index = pandas.to_datetime(df_years.index, format="%Y") modin_df_years.index = modin_datetime_index df_years.index = pandas_datetime_index modin_df_years.set_index(modin_datetime_index) df_years.set_index(pandas_datetime_index) df_equals(modin_df_years, df_years) def test_rename_axis(): data = {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]} index = ["dog", "cat", "monkey"] modin_df = pd.DataFrame(data, index) pandas_df = pandas.DataFrame(data, index) df_equals(modin_df.rename_axis("animal"), pandas_df.rename_axis("animal")) df_equals( modin_df.rename_axis("limbs", axis="columns"), pandas_df.rename_axis("limbs", axis="columns"), ) modin_df.rename_axis("limbs", axis="columns", inplace=True) pandas_df.rename_axis("limbs", axis="columns", inplace=True) df_equals(modin_df, pandas_df) new_index = pd.MultiIndex.from_product( [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"] ) modin_df.index = new_index pandas_df.index = new_index df_equals( modin_df.rename_axis(index={"type": "class"}), pandas_df.rename_axis(index={"type": "class"}), ) df_equals( modin_df.rename_axis(columns=str.upper), pandas_df.rename_axis(columns=str.upper), ) df_equals( modin_df.rename_axis(columns=[str.upper(o) for o in modin_df.columns.names]), pandas_df.rename_axis(columns=[str.upper(o) for o in pandas_df.columns.names]), ) with pytest.raises(ValueError): df_equals( modin_df.rename_axis(str.upper, axis=1), pandas_df.rename_axis(str.upper, axis=1), ) def test_rename_axis_inplace(): test_frame = pandas.DataFrame(test_data["int_data"]) modin_df = pd.DataFrame(test_frame) result = 
test_frame.copy() modin_result = modin_df.copy() no_return = result.rename_axis("foo", inplace=True) modin_no_return = modin_result.rename_axis("foo", inplace=True) assert no_return is modin_no_return df_equals(modin_result, result) result = test_frame.copy() modin_result = modin_df.copy() no_return = result.rename_axis("bar", axis=1, inplace=True) modin_no_return = modin_result.rename_axis("bar", axis=1, inplace=True) assert no_return is modin_no_return df_equals(modin_result, result) def test_rename_issue5600(): # Check the issue for more details # https://github.com/modin-project/modin/issues/5600 df = pd.DataFrame({"a": [1, 2]}) df_renamed = df.rename(columns={"a": "new_a"}, copy=True, inplace=False) # Check that the source frame was untouched assert df.dtypes.keys().tolist() == ["a"] assert df.columns.tolist() == ["a"] assert df_renamed.dtypes.keys().tolist() == ["new_a"] assert df_renamed.columns.tolist() == ["new_a"] def test_reorder_levels(): data = np.random.randint(1, 100, 12) modin_df = pd.DataFrame( data, index=pd.MultiIndex.from_tuples( [ (num, letter, color) for num in range(1, 3) for letter in ["a", "b", "c"] for color in ["Red", "Green"] ], names=["Number", "Letter", "Color"], ), ) pandas_df = pandas.DataFrame( data, index=pandas.MultiIndex.from_tuples( [ (num, letter, color) for num in range(1, 3) for letter in ["a", "b", "c"] for color in ["Red", "Green"] ], names=["Number", "Letter", "Color"], ), ) df_equals( modin_df.reorder_levels(["Letter", "Color", "Number"]), pandas_df.reorder_levels(["Letter", "Color", "Number"]), ) def test_reindex_multiindex(): data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6) index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"]) modin_midx = pd.MultiIndex.from_product( [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"] ) pandas_midx = pandas.MultiIndex.from_product( [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"] ) modin_df1, modin_df2 = ( 
pd.DataFrame(data=data1, index=index, columns=index), pd.DataFrame(data2, modin_midx), ) pandas_df1, pandas_df2 = ( pandas.DataFrame(data=data1, index=index, columns=index), pandas.DataFrame(data2, pandas_midx), ) modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"] md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index]) pd_midx = pandas.MultiIndex.from_product( [pandas_df2.index.levels[0], pandas_df1.index] ) # reindex without axis, index, or columns modin_result = modin_df1.reindex(md_midx, fill_value=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0) df_equals(modin_result, pandas_result) # reindex with only axis modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0) df_equals(modin_result, pandas_result) # reindex with axis and level modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("test_async_reset_index", [False, True]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_reset_index(data, test_async_reset_index): modin_df, pandas_df = create_test_dfs(data) if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) modin_result = modin_df.reset_index(inplace=False) pandas_result = pandas_df.reset_index(inplace=False) df_equals(modin_result, pandas_result) modin_df_cp = modin_df.copy() pd_df_cp = pandas_df.copy() if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) modin_df_cp.reset_index(inplace=True) pd_df_cp.reset_index(inplace=True) df_equals(modin_df_cp, pd_df_cp) @pytest.mark.parametrize( "data", [ test_data["int_data"], test_data["float_nan_data"], ], ) def test_reset_index_multiindex_groupby(data): # GH#4394 modin_df, pandas_df = create_test_dfs(data) modin_df.index = pd.MultiIndex.from_tuples( 
[(i // 10, i // 5, i) for i in range(len(modin_df))]
    )
    pandas_df.index = pandas.MultiIndex.from_tuples(
        [(i // 10, i // 5, i) for i in range(len(pandas_df))]
    )
    eval_general(
        modin_df,
        pandas_df,
        # group by the first two columns of the frame as it is after reset_index
        lambda df: df.reset_index().groupby(list(df.columns[:2])).count(),
    )


@pytest.mark.parametrize("test_async_reset_index", [False, True])
@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            test_data["int_data"],
            marks=pytest.mark.exclude_by_default,
        ),
        test_data["float_nan_data"],
    ],
    ids=["int_data", "float_nan_data"],
)
@pytest.mark.parametrize("nlevels", [3])
@pytest.mark.parametrize("columns_multiindex", [True, False])
@pytest.mark.parametrize(
    "level",
    [
        # "no_level" is a sentinel: the `level` argument is not passed at all
        "no_level",
        None,
        0,
        1,
        2,
        [2, 0],
        [2, 1],
        [1, 0],
        pytest.param(
            [2, 1, 2],
            marks=pytest.mark.exclude_by_default,
        ),
        pytest.param(
            [0, 0, 0, 0],
            marks=pytest.mark.exclude_by_default,
        ),
        # "level_name_<k>" entries are replaced in the test body by the name of
        # index level <k>
        pytest.param(
            ["level_name_1"],
            marks=pytest.mark.exclude_by_default,
        ),
        pytest.param(
            ["level_name_2", "level_name_1"],
            marks=pytest.mark.exclude_by_default,
        ),
        pytest.param(
            [2, "level_name_0"],
            marks=pytest.mark.exclude_by_default,
        ),
    ],
)
# "no_col_level" / "no_col_fill" are sentinels: the argument is not passed
@pytest.mark.parametrize("col_level", ["no_col_level", 0, 1, 2])
@pytest.mark.parametrize("col_fill", ["no_col_fill", None, 0, "new"])
@pytest.mark.parametrize("drop", [False])
@pytest.mark.parametrize(
    "multiindex_levels_names_max_levels",
    [
        0,
        1,
        2,
        pytest.param(3, marks=pytest.mark.exclude_by_default),
        pytest.param(4, marks=pytest.mark.exclude_by_default),
    ],
)
@pytest.mark.parametrize(
    "none_in_index_names",
    [
        pytest.param(
            False,
            marks=pytest.mark.exclude_by_default,
        ),
        True,
        "mixed_1st_None",
        pytest.param(
            "mixed_2nd_None",
            marks=pytest.mark.exclude_by_default,
        ),
    ],
)
def test_reset_index_with_multi_index_no_drop(
    data,
    nlevels,
    columns_multiindex,
    level,
    col_level,
    col_fill,
    drop,
    multiindex_levels_names_max_levels,
    none_in_index_names,
    test_async_reset_index,
):
    """Compare ``reset_index`` on a frame with a generated MultiIndex.

    The ``level``, ``col_level`` and ``col_fill`` arguments are forwarded to
    ``reset_index`` only when they differ from their ``"no_*"`` sentinel;
    ``drop`` is always forwarded.
    """
    data_rows = len(data[list(data.keys())[0]])
    index = generate_multiindex(data_rows, nlevels=nlevels)
    data_columns = len(data.keys())
columns = ( generate_multiindex(data_columns, nlevels=nlevels) if columns_multiindex else pandas.RangeIndex(0, data_columns) ) # Replace original data columns with generated data = {columns[ind]: data[key] for ind, key in enumerate(data)} index.names = ( [f"level_{i}" for i in range(index.nlevels)] if multiindex_levels_names_max_levels == 0 else [ ( tuple( [ f"level_{i}_name_{j}" for j in range( 0, max( multiindex_levels_names_max_levels + 1 - index.nlevels, 0, ) + i, ) ] ) if max(multiindex_levels_names_max_levels + 1 - index.nlevels, 0) + i > 0 else f"level_{i}" ) for i in range(index.nlevels) ] ) if none_in_index_names is True: index.names = [None] * len(index.names) elif none_in_index_names: names_list = list(index.names) start_index = 0 if none_in_index_names == "mixed_1st_None" else 1 names_list[start_index::2] = [None] * len(names_list[start_index::2]) index.names = names_list modin_df = pd.DataFrame(data, index=index, columns=columns) pandas_df = pandas.DataFrame(data, index=index, columns=columns) if isinstance(level, list): level = [ ( index.names[int(x[len("level_name_") :])] if isinstance(x, str) and x.startswith("level_name_") else x ) for x in level ] kwargs = {"drop": drop} if level != "no_level": kwargs["level"] = level if col_level != "no_col_level": kwargs["col_level"] = col_level if col_fill != "no_col_fill": kwargs["col_fill"] = col_fill if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) eval_general( modin_df, pandas_df, lambda df: df.reset_index(**kwargs), # https://github.com/modin-project/modin/issues/5960 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("test_async_reset_index", [False, True]) @pytest.mark.parametrize( "data", [ pytest.param( test_data["int_data"], marks=pytest.mark.exclude_by_default, ), test_data["float_nan_data"], ], ids=["int_data", "float_nan_data"], ) @pytest.mark.parametrize("nlevels", [3]) @pytest.mark.parametrize( "level", [ "no_level", None, 0, 1, 2, [2, 0], [2, 
1], [1, 0], pytest.param( [2, 1, 2], marks=pytest.mark.exclude_by_default, ), pytest.param( [0, 0, 0, 0], marks=pytest.mark.exclude_by_default, ), pytest.param( ["level_name_1"], marks=pytest.mark.exclude_by_default, ), pytest.param( ["level_name_2", "level_name_1"], marks=pytest.mark.exclude_by_default, ), pytest.param( [2, "level_name_0"], marks=pytest.mark.exclude_by_default, ), ], ) @pytest.mark.parametrize( "multiindex_levels_names_max_levels", [ 0, 1, 2, pytest.param(3, marks=pytest.mark.exclude_by_default), pytest.param(4, marks=pytest.mark.exclude_by_default), ], ) @pytest.mark.parametrize( "none_in_index_names", [ pytest.param( False, marks=pytest.mark.exclude_by_default, ), True, "mixed_1st_None", pytest.param( "mixed_2nd_None", marks=pytest.mark.exclude_by_default, ), ], ) def test_reset_index_with_multi_index_drop( data, nlevels, level, multiindex_levels_names_max_levels, none_in_index_names, test_async_reset_index, ): test_reset_index_with_multi_index_no_drop( data, nlevels, True, level, "no_col_level", "no_col_fill", True, multiindex_levels_names_max_levels, none_in_index_names, test_async_reset_index, ) @pytest.mark.parametrize("test_async_reset_index", [False, True]) @pytest.mark.parametrize("index_levels_names_max_levels", [0, 1, 2]) def test_reset_index_with_named_index( index_levels_names_max_levels, test_async_reset_index ): modin_df = pd.DataFrame(test_data_values[0]) pandas_df = pandas.DataFrame(test_data_values[0]) index_name = ( tuple([f"name_{j}" for j in range(0, index_levels_names_max_levels)]) if index_levels_names_max_levels > 0 else "NAME_OF_INDEX" ) modin_df.index.name = pandas_df.index.name = index_name df_equals(modin_df, pandas_df) if test_async_reset_index: # The change in index is not automatically handled by Modin. See #3941. 
# Body of test_reset_index_with_named_index continues here (header on the preceding line).
        modin_df.index = modin_df.index
        modin_df.modin.to_pandas()

        modin_df._query_compiler.set_frame_index_cache(None)
    # out-of-place reset, keeping the index as a column
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))

    if test_async_reset_index:
        # The change in index is not automatically handled by Modin. See #3941.
        modin_df.index = modin_df.index
        modin_df.modin.to_pandas()

        modin_df._query_compiler.set_frame_index_cache(None)
    # in-place reset, dropping the index
    modin_df.reset_index(drop=True, inplace=True)
    pandas_df.reset_index(drop=True, inplace=True)
    df_equals(modin_df, pandas_df)

    # Rebuild both frames, because the previous step reset them in place.
    modin_df = pd.DataFrame(test_data_values[0])
    pandas_df = pandas.DataFrame(test_data_values[0])
    modin_df.index.name = pandas_df.index.name = index_name
    if test_async_reset_index:
        # The change in index is not automatically handled by Modin. See #3941.
        modin_df.index = modin_df.index
        # NOTE(review): this uses `_to_pandas()` while the two blocks above use
        # `modin.to_pandas()` -- confirm whether the difference is intended.
        modin_df._to_pandas()

        modin_df._query_compiler.set_frame_index_cache(None)
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))


@pytest.mark.parametrize("test_async_reset_index", [False, True])
@pytest.mark.parametrize(
    "index",
    [
        pandas.Index([11, 22, 33, 44], name="col0"),
        pandas.MultiIndex.from_product(
            [[100, 200], [300, 400]], names=["level1", "col0"]
        ),
    ],
    ids=["index", "multiindex"],
)
def test_reset_index_metadata_update(index, test_async_reset_index):
    """Compare ``reset_index`` after the only data column has been relabelled.

    The frames are created with a column ``"col0"`` and an index (level) that
    is also named ``"col0"``; the column is renamed to ``"col1"`` before the
    reset.
    """
    modin_df, pandas_df = create_test_dfs({"col0": [0, 1, 2, 3]}, index=index)
    modin_df.columns = pandas_df.columns = ["col1"]
    if test_async_reset_index:
        # The change in index is not automatically handled by Modin. See #3941.
modin_df.index = modin_df.index modin_df._to_pandas() modin_df._query_compiler.set_frame_index_cache(None) eval_general(modin_df, pandas_df, lambda df: df.reset_index()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) def test_sample(data, axis): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) with pytest.raises(ValueError): modin_df.sample(n=3, frac=0.4, axis=axis) with pytest.raises(KeyError): modin_df.sample(frac=0.5, weights="CoLuMn_No_ExIsT", axis=0) with pytest.raises(ValueError): modin_df.sample(frac=0.5, weights=modin_df.columns[0], axis=1) with pytest.raises(ValueError): modin_df.sample( frac=0.5, weights=[0.5 for _ in range(len(modin_df.index[:-1]))], axis=0 ) with pytest.raises(ValueError): modin_df.sample( frac=0.5, weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], axis=1, ) with pytest.raises(ValueError): modin_df.sample(n=-3, axis=axis) with pytest.raises(ValueError): modin_df.sample(frac=0.2, weights=pandas.Series(), axis=axis) if isinstance(axis, str): num_axis = pandas.DataFrame()._get_axis_number(axis) else: num_axis = axis # weights that sum to 1 sums = sum(i % 2 for i in range(len(modin_df.axes[num_axis]))) weights = [i % 2 / sums for i in range(len(modin_df.axes[num_axis]))] modin_result = modin_df.sample( frac=0.5, random_state=42, weights=weights, axis=axis ) pandas_result = pandas_df.sample( frac=0.5, random_state=42, weights=weights, axis=axis ) df_equals(modin_result, pandas_result) # weights that don't sum to 1 weights = [i % 2 for i in range(len(modin_df.axes[num_axis]))] modin_result = modin_df.sample( frac=0.5, random_state=42, weights=weights, axis=axis ) pandas_result = pandas_df.sample( frac=0.5, random_state=42, weights=weights, axis=axis ) df_equals(modin_result, pandas_result) modin_result = modin_df.sample(n=0, axis=axis) pandas_result = pandas_df.sample(n=0, axis=axis) df_equals(modin_result, pandas_result) modin_result 
= modin_df.sample(frac=0.5, random_state=42, axis=axis) pandas_result = pandas_df.sample(frac=0.5, random_state=42, axis=axis) df_equals(modin_result, pandas_result) modin_result = modin_df.sample(n=2, random_state=42, axis=axis) pandas_result = pandas_df.sample(n=2, random_state=42, axis=axis) df_equals(modin_result, pandas_result) # issue #1692, numpy RandomState object # We must create a new random state for each iteration because the values that # are selected will be impacted if the object has already been used. random_state = np.random.RandomState(42) modin_result = modin_df.sample(frac=0.5, random_state=random_state, axis=axis) random_state = np.random.RandomState(42) pandas_result = pandas_df.sample(frac=0.5, random_state=random_state, axis=axis) df_equals(modin_result, pandas_result) def test_empty_sample(): modin_df, pandas_df = create_test_dfs([1]) # issue #4983 # If we have a fraction of the dataset that results in n=0, we should # make sure that we don't pass in both n and frac to sample internally. 
eval_general(modin_df, pandas_df, lambda df: df.sample(frac=0.12)) def test_select_dtypes(): frame_data = { "test1": list("abc"), "test2": np.arange(3, 6).astype("u1"), "test3": np.arange(8.0, 11.0, dtype="float64"), "test4": [True, False, True], "test5": pandas.date_range("now", periods=3).values, "test6": list(range(5, 8)), } df = pandas.DataFrame(frame_data) rd = pd.DataFrame(frame_data) include = np.float64, "integer" exclude = (np.bool_,) r = rd.select_dtypes(include=include, exclude=exclude) e = df[["test2", "test3", "test6"]] df_equals(r, e) r = rd.select_dtypes(include=np.bool_) e = df[["test4"]] df_equals(r, e) r = rd.select_dtypes(exclude=np.bool_) e = df[["test1", "test2", "test3", "test5", "test6"]] df_equals(r, e) try: pd.DataFrame().select_dtypes() assert False except ValueError: assert True @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) def test_tail(data, n): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) df_equals(modin_df.tail(n), pandas_df.tail(n)) df_equals(modin_df.tail(len(modin_df)), pandas_df.tail(len(pandas_df))) def test_xs(): # example is based on the doctest in the upstream pandas docstring data = { "num_legs": [4, 4, 2, 2], "num_wings": [0, 0, 2, 2], "class": ["mammal", "mammal", "mammal", "bird"], "animal": ["cat", "dog", "bat", "penguin"], "locomotion": ["walks", "walks", "flies", "walks"], } modin_df, pandas_df = create_test_dfs(data) def prepare_dataframes(df): # to make several partitions (only for Modin dataframe) df = (pd if isinstance(df, pd.DataFrame) else pandas).concat([df, df], axis=0) # looks like pandas is sorting the index whereas modin is not, performing a join operation. 
df = df.reset_index(drop=True) df = df.join(df, rsuffix="_y") return df.set_index(["class", "animal", "locomotion"]) modin_df = prepare_dataframes(modin_df) pandas_df = prepare_dataframes(pandas_df) eval_general(modin_df, pandas_df, lambda df: df.xs("mammal")) eval_general(modin_df, pandas_df, lambda df: df.xs("cat", level=1)) eval_general(modin_df, pandas_df, lambda df: df.xs("num_legs", axis=1)) eval_general( modin_df, pandas_df, lambda df: df.xs("cat", level=1, drop_level=False) ) eval_general(modin_df, pandas_df, lambda df: df.xs(("mammal", "cat"))) eval_general( modin_df, pandas_df, lambda df: df.xs(("mammal", "cat"), drop_level=False) ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___getitem__(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) key = modin_df.columns[0] modin_col = modin_df.__getitem__(key) assert isinstance(modin_col, pd.Series) pd_col = pandas_df[key] df_equals(pd_col, modin_col) slices = [ (None, -1), (-1, None), (1, 2), (1, None), (None, 1), (1, -1), (-3, -1), (1, -1, 2), (-1, 1, -1), (None, None, 2), ] # slice test for slice_param in slices: s = slice(*slice_param) df_equals(modin_df[s], pandas_df[s]) # Test empty df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___getitem_bool_indexers(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) indices = [i % 3 == 0 for i in range(len(modin_df.index))] columns = [i % 5 == 0 for i in range(len(modin_df.columns))] # Key is a list of booleans modin_result = modin_df.loc[indices, columns] pandas_result = pandas_df.loc[indices, columns] df_equals(modin_result, pandas_result) # Key is a Modin or pandas series of booleans df_equals( modin_df.loc[pd.Series(indices), pd.Series(columns, index=modin_df.columns)], pandas_df.loc[ pandas.Series(indices), pandas.Series(columns, index=modin_df.columns) ], ) def test_getitem_empty_mask(): # 
modin-project/modin#517 modin_frames = [] pandas_frames = [] data1 = np.random.randint(0, 100, size=(100, 4)) mdf1 = pd.DataFrame(data1, columns=list("ABCD")) pdf1 = pandas.DataFrame(data1, columns=list("ABCD")) modin_frames.append(mdf1) pandas_frames.append(pdf1) data2 = np.random.randint(0, 100, size=(100, 4)) mdf2 = pd.DataFrame(data2, columns=list("ABCD")) pdf2 = pandas.DataFrame(data2, columns=list("ABCD")) modin_frames.append(mdf2) pandas_frames.append(pdf2) data3 = np.random.randint(0, 100, size=(100, 4)) mdf3 = pd.DataFrame(data3, columns=list("ABCD")) pdf3 = pandas.DataFrame(data3, columns=list("ABCD")) modin_frames.append(mdf3) pandas_frames.append(pdf3) modin_data = pd.concat(modin_frames) pandas_data = pandas.concat(pandas_frames) df_equals( modin_data[[False for _ in modin_data.index]], pandas_data[[False for _ in modin_data.index]], ) def test_getitem_datetime_slice(): data = {"data": range(1000)} index = pd.date_range("2017/1/4", periods=1000) modin_df = pd.DataFrame(data=data, index=index) pandas_df = pandas.DataFrame(data=data, index=index) s = slice("2017-01-06", "2017-01-09") df_equals(modin_df[s], pandas_df[s]) def test_getitem_same_name(): data = [ [1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20], ] columns = ["c1", "c2", "c1", "c3"] modin_df = pd.DataFrame(data, columns=columns) pandas_df = pandas.DataFrame(data, columns=columns) df_equals(modin_df["c1"], pandas_df["c1"]) df_equals(modin_df["c2"], pandas_df["c2"]) df_equals(modin_df[["c1", "c2"]], pandas_df[["c1", "c2"]]) df_equals(modin_df["c3"], pandas_df["c3"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___getattr__(request, data): modin_df = pd.DataFrame(data) if "empty_data" not in request.node.name: key = modin_df.columns[0] modin_df.__getattr__(key) col = modin_df.__getattr__("col1") assert isinstance(col, pd.Series) col = getattr(modin_df, "col1") assert isinstance(col, pd.Series) # Check that lookup in column doesn't 
override other attributes df2 = modin_df.rename(index=str, columns={key: "columns"}) assert isinstance(df2.columns, pandas.Index) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___setitem__(data): eval_setitem(*create_test_dfs(data), loc=-1, value=1) eval_setitem( *create_test_dfs(data), loc=-1, value=lambda df: type(df)(df[df.columns[0]]) ) nrows = len(data[list(data.keys())[0]]) arr = np.arange(nrows * 2).reshape(-1, 2) eval_setitem(*create_test_dfs(data), loc=-1, value=arr) eval_setitem(*create_test_dfs(data), col="___NON EXISTENT COLUMN", value=arr.T[0]) eval_setitem(*create_test_dfs(data), loc=0, value=np.arange(nrows)) modin_df = pd.DataFrame(columns=data.keys()) pandas_df = pandas.DataFrame(columns=data.keys()) for col in modin_df.columns: modin_df[col] = np.arange(1000) for col in pandas_df.columns: pandas_df[col] = np.arange(1000) df_equals(modin_df, pandas_df) # Test df assignment to a columns selection modin_df[modin_df.columns[[0, -1]]] = modin_df[modin_df.columns[[0, -1]]] pandas_df[pandas_df.columns[[0, -1]]] = pandas_df[pandas_df.columns[[0, -1]]] df_equals(modin_df, pandas_df) # Test series assignment to column modin_df = pd.DataFrame(columns=modin_df.columns) pandas_df = pandas.DataFrame(columns=pandas_df.columns) modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]] pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]] df_equals(modin_df, pandas_df) if not sys.version_info.major == 3 and sys.version_info.minor > 6: # This test doesn't work correctly on Python 3.6 # Test 2d ndarray assignment to column modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values df_equals(modin_df, pandas_df) assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0])) modin_df[1:5] = 10 pandas_df[1:5] = 10 df_equals(modin_df, pandas_df) # Transpose test modin_df = 
pd.DataFrame(data).T pandas_df = pandas.DataFrame(data).T modin_df[modin_df.columns[0]] = 0 pandas_df[pandas_df.columns[0]] = 0 df_equals(modin_df, pandas_df) modin_df.columns = [str(i) for i in modin_df.columns] pandas_df.columns = [str(i) for i in pandas_df.columns] modin_df[modin_df.columns[0]] = 0 pandas_df[pandas_df.columns[0]] = 0 df_equals(modin_df, pandas_df) modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345 pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345 df_equals(modin_df, pandas_df) modin_df[1:5] = 10 pandas_df[1:5] = 10 df_equals(modin_df, pandas_df) def test___setitem__partitions_aligning(): # from issue #2390 modin_df = pd.DataFrame({"a": [1, 2, 3]}) pandas_df = pandas.DataFrame({"a": [1, 2, 3]}) modin_df["b"] = pd.Series([4, 5, 6, 7, 8]) pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8]) df_equals(modin_df, pandas_df) # from issue #2442 data = {"a": [1, 2, 3, 4]} # Index with duplicated timestamp index = pandas.to_datetime(["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"]) md_df, pd_df = create_test_dfs(data, index=index) # Setting new column pd_df["b"] = pandas.Series(np.arange(4)) md_df["b"] = pd.Series(np.arange(4)) df_equals(md_df, pd_df) # Setting existing column pd_df["b"] = pandas.Series(np.arange(4)) md_df["b"] = pd.Series(np.arange(4)) df_equals(md_df, pd_df) pd_df["a"] = pandas.Series(np.arange(4)) md_df["a"] = pd.Series(np.arange(4)) df_equals(md_df, pd_df) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",") modin_df = pd.read_csv(fname) pandas_df = pandas.read_csv(fname) modin_df["new"] = pd.Series(list(range(len(modin_df)))) pandas_df["new"] = pandas.Series(list(range(len(pandas_df)))) df_equals(modin_df, pandas_df) def test___setitem__mask(): # DataFrame mask: data = test_data["int_data"] modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) mean = int((RAND_HIGH + RAND_LOW) / 2) 
# Body of test___setitem__mask continues here (header on the preceding line).
    pandas_df[pandas_df > mean] = -50
    modin_df[modin_df > mean] = -50

    df_equals(modin_df, pandas_df)

    # Array mask:
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    # the same numpy mask, computed from the pandas frame, is used for both frames
    array = (pandas_df > mean).to_numpy()

    modin_df[array] = -50
    pandas_df[array] = -50

    df_equals(modin_df, pandas_df)

    # Array mask of wrong size:
    with pytest.raises(ValueError):
        array = np.array([[1, 2], [3, 4]])
        modin_df[array] = 20


@pytest.mark.parametrize(
    "data",
    [
        {},
        {"id": [], "max_speed": [], "health": []},
        {"id": [1], "max_speed": [2], "health": [3]},
        {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]},
    ],
    # NOTE(review): the last case has three rows but its id is "2_length_cols".
    ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"],
)
@pytest.mark.parametrize(
    "value",
    [[11, 22], [11, 22, 33]],
    ids=["2_length_val", "3_length_val"],
)
@pytest.mark.parametrize("convert_to_series", [False, True])
@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id):
    """Compare inserting a new column given as a list or as a Series.

    Parameters
    ----------
    data : dict
        Source data for both frames.
    value : list
        Values assigned to the new column.
    convert_to_series : bool
        Wrap ``value`` in a Series of the matching library before assigning.
    new_col_id : int or str
        Label of the new column.
    """
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    def applyier(df):
        # use the Series type that matches the library of the frame being modified
        if convert_to_series:
            converted_value = (
                pandas.Series(value)
                if isinstance(df, pandas.DataFrame)
                else pd.Series(value)
            )
        else:
            converted_value = value
        df[new_col_id] = converted_value
        return df

    # An exception to compare against is only supplied for plain list values.
    expected_exception = None
    if not convert_to_series:
        values_length = len(value)
        index_length = len(pandas_df.index)
        expected_exception = ValueError(
            f"Length of values ({values_length}) does not match length of index ({index_length})"
        )

    eval_general(
        modin_df,
        pandas_df,
        applyier,
        expected_exception=expected_exception,
        __inplace__=True,
    )

    # Because of https://github.com/modin-project/modin/issues/7600,
    # df_equals does not check dtypes equality for empty frames.
# NOTE(review): the statement below is the tail of a test whose header lies
# before this view; it is carried over unchanged.
assert_dtypes_equal(modin_df, pandas_df)


def test_setitem_on_empty_df_4407():
    """Assign a Series to a new column of an empty frame with datetime columns.

    Compares the Modin and pandas results and the ``freq`` of their columns.
    """
    data = {}
    # Zero-length daily range used as the (empty) column index.
    index = pd.date_range(end="1/1/2018", periods=0, freq="D")
    # Single hourly timestamp used as the label of the inserted column.
    column = pd.date_range(end="1/1/2018", periods=1, freq="h")[0]
    modin_df = pd.DataFrame(data, columns=index)
    pandas_df = pandas.DataFrame(data, columns=index)

    modin_df[column] = pd.Series([1])
    pandas_df[column] = pandas.Series([1])

    df_equals(modin_df, pandas_df)
    assert modin_df.columns.freq == pandas_df.columns.freq


def test_setitem_on_empty_df_does_not_change_other_dtypes_5961():
    """Overwrite ``col0`` of an empty two-column frame and compare dtypes."""

    def _do_setitem(df):
        """Replace ``col0`` with its float cast, in place."""
        df["col0"] = df["col0"].astype(float)

    modin_df, pandas_df = create_test_dfs(pandas.DataFrame(columns=["col0", "col1"]))
    _do_setitem(modin_df)
    _do_setitem(pandas_df)
    # Because of https://github.com/modin-project/modin/issues/7600, we cannot
    # use df_equals to check dtypes equality.
    assert_dtypes_equal(modin_df, pandas_df)


def test___setitem__unhashable_list():
    """Assign a frame slice back to itself using a list of labels as the key."""
    # from #3258 and #3291
    cols = ["a", "b"]
    modin_df = pd.DataFrame([[0, 0]], columns=cols)
    modin_df[cols] = modin_df[cols]
    pandas_df = pandas.DataFrame([[0, 0]], columns=cols)
    pandas_df[cols] = pandas_df[cols]
    df_equals(modin_df, pandas_df)


def test_setitem_unhashable_key():
    """Set several columns at once with list keys and a range of value types.

    Each case starts from a fresh deep copy of the source frames and is run
    for a key of two "col*" labels and a key of two "new_col*" labels.
    """
    source_modin_df, source_pandas_df = create_test_dfs(test_data["float_nan_data"])
    row_count = source_modin_df.shape[0]

    def _make_copy(df1, df2):
        """Return deep copies of both frames."""
        return df1.copy(deep=True), df2.copy(deep=True)

    for key in (["col1", "col2"], ["new_col1", "new_col2"]):
        # 1d list case
        value = [1, 2]
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(modin_df, pandas_df, value, key)

        # 2d list case
        value = [[1, 2]] * row_count
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(modin_df, pandas_df, value, key)

        # pandas DataFrame case
        df_value = pandas.DataFrame(value, columns=["value_col1", "value_col2"])
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(modin_df, pandas_df, df_value, key)

        # numpy array case
        value = df_value.to_numpy()
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(modin_df, pandas_df, value, key)

        # pandas Series case
        value = df_value["value_col1"]
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(
            modin_df,
            pandas_df,
            value,
            key[:1],
            expected_exception=ValueError("Columns must be same length as key"),
        )

        # pandas Index case
        value = df_value.index
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(
            modin_df,
            pandas_df,
            value,
            key[:1],
            expected_exception=ValueError("Columns must be same length as key"),
        )

        # scalar case
        value = 3
        modin_df, pandas_df = _make_copy(source_modin_df, source_pandas_df)
        eval_setitem(modin_df, pandas_df, value, key)

        # test failed case: ValueError('Columns must be same length as key')
        # (a one-column frame is assigned to a two-label key)
        eval_setitem(
            modin_df,
            pandas_df,
            df_value[["value_col1"]],
            key,
            expected_exception=ValueError("Columns must be same length as key"),
        )


def test_setitem_2d_insertion():
    """Insert two new columns at once from a two-column frame value."""

    def build_value_picker(modin_value, pandas_value):
        """Build a function that returns either Modin or pandas DataFrame depending on the passed frame."""
        return lambda source_df, *args, **kwargs: (
            modin_value
            if isinstance(source_df, (pd.DataFrame, pd.Series))
            else pandas_value
        )

    modin_df, pandas_df = create_test_dfs(test_data["int_data"])

    # Easy case - key and value.columns are equal
    modin_value, pandas_value = create_test_dfs(
        {"new_value1": np.arange(len(modin_df)), "new_value2": np.arange(len(modin_df))}
    )
    eval_setitem(
        modin_df,
        pandas_df,
        build_value_picker(modin_value, pandas_value),
        col=["new_value1", "new_value2"],
    )

    # Key and value.columns have equal values but in different order
    new_columns = ["new_value3", "new_value4"]
    modin_value.columns, pandas_value.columns = new_columns, new_columns
    eval_setitem(
        modin_df,
        pandas_df,
        build_value_picker(modin_value, pandas_value),
        col=["new_value4", "new_value3"],
    )

    # Key and value.columns have different values
    new_columns = ["new_value5", "new_value6"]
    modin_value.columns, pandas_value.columns = new_columns, new_columns
    eval_setitem(
        modin_df,
        pandas_df,
        build_value_picker(modin_value, pandas_value),
        col=["__new_value5", "__new_value6"],
    )

    # Key and value.columns have different lengths, testing that both raise the same exception
    eval_setitem(
        modin_df,
        pandas_df,
        build_value_picker(modin_value.iloc[:, [0]], pandas_value.iloc[:, [0]]),
        col=["new_value7", "new_value8"],
        expected_exception=ValueError("Columns must be same length as key"),
    )


@pytest.mark.parametrize("does_value_have_different_columns", [True, False])
def test_setitem_2d_update(does_value_have_different_columns):
    """Overwrite existing columns, picked positionally, from a second frame.

    The second frame holds the same data multiplied by 10 and, when the
    parameter is true, has every column label suffixed with "_new".
    """

    def test(dfs, iloc):
        """Update columns on the given numeric indices."""
        df1, df2 = dfs
        cols1 = df1.columns[iloc].tolist()
        cols2 = df2.columns[iloc].tolist()
        df1[cols1] = df2[cols2]
        return df1

    modin_df, pandas_df = create_test_dfs(test_data["int_data"])
    modin_df2, pandas_df2 = create_test_dfs(test_data["int_data"])
    modin_df2 *= 10
    pandas_df2 *= 10

    if does_value_have_different_columns:
        new_columns = [f"{col}_new" for col in modin_df.columns]
        modin_df2.columns = new_columns
        pandas_df2.columns = new_columns

    modin_dfs = (modin_df, modin_df2)
    pandas_dfs = (pandas_df, pandas_df2)

    eval_general(modin_dfs, pandas_dfs, test, iloc=[0, 1, 2])
    eval_general(modin_dfs, pandas_dfs, test, iloc=[0, -1])
    eval_general(
        modin_dfs, pandas_dfs, test, iloc=slice(1, None)
    )  # (start=1, stop=None)
    eval_general(
        modin_dfs, pandas_dfs, test, iloc=slice(None, -2)
    )  # (start=None, stop=-2)
    eval_general(
        modin_dfs,
        pandas_dfs,
        test,
        iloc=[0, 1, 5, 6, 9, 10, -2, -1],
    )
    eval_general(
        modin_dfs,
        pandas_dfs,
        test,
        iloc=[5, 4, 0, 10, 1, -1],
    )
    eval_general(
        modin_dfs, pandas_dfs, test, iloc=slice(None, None, 2)
    )  # (start=None, stop=None, step=2)


def test___setitem__single_item_in_series():
    """Assign a one-element Series to the ``[:1]`` slice of a one-element Series."""
    # Test assigning a single item in a Series for issue
    # https://github.com/modin-project/modin/issues/3860
    modin_series = pd.Series(99)
    pandas_series = pandas.Series(99)
    modin_series[:1] = pd.Series(100)
    pandas_series[:1] = pandas.Series(100)
    df_equals(modin_series, pandas_series)


def test___setitem__assigning_single_categorical_sets_correct_dtypes():
    """Replace a one-row string column with a ``Categorical`` and compare frames."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3895
    modin_df = pd.DataFrame({"categories": ["A"]})
    modin_df["categories"] = pd.Categorical(["A"])
    pandas_df = pandas.DataFrame({"categories": ["A"]})
    pandas_df["categories"] = pandas.Categorical(["A"])
    df_equals(modin_df, pandas_df)


def test_iloc_assigning_scalar_none_to_string_frame():
    """Set the only cell of a ``dtype="string"`` frame to ``None`` via ``iloc``."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3981
    data = [["A"]]
    modin_df = pd.DataFrame(data, dtype="string")
    modin_df.iloc[0, 0] = None
    pandas_df = pandas.DataFrame(data, dtype="string")
    pandas_df.iloc[0, 0] = None
    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize(
    "value",
    [
        1,
        np.int32(1),
        1.0,
        "str val",
        pandas.Timestamp("1/4/2018"),
        np.datetime64(0, "ms"),
        True,
    ],
)
def test_loc_boolean_assignment_scalar_dtypes(value):
    """Assign a scalar to the rows selected by a boolean Series through ``loc``.

    The frame mixes int, float, string, mixed-object, datetime and bool columns.
    """
    modin_df, pandas_df = create_test_dfs(
        {
            "a": [1, 2, 3],
            "b": [3.0, 5.0, 6.0],
            "c": ["a", "b", "c"],
            "d": [1.0, "c", 2.0],
            "e": pandas.to_datetime(["1/1/2018", "1/2/2018", "1/3/2018"]),
            "f": [True, False, True],
        }
    )
    modin_idx, pandas_idx = pd.Series([False, True, True]), pandas.Series(
        [False, True, True]
    )

    modin_df.loc[modin_idx] = value
    pandas_df.loc[pandas_idx] = value
    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
    """Check that ``len()`` of the Modin frame equals that of the pandas frame."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    assert len(modin_df) == len(pandas_df)


def test_index_order():
    """Compare the index of ``all``/``any``/``count`` results under a random MultiIndex."""
    # see #1708 and #1869 for details
    df_modin, df_pandas = (
        pd.DataFrame(test_data["float_nan_data"]),
        pandas.DataFrame(test_data["float_nan_data"]),
    )
    rows_number = len(df_modin.index)
    # Two random integer levels in [0, 10); the same index is set on both frames.
    level_0 = np.random.choice([x for x in range(10)], rows_number)
    level_1 = np.random.choice([x for x in range(10)], rows_number)
    index = pandas.MultiIndex.from_arrays([level_0, level_1])

    df_modin.index = index
    df_pandas.index = index

    for func in ["all", "any", "count"]:
        df_equals(
            getattr(df_modin, func)().index,
            getattr(df_pandas, func)().index,
        )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("sortorder", [0, 3, 5])
def test_multiindex_from_frame(data, sortorder):
    """Compare ``MultiIndex.from_frame`` of Modin and pandas for a given ``sortorder``."""
    modin_df, pandas_df = create_test_dfs(data)

    def call_from_frame(df):
        """Call ``from_frame`` of the library that ``df`` belongs to."""
        if type(df).__module__.startswith("pandas"):
            return pandas.MultiIndex.from_frame(df, sortorder)
        else:
            return pd.MultiIndex.from_frame(df, sortorder)

    eval_general(modin_df, pandas_df, call_from_frame, comparator=assert_index_equal)


def test__getitem_bool_single_row_dataframe():
    """Index a one-row frame with a one-element boolean Series."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/4845
    eval_general(pd, pandas, lambda lib: lib.DataFrame([1])[lib.Series([True])])


def test__getitem_bool_with_empty_partition():
    """Apply a boolean mask to a frame after ``dropna`` removed its last row."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/5188

    size = MinRowPartitionSize.get()

    # Alternating False/True mask of length ``size``.
    pandas_series = pandas.Series([True if i % 2 else False for i in range(size)])
    modin_series = pd.Series(pandas_series)

    # ``size + 1`` rows; the extra last row is set to NaN so dropna() removes it.
    pandas_df = pandas.DataFrame([i for i in range(size + 1)])
    pandas_df.iloc[size] = np.nan
    modin_df = pd.DataFrame(pandas_df)

    pandas_tmp_result = pandas_df.dropna()
    modin_tmp_result = modin_df.dropna()

    eval_general(
        modin_tmp_result,
        pandas_tmp_result,
        lambda df: (
            df[modin_series] if isinstance(df, pd.DataFrame) else df[pandas_series]
        ),
    )


# This is a very subtle bug that comes from:
# https://github.com/modin-project/modin/issues/4945
def test_lazy_eval_index():
    """Filter, copy, then overwrite a column with an ``apply`` result."""
    modin_df, pandas_df = create_test_dfs({"col0": [0, 1]})

    def func(df):
        """Return a filtered copy of ``df`` with ``col0`` incremented by one."""
        df_copy = df[df["col0"] < 6].copy()
        # The problem here is that the index is not copied over so it needs
        # to get recomputed at some point. Our implementation of __setitem__
        # requires us to build a mask and insert the value from the right-hand
        # side into the new DataFrame. However, it's possible that we
        # won't have any new partitions, so we will end up computing an empty
        # index.
        df_copy["col0"] = df_copy["col0"].apply(lambda x: x + 1)
        return df_copy

    eval_general(modin_df, pandas_df, func)


def test_index_of_empty_frame():
    """Compare the index of empty frames, both user-created and query-produced."""
    # Test on an empty frame created by user
    md_df, pd_df = create_test_dfs(
        {}, index=pandas.Index([], name="index name"), columns=["a", "b"]
    )
    assert md_df.empty and pd_df.empty
    df_equals(md_df.index, pd_df.index)

    # Test on an empty frame produced by Modin's logic
    data = test_data_values[0]
    md_df, pd_df = create_test_dfs(
        data, index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name")
    )

    # The results are asserted to be empty below.
    md_res = md_df.query(f"{md_df.columns[0]} > {RAND_HIGH}")
    pd_res = pd_df.query(f"{pd_df.columns[0]} > {RAND_HIGH}")

    assert md_res.empty and pd_res.empty
    df_equals(md_res.index, pd_res.index)


# https://github.com/modin-project/modin/issues/7405
@pytest.mark.parametrize("indexer", ["loc", "iloc"])
def test_loc_and_iloc_set_order(indexer):
    """Read, write, then re-read through ``loc``/``iloc`` with unsorted row indices."""
    rng = np.random.default_rng(seed=0)
    is_loc = indexer == "loc"
    data = {"col": rng.integers(0, 100, size=100)}
    set_count = 20
    # Pick a bunch of unsorted row indices; may contain repeat values.
    row_indexer = rng.integers(0, 100, size=set_count)
    # ``loc`` addresses the column by label, ``iloc`` by position.
    col_indexer = "col" if is_loc else 0
    set_data = range(100, 100 + set_count)

    md_df, pd_df = create_test_dfs(data)

    def get_helper(df):
        """Read the selected cells with the indexer under test."""
        if is_loc:
            return df.loc[row_indexer, col_indexer]
        else:
            return df.iloc[row_indexer, col_indexer]

    # First, ensure loc/iloc read succeeds.
    eval_general(md_df, pd_df, get_helper)

    def set_helper(df):
        """Write ``set_data`` into the selected cells with the indexer under test."""
        if is_loc:
            df.loc[row_indexer, col_indexer] = set_data
        else:
            df.iloc[row_indexer, col_indexer] = set_data

    # Second, check results of loc/iloc write.
    eval_general(
        md_df,
        pd_df,
        set_helper,
        __inplace__=True,
    )
    # Finally, check the result of a loc/iloc read again.
    eval_general(md_df, pd_df, get_helper)


def test_iloc_set_negative_index():
    """Read, write, then re-read through ``iloc`` with possibly negative indices."""
    rng = np.random.default_rng(seed=0)
    row_count = 50
    col_count = 80
    data = {f"col_{i}": rng.integers(0, 100, size=row_count) for i in range(col_count)}
    row_set_count = 20
    col_set_count = 30
    # Pick a bunch of unsorted row indices; may contain repeat values and negative numbers.
    row_indexer = rng.integers(-row_count, row_count, size=row_set_count)
    col_indexer = rng.integers(-col_count, col_count, size=col_set_count)
    # One distinct value per (row, column) pair of the selection.
    set_data = np.reshape(
        range(100, 100 + row_set_count * col_set_count), (row_set_count, col_set_count)
    )

    md_df, pd_df = create_test_dfs(data)

    def get_helper(df):
        """Read the selected cells through ``iloc``."""
        return df.iloc[row_indexer, col_indexer]

    # First, ensure iloc read succeeds.
    eval_general(md_df, pd_df, get_helper)

    def set_helper(df):
        """Write ``set_data`` into the selected cells through ``iloc``."""
        df.iloc[row_indexer, col_indexer] = set_data

    # Second, check results of iloc write.
    eval_general(
        md_df,
        pd_df,
        set_helper,
        __inplace__=True,
    )
    # Finally, check the result of an iloc read again.
    eval_general(md_df, pd_df, get_helper)

================================================
FILE: modin/tests/pandas/dataframe/test_iter.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import io
import warnings

import matplotlib
import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.config import NPartitions
from modin.pandas.utils import SET_DATAFRAME_ATTRIBUTE_WARNING
from modin.tests.pandas.utils import (
    RAND_HIGH,
    RAND_LOW,
    create_test_dfs,
    df_equals,
    eval_general,
    random_state,
    test_data,
    test_data_keys,
    test_data_values,
)
from modin.tests.test_utils import (
    current_execution_is_native,
    warns_that_defaulting_to_pandas_if,
)

NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")


@pytest.mark.parametrize("method", ["items", "iterrows"])
def test_items_iterrows(method):
    """Iterate both frames with ``items``/``iterrows`` and compare pair by pair."""
    data = test_data["float_nan_data"]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    for modin_item, pandas_item in zip(
        getattr(modin_df, method)(), getattr(pandas_df, method)()
    ):
        modin_index, modin_series = modin_item
        pandas_index, pandas_series = pandas_item
        df_equals(pandas_series, modin_series)
        assert pandas_index == modin_index


@pytest.mark.parametrize("name", [None, "NotPandas"])
def test_itertuples_name(name):
    """Compare ``itertuples(name=...)`` rows of Modin and pandas."""
    data = test_data["float_nan_data"]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    modin_it_custom = modin_df.itertuples(name=name)
    pandas_it_custom = pandas_df.itertuples(name=name)
    for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom):
        np.testing.assert_equal(modin_row, pandas_row)


def test_itertuples_multiindex():
    """Compare ``itertuples()`` rows when the columns are a three-level MultiIndex."""
    data = test_data["int_data"]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    # One tuple label per existing column.
    new_idx = pd.MultiIndex.from_tuples(
        [(i // 4, i // 2, i) for i in range(len(modin_df.columns))]
    )
    modin_df.columns = new_idx
    pandas_df.columns = new_idx
    modin_it_custom = modin_df.itertuples()
    pandas_it_custom = pandas_df.itertuples()
    for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom):
        np.testing.assert_equal(modin_row, pandas_row)


def test___iter__():
    """Check that ``__iter__`` returns an iterator yielding what pandas yields."""
    modin_df = pd.DataFrame(test_data_values[0])
    pandas_df = pandas.DataFrame(test_data_values[0])

    modin_iterator = modin_df.__iter__()

    # Check that modin_iterator implements the iterator interface
    assert hasattr(modin_iterator, "__iter__")
    assert hasattr(modin_iterator, "next") or hasattr(modin_iterator, "__next__")

    pd_iterator = pandas_df.__iter__()
    assert list(modin_iterator) == list(pd_iterator)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___contains__(request, data):
    """Check ``__contains__`` and ``in`` for an absent key and the first column."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    result = False
    key = "Not Exist"
    assert result == modin_df.__contains__(key)
    assert result == (key in modin_df)

    # The positive check is skipped for test ids that contain "empty_data".
    if "empty_data" not in request.node.name:
        result = True
        key = pandas_df.columns[0]
        assert result == modin_df.__contains__(key)
        assert result == (key in modin_df)


@pytest.mark.parametrize("expand_frame_repr", [False, True])
@pytest.mark.parametrize(
    "max_rows_columns",
    [(5, 5), (10, 10), (50, 50), (51, 51), (52, 52), (75, 75), (None, None)],
)
@pytest.mark.parametrize("frame_size", [101, 102])
def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame_size):
    """Compare ``repr`` of a square random frame under matching display options."""
    frame_data = random_state.randint(
        RAND_LOW, RAND_HIGH, size=(frame_size, frame_size)
    )
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    # Flat (option, value, ...) sequence unpacked into option_context() below.
    context_arg = [
        "display.max_rows",
        max_rows_columns[0],
        "display.max_columns",
        max_rows_columns[1],
        "display.expand_frame_repr",
        expand_frame_repr,
    ]
    with pd.option_context(*context_arg):
        modin_df_repr = repr(modin_df)
    with pandas.option_context(*context_arg):
        pandas_df_repr = repr(pandas_df)
    assert modin_df_repr == pandas_df_repr


def test___finalize__():
    """Call ``__finalize__(None)`` under the defaulting-to-pandas warning check."""
    data = test_data_values[0]
    # NOTE: __finalize__() defaults to pandas at the API layer.
    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
        pd.DataFrame(data).__finalize__(None)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___copy__(data):
    """Compare the results of ``__copy__`` on the Modin and pandas frames."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_df_copy, pandas_df_copy = modin_df.__copy__(), pandas_df.__copy__()
    df_equals(modin_df_copy, pandas_df_copy)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___deepcopy__(data):
    """Compare the results of ``__deepcopy__`` on the Modin and pandas frames."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_df_copy, pandas_df_copy = (
        modin_df.__deepcopy__(),
        pandas_df.__deepcopy__(),
    )
    df_equals(modin_df_copy, pandas_df_copy)


def test___repr__():
    """Compare ``repr`` of Modin and pandas frames for a range of shapes."""
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # The __repr__ method has a different code path depending on
    # whether the number of rows is >60; and a different code path
    # depending on whether the number of columns is >20.
    # Previous test cases already check the case when cols>20
    # and rows>60. The cases that follow exercise the other three
    # combinations.
    # rows <= 60, cols > 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows <= 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows > 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # Empty
    pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)])
    modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)])

    assert repr(pandas_df) == repr(modin_df)

    # From Issue #1705
    # NOTE(review): the CSV records below are separated by single spaces in this
    # view; presumably the literal is newline-separated upstream -- confirm.
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf" "2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6 "2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    pandas_df = pandas.read_csv(io.StringIO(string_data))
    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
        modin_df = pd.read_csv(io.StringIO(string_data))
    assert repr(pandas_df) == repr(modin_df)


def test___repr__does_not_raise_attribute_column_warning():
    """Run ``repr`` with the set-attribute warning escalated to an error."""
    # See https://github.com/modin-project/modin/issues/5380
    df = pd.DataFrame([1])
    with warnings.catch_warnings():
        warnings.filterwarnings(action="error", message=SET_DATAFRAME_ATTRIBUTE_WARNING)
        repr(df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_inplace_series_ops(data):
    """Apply in-place ``dropna``/``fillna`` on column selections and compare frames."""
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    # NOTE(review): both frames are built from the same ``data``; if they expose
    # the same number of columns this condition is false and the body below
    # never runs -- confirm the intended guard.
    if len(modin_df.columns) > len(pandas_df.columns):
        col0 = modin_df.columns[0]
        col1 = modin_df.columns[1]
        pandas_df[col1].dropna(inplace=True)
        modin_df[col1].dropna(inplace=True)
        df_equals(modin_df, pandas_df)

        pandas_df[col0].fillna(0, inplace=True)
        modin_df[col0].fillna(0, inplace=True)
        df_equals(modin_df, pandas_df)


# Note: Tests setting an attribute that is not an existing column label
def test___setattr__not_column():
    """Set an attribute whose name is not a column and read it back."""
    pandas_df = pandas.DataFrame([1, 2, 3])
    modin_df = pd.DataFrame([1, 2, 3])

    pandas_df.new_col = [4, 5, 6]
    modin_df.new_col = [4, 5, 6]

    df_equals(modin_df, pandas_df)

    # While `new_col` is not a column of the dataframe,
    # it should be accessible with __getattr__.
    assert modin_df.new_col == pandas_df.new_col


def test___setattr__mutating_column():
    """Mutate an existing column through attribute assignment.

    Also checks that assigning to a non-column name warns on first use and
    creates a plain attribute rather than a column.
    """
    # Use case from issue #4577
    pandas_df = pandas.DataFrame([[1]], columns=["col0"])
    modin_df = pd.DataFrame([[1]], columns=["col0"])

    # Replacing a column with a list should mutate the column in place.
    pandas_df.col0 = [3]
    modin_df.col0 = [3]

    df_equals(modin_df, pandas_df)
    # Check that the col0 attribute reflects the value update.
    df_equals(modin_df.col0, pandas_df.col0)

    pandas_df.col0 = pandas.Series([5])
    modin_df.col0 = pd.Series([5])

    # Check that the col0 attribute reflects this update
    df_equals(modin_df, pandas_df)

    pandas_df.loc[0, "col0"] = 4
    modin_df.loc[0, "col0"] = 4

    # Check that the col0 attribute reflects update via loc
    df_equals(modin_df, pandas_df)
    assert modin_df.col0.equals(modin_df["col0"])

    # Check that attempting to add a new col via attributes raises warning
    # and adds the provided list as a new attribute and not a column.
    with pytest.warns(
        UserWarning,
        match=SET_DATAFRAME_ATTRIBUTE_WARNING,
    ):
        modin_df.col1 = [4]

    # With the warning escalated to an error, none of these assignments
    # may emit it.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            action="error",
            message=SET_DATAFRAME_ATTRIBUTE_WARNING,
        )
        modin_df.col1 = [5]
        modin_df.new_attr = 6
        modin_df.col0 = 7

    assert "new_attr" in dir(
        modin_df
    ), "Modin attribute was not correctly added to the df."
    # NOTE(review): the message below reads as the opposite of the condition
    # checked (the attribute must NOT be among the columns).
    assert (
        "new_attr" not in modin_df
    ), "New attribute was not correctly added to columns."
    assert modin_df.new_attr == 6, "Modin attribute value was set incorrectly."
    assert isinstance(
        modin_df.col0, pd.Series
    ), "Scalar was not broadcasted properly to an existing column."


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_isin(data):
    """Compare ``isin`` with a list of integers between Modin and pandas."""
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    val = [1, 2, 3, 4]
    pandas_result = pandas_df.isin(val)
    modin_result = modin_df.isin(val)

    df_equals(modin_result, pandas_result)


def test_isin_with_modin_objects():
    """Compare ``isin`` when the values argument is itself a Series or DataFrame."""
    modin_df1, pandas_df1 = create_test_dfs({"a": [1, 2], "b": [3, 4]})
    modin_series, pandas_series = pd.Series([1, 4, 5, 6]), pandas.Series([1, 4, 5, 6])

    eval_general(
        (modin_df1, modin_series),
        (pandas_df1, pandas_series),
        lambda srs: srs[0].isin(srs[1]),
    )

    modin_df2 = modin_series.to_frame("a")
    pandas_df2 = pandas_series.to_frame("a")

    eval_general(
        (modin_df1, modin_df2),
        (pandas_df1, pandas_df2),
        lambda srs: srs[0].isin(srs[1]),
    )

    # Check case when indices are not matching
    modin_df1, pandas_df1 = create_test_dfs({"a": [1, 2], "b": [3, 4]}, index=[10, 11])

    eval_general(
        (modin_df1, modin_series),
        (pandas_df1, pandas_series),
        lambda srs: srs[0].isin(srs[1]),
    )
    eval_general(
        (modin_df1, modin_df2),
        (pandas_df1, pandas_df2),
        lambda srs: srs[0].isin(srs[1]),
    )

================================================
FILE: modin/tests/pandas/dataframe/test_join_sort.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import warnings import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import Engine, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, axis_keys, axis_values, bool_arg_keys, bool_arg_values, create_test_dfs, default_to_pandas_ignore_string, df_equals, eval_general, generate_multiindex, random_state, rotate_decimal_digits_or_symbols, test_data, test_data_keys, test_data_values, ) from modin.tests.test_utils import ( df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)

# Initialize env for storage format detection in @pytest.mark.*
pd.DataFrame()


def df_equals_and_sort(df1, df2):
    """Sort dataframe's rows and run ``df_equals()`` for them."""
    # Sorting by every column with ``ignore_index=True`` makes the comparison
    # independent of the row order and of the original index labels.
    df1 = df1.sort_values(by=df1.columns.tolist(), ignore_index=True)
    df2 = df2.sort_values(by=df2.columns.tolist(), ignore_index=True)
    df_equals(df1, df2)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_combine(data):
    """Call ``combine`` with a column-picking function on Modin and pandas frames."""
    pandas_df = pandas.DataFrame(data)

    modin_df = pd.DataFrame(data)

    # NOTE(review): neither result is stored or compared, so this only checks
    # that both calls complete -- confirm whether a df_equals is intended.
    modin_df.combine(modin_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2)
    pandas_df.combine(
        pandas_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2
    )


@pytest.mark.parametrize(
    "test_data, test_data2",
    [
        (
            np.random.randint(0, 100, size=(64, 64)),
            np.random.randint(0, 100, size=(128, 64)),
        ),
        (
            np.random.randint(0, 100, size=(128, 64)),
            np.random.randint(0, 100, size=(64, 64)),
        ),
        (
            np.random.randint(0, 100, size=(64, 64)),
            np.random.randint(0, 100, size=(64, 128)),
        ),
        (
            np.random.randint(0, 100, size=(64, 128)),
            np.random.randint(0, 100, size=(64, 64)),
        ),
    ],
)
def test_join(test_data, test_data2):
    """Compare ``DataFrame.join`` between Modin and pandas.

    Covers joins on a column for every ``how``, plain index joins with a
    one-row frame, and joins against a list of frames.
    """
    # All four frames get "col<i>" labels and a 1-based index named "key".
    modin_df = pd.DataFrame(
        test_data,
        columns=["col{}".format(i) for i in range(test_data.shape[1])],
        index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"),
    )
    pandas_df = pandas.DataFrame(
        test_data,
        columns=["col{}".format(i) for i in range(test_data.shape[1])],
        index=pandas.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"),
    )
    modin_df2 = pd.DataFrame(
        test_data2,
        columns=["col{}".format(i) for i in range(test_data2.shape[1])],
        index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"),
    )
    pandas_df2 = pandas.DataFrame(
        test_data2,
        columns=["col{}".format(i) for i in range(test_data2.shape[1])],
        index=pandas.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"),
    )

    hows = ["inner", "left", "right", "outer"]
    ons = ["col33", "col34"]
    sorts = [False, True]
    # ``ons`` and ``sorts`` are walked in lockstep with the same index ``j``.
    assert len(ons) == len(sorts), "the loop below is designed for this condition"
    for i in range(len(hows)):
        for j in range(len(ons)):
            modin_result = modin_df.join(
                modin_df2,
                how=hows[i],
                on=ons[j],
                sort=sorts[j],
                lsuffix="_caller",
                rsuffix="_other",
            )
            pandas_result = pandas_df.join(
                pandas_df2,
                how=hows[i],
                on=ons[j],
                sort=sorts[j],
                lsuffix="_caller",
                rsuffix="_other",
            )
            if sorts[j]:
                # sorting in `join` is implemented through range partitioning technique
                # therefore the order of the rows after it does not match the pandas,
                # so additional sorting is needed in order to get the same result as for pandas
                df_equals_and_sort(modin_result, pandas_result)
            else:
                df_equals(modin_result, pandas_result)

    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }

    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col5": [0], "col6": [1]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["left", "right", "outer", "inner"]
    for how in join_types:
        modin_join = modin_df.join(modin_df2, how=how)
        pandas_join = pandas_df.join(pandas_df2, how=how)
        df_equals(modin_join, pandas_join)

    frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]}

    modin_df3 = pd.DataFrame(frame_data3)
    pandas_df3 = pandas.DataFrame(frame_data3)

    # Joining against a list of frames; "right" is not exercised here.
    join_types = ["left", "outer", "inner"]
    for how in join_types:
        modin_join = modin_df.join([modin_df2, modin_df3], how=how)
        pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how)
        df_equals(modin_join, pandas_join)


@pytest.mark.parametrize("how", ["left", "inner", "right"])
def test_join_empty(how):
    """Join a frame with a zero-row slice of itself on column ``1``."""
    data = np.random.randint(0, 100, size=(64, 64))
    eval_general(
        *create_test_dfs(data),
        lambda df: df.join(df.iloc[:0], on=1, how=how, lsuffix="_caller"),
    )


def test_join_cross_6786():
    """Cross-join a frame with a re-indexed single-column slice of itself."""
    data = [[7, 8, 9], [10, 11, 12]]
    modin_df, pandas_df = create_test_dfs(data, columns=["x", "y", "z"])
    modin_join = modin_df.join(
        modin_df[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p"
    )
    pandas_join = pandas_df.join(
        pandas_df[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p"
    )
    df_equals(modin_join, pandas_join)


def test_join_5203():
    """Check that joining a list of frames with ``on=`` raises ``ValueError``."""
    data = np.ones([2, 4])
    kwargs = {"columns": ["a", "b", "c", "d"]}
    modin_dfs, pandas_dfs = [None] * 3, [None] * 3
    for idx in range(len(modin_dfs)):
        modin_dfs[idx], pandas_dfs[idx] = create_test_dfs(data, **kwargs)

    # The same error is expected from both the Modin and the pandas frames.
    for dfs in (modin_dfs, pandas_dfs):
        with pytest.raises(
            ValueError,
            match="Joining multiple DataFrames only supported for joining on index",
        ):
            dfs[0].join([dfs[1], dfs[2]], how="inner", on="a")


def test_join_6602():
    """Join a frame with a renamed Series while a specific warning is an error."""
    abbreviations = pd.Series(
        ["Major League Baseball", "National Basketball Association"],
        index=["MLB", "NBA"],
    )
    teams = pd.DataFrame(
        {
            "name": ["Mariners", "Lakers"] * 50,
            "league_abbreviation": ["MLB", "NBA"] * 50,
        }
    )

    with warnings.catch_warnings():
        # check that join doesn't show UserWarning
        warnings.filterwarnings(
            "error", "Distributing object", category=UserWarning
        )
        teams.set_index("league_abbreviation").join(abbreviations.rename("league_name"))


@pytest.mark.parametrize(
    "test_data, test_data2",
    [
        (
            np.random.randint(0, 100, size=(64, 64)),
            np.random.randint(0, 100, size=(128, 64)),
        ),
        (
            np.random.randint(0, 100, size=(128, 64)),
            np.random.randint(0, 100, size=(64, 64)),
        ),
        (
            np.random.randint(0, 100, size=(64, 64)),
            np.random.randint(0, 100, size=(64, 128)),
        ),
        (
            np.random.randint(0, 100, size=(64, 128)),
            np.random.randint(0, 100, size=(64, 64)),
        ),
    ],
)
def test_merge(test_data, test_data2):
    """Compare ``DataFrame.merge`` between Modin and pandas.

    Covers ``on``/``left_on``/``right_on``/index combinations, merging with a
    named and an unnamed Series, and a non-frame argument. Row order is
    normalised with ``df_equals_and_sort`` before comparing.
    """
    # All four frames get "col<i>" labels and a 1-based index named "key".
    modin_df = pd.DataFrame(
        test_data,
        columns=["col{}".format(i) for i in range(test_data.shape[1])],
        index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"),
    )
    pandas_df = pandas.DataFrame(
        test_data,
        columns=["col{}".format(i) for i in range(test_data.shape[1])],
        index=pandas.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"),
    )
    modin_df2 = pd.DataFrame(
        test_data2,
        columns=["col{}".format(i) for i in range(test_data2.shape[1])],
        index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"),
    )
    pandas_df2 = pandas.DataFrame(
        test_data2,
        columns=["col{}".format(i) for i in range(test_data2.shape[1])],
        index=pandas.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"),
    )

    hows = ["left", "inner", "right"]
    ons = ["col33", ["col33", "col34"]]
    sorts = [False, True]
    # ``ons`` and ``sorts`` are walked in lockstep with the same index ``j``.
    assert len(ons) == len(sorts), "the loop below is designed for this condition"
    for i in range(len(hows)):
        for j in range(len(ons)):
            modin_result = modin_df.merge(
                modin_df2, how=hows[i], on=ons[j], sort=sorts[j]
            )
            pandas_result = pandas_df.merge(
                pandas_df2, how=hows[i], on=ons[j], sort=sorts[j]
            )
            # FIXME: https://github.com/modin-project/modin/issues/2246
            df_equals_and_sort(modin_result, pandas_result)

            modin_result = modin_df.merge(
                modin_df2,
                how=hows[i],
                left_on="key",
                right_on="key",
                sort=sorts[j],
            )
            pandas_result = pandas_df.merge(
                pandas_df2,
                how=hows[i],
                left_on="key",
                right_on="key",
                sort=sorts[j],
            )
            # FIXME: https://github.com/modin-project/modin/issues/2246
            df_equals_and_sort(modin_result, pandas_result)

    # Test for issue #1771
    modin_df = pd.DataFrame({"name": np.arange(40)})
    modin_df2 = pd.DataFrame({"name": [39], "position": [0]})
    pandas_df = pandas.DataFrame({"name": np.arange(40)})
    pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]})
    modin_result = modin_df.merge(modin_df2, on="name", how="inner")
    pandas_result = pandas_df.merge(pandas_df2, on="name", how="inner")
    # FIXME: https://github.com/modin-project/modin/issues/2246
    df_equals_and_sort(modin_result, pandas_result)

    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }

    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        # Defaults
        modin_result = modin_df.merge(modin_df2, how=how)
        pandas_result = pandas_df.merge(pandas_df2, how=how)
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

        # left_on and right_index
        modin_result = modin_df.merge(
            modin_df2, how=how, left_on="col1", right_index=True
        )
        pandas_result = pandas_df.merge(
            pandas_df2, how=how, left_on="col1", right_index=True
        )
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

        # left_index and right_on
        modin_result = modin_df.merge(
            modin_df2, how=how, left_index=True, right_on="col1"
        )
        pandas_result = pandas_df.merge(
            pandas_df2, how=how, left_index=True, right_on="col1"
        )
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

        # left_on and right_on col1
        modin_result = modin_df.merge(
            modin_df2, how=how, left_on="col1", right_on="col1"
        )
        pandas_result = pandas_df.merge(
            pandas_df2, how=how, left_on="col1", right_on="col1"
        )
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

        # left_on and right_on col2
        modin_result = modin_df.merge(
            modin_df2, how=how, left_on="col2", right_on="col2"
        )
        pandas_result = pandas_df.merge(
            pandas_df2, how=how, left_on="col2", right_on="col2"
        )
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

        # left_index and right_index
        modin_result = modin_df.merge(
            modin_df2, how=how, left_index=True, right_index=True
        )
        pandas_result = pandas_df.merge(
            pandas_df2, how=how, left_index=True, right_index=True
        )
        # FIXME: https://github.com/modin-project/modin/issues/2246
        df_equals_and_sort(modin_result, pandas_result)

    # Cannot merge a Series without a name
    ps = pandas.Series(frame_data2.get("col1"))
    ms = pd.Series(frame_data2.get("col1"))
    eval_general(
        modin_df,
        pandas_df,
        lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
        # FIXME: https://github.com/modin-project/modin/issues/2246
        comparator=df_equals_and_sort,
        expected_exception=ValueError("Cannot merge a Series without a name"),
    )

    # merge a Series with a name
    ps = pandas.Series(frame_data2.get("col1"), name="col1")
    ms = pd.Series(frame_data2.get("col1"), name="col1")
    eval_general(
        modin_df,
        pandas_df,
        lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps),
        # FIXME: https://github.com/modin-project/modin/issues/2246
        comparator=df_equals_and_sort,
    )

    # A non-frame, non-Series argument must be rejected.
    with pytest.raises(TypeError):
        modin_df.merge("Non-valid type")


@pytest.mark.parametrize("how", ["left", "inner", "right"])
def test_merge_empty(how):
    """Merge a frame with a zero-row slice of itself."""
    data = np.random.randint(0, 100, size=(64, 64))
    eval_general(*create_test_dfs(data), lambda df: df.merge(df.iloc[:0], how=how))


def test_merge_with_mi_columns():
    """Merge two frames with tuple column labels on a single tuple key."""
    modin_df1, pandas_df1 = create_test_dfs(
        {
            ("col0", "a"): [1, 2, 3, 4],
            ("col0", "b"): [2, 3, 4, 5],
            ("col1", "a"): [3, 4, 5, 6],
        }
    )

    modin_df2, pandas_df2 = create_test_dfs(
        {
            ("col0", "a"): [1, 2, 3, 4],
            ("col0", "c"): [2, 3, 4, 5],
            ("col1", "a"): [3, 4, 5, 6],
        }
    )

    eval_general(
        (modin_df1, modin_df2),
        (pandas_df1, pandas_df2),
        lambda dfs: dfs[0].merge(dfs[1], on=[("col0", "a")]),
    )


# NOTE(review): the definition below runs past the end of this view; the
# visible part is carried over unchanged.
@pytest.mark.parametrize("has_index_cache", [True, False])
def test_merge_on_index(has_index_cache):
    modin_df1, pandas_df1 = create_test_dfs(
        {
            "idx_key1": [1, 2, 3, 4],
            "idx_key2": [2, 3, 4, 5],
            "idx_key3": [3, 4, 5, 6],
            "data_col1": [10, 2, 3, 4],
            "col_key1": [3, 4, 5, 6],
            "col_key2": [3, 4, 5, 6],
        }
    )

    modin_df1 = modin_df1.set_index(["idx_key1", "idx_key2"])
    pandas_df1 = pandas_df1.set_index(["idx_key1", "idx_key2"])

    modin_df2, pandas_df2 = create_test_dfs(
        {
            "idx_key1": [4, 3, 2, 1],
            "idx_key2": [5, 4, 3, 2],
            "idx_key3": [6, 5, 4, 3],
            "data_col2": [10, 2, 3, 4],
            "col_key1": [6, 5, 4, 3],
            "col_key2": [6, 5, 4, 3],
        }
    )

    modin_df2 = modin_df2.set_index(["idx_key2", "idx_key3"])
    pandas_df2 = pandas_df2.set_index(["idx_key2", "idx_key3"])

    def setup_cache():
        if has_index_cache:
modin_df1.index # triggering index materialization modin_df2.index assert modin_df1._query_compiler.frame_has_index_cache assert modin_df2._query_compiler.frame_has_index_cache else: # Propagate deferred indices to partitions # The change in index is not automatically handled by Modin. See #3941. modin_df1.index = modin_df1.index modin_df1._to_pandas() modin_df1._query_compiler.set_frame_index_cache(None) modin_df2.index = modin_df2.index modin_df2._to_pandas() modin_df2._query_compiler.set_frame_index_cache(None) for on in ( ["col_key1", "idx_key1"], ["col_key1", "idx_key2"], ["col_key1", "idx_key3"], ["idx_key1"], ["idx_key2"], ["idx_key3"], ): setup_cache() eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda dfs: dfs[0].merge(dfs[1], on=on), ) for left_on, right_on in ( (["idx_key1"], ["col_key1"]), (["col_key1"], ["idx_key3"]), (["idx_key1"], ["idx_key3"]), (["idx_key2"], ["idx_key2"]), (["col_key1", "idx_key2"], ["col_key2", "idx_key2"]), ): setup_cache() eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda dfs: dfs[0].merge(dfs[1], left_on=left_on, right_on=right_on), ) @pytest.mark.parametrize( "left_index", [[], ["key"], ["key", "b"], ["key", "b", "c"], ["b"], ["b", "c"]] ) @pytest.mark.parametrize( "right_index", [[], ["key"], ["key", "e"], ["key", "e", "f"], ["e"], ["e", "f"]] ) def test_merge_on_single_index(left_index, right_index): """ Test ``.merge()`` method when merging on a single column, that is located in an index level of one of the frames. 
""" modin_df1, pandas_df1 = create_test_dfs( {"b": [3, 4, 4, 5], "key": [1, 1, 2, 2], "c": [2, 3, 2, 2], "d": [2, 1, 3, 1]} ) if len(left_index): modin_df1 = modin_df1.set_index(left_index) pandas_df1 = pandas_df1.set_index(left_index) modin_df2, pandas_df2 = create_test_dfs( {"e": [3, 4, 4, 5], "f": [2, 3, 2, 2], "key": [1, 1, 2, 2], "h": [2, 1, 3, 1]} ) if len(right_index): modin_df2 = modin_df2.set_index(right_index) pandas_df2 = pandas_df2.set_index(right_index) eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda dfs: dfs[0].merge(dfs[1], on="key"), ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("ascending", [False, True]) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) def test_sort_index(axis, ascending, na_position): data = test_data["float_nan_data"] modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) # Change index value so sorting will actually make a difference if axis == 0: length = len(modin_df.index) for df in [modin_df, pandas_df]: df.index = [(i - length / 2) % length for i in range(length)] dfs = [modin_df, pandas_df] # Add NaNs to sorted index for idx in range(len(dfs)): sort_index = dfs[idx].axes[axis] dfs[idx] = dfs[idx].set_axis( [np.nan if i % 2 == 0 else sort_index[i] for i in range(len(sort_index))], axis=axis, copy=False, ) modin_df, pandas_df = dfs eval_general( modin_df, pandas_df, lambda df: df.sort_index( axis=axis, ascending=ascending, na_position=na_position ), ) @pytest.mark.parametrize("axis", ["rows", "columns"]) def test_sort_index_inplace(axis): data = test_data["int_data"] modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) for df in [modin_df, pandas_df]: df.sort_index(axis=axis, inplace=True) df_equals(modin_df, pandas_df) @pytest.mark.parametrize( "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) ) def test_sort_multiindex(sort_remaining): data = test_data["int_data"] modin_df, pandas_df = 
pd.DataFrame(data), pandas.DataFrame(data) for index in ["index", "columns"]: new_index = generate_multiindex(len(getattr(modin_df, index))) for df in [modin_df, pandas_df]: setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "by", [ pytest.param( "first", marks=pytest.mark.exclude_by_default, ), pytest.param( "first,last", marks=pytest.mark.exclude_by_default, ), "first,last,middle", ], ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( "ascending", [False, True] + ["list_first_True", "list_first_False"], ids=arg_keys( "ascending", ["False", "True"] + ["list_first_True", "list_first_False"] ), ) @pytest.mark.parametrize( "inplace", bool_arg_values, ids=arg_keys("inplace", bool_arg_keys) ) @pytest.mark.parametrize( "kind", [ pytest.param( "mergesort", marks=pytest.mark.exclude_by_default, ), "quicksort", pytest.param( "heapsort", marks=pytest.mark.exclude_by_default, ), ], ) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) @pytest.mark.parametrize( "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys), ) @pytest.mark.parametrize("key", [None, rotate_decimal_digits_or_symbols]) def test_sort_values( data, by, axis, ascending, inplace, kind, na_position, ignore_index, key ): if ascending is None: pytest.skip("None is not a valid value for ascending.") if (axis == 1 or axis == "columns") and ignore_index: pytest.skip("Pandas bug #39426 which is fixed in Pandas 1.3") if ascending is None and key is not None: pytest.skip("Pandas bug #41318") if "multiindex" in by: index = 
generate_multiindex(len(data[list(data.keys())[0]]), nlevels=2) columns = generate_multiindex(len(data.keys()), nlevels=2) data = {columns[ind]: data[key] for ind, key in enumerate(data)} else: index = None columns = None modin_df = pd.DataFrame(data, index=index, columns=columns) pandas_df = pandas.DataFrame(data, index=index, columns=columns) index = modin_df.index if axis == 1 or axis == "columns" else modin_df.columns # Parse "by" spec by_list = [] for b in by.split(","): if b == "first": by_list.append(index[0]) elif b == "last": by_list.append(index[-1]) elif b == "middle": by_list.append(index[len(index) // 2]) elif b.startswith("multiindex_level"): by_list.append(index.names[int(b[len("multiindex_level") :])]) else: raise Exception('Unknown "by" specifier:' + b) # Create "ascending" list if ascending in ["list_first_True", "list_first_False"]: start = 0 if ascending == "list_first_False" else 1 ascending = [i & 1 > 0 for i in range(start, len(by_list) + start)] eval_general( modin_df, pandas_df, lambda df: df.sort_values( by_list, axis=axis, ascending=ascending, inplace=inplace, kind=kind, na_position=na_position, ignore_index=ignore_index, key=key, ), __inplace__=inplace, ) def test_sort_values_descending_with_only_two_bins(): # test case from https://github.com/modin-project/modin/issues/5781 part1 = pd.DataFrame({"a": [1, 2, 3, 4]}) part2 = pd.DataFrame({"a": [5, 6, 7, 8]}) modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() if StorageFormat.get() == "Pandas": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.sort_values(by="a", ascending=False) ) @pytest.mark.parametrize("ignore_index", [True, False]) def test_sort_values_preserve_index_names(ignore_index): modin_df, pandas_df = create_test_dfs( np.random.choice(128, 128, replace=False).reshape((128, 1)) ) pandas_df.index.names, pandas_df.columns.names = ["custom_name"], ["custom_name"] modin_df.index.names, 
modin_df.columns.names = ["custom_name"], ["custom_name"] # workaround for #1618 to actually propagate index change modin_df.index = modin_df.index modin_df.columns = modin_df.columns def comparator(df1, df2): assert df1.index.names == df2.index.names assert df1.columns.names == df2.columns.names df_equals(df1, df2) eval_general( modin_df, pandas_df, lambda df: df.sort_values(df.columns[0], ignore_index=ignore_index), comparator=comparator, ) @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_with_one_partition(ascending): # Test case from https://github.com/modin-project/modin/issues/5859 modin_df, pandas_df = create_test_dfs( np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) if StorageFormat.get() == "Pandas": assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( modin_df, pandas_df, lambda df: df.sort_values(by=1, ascending=ascending) ) def test_sort_overpartitioned_df(): # First we test when the final df will have only 1 row and column partition. data = [[4, 5, 6], [1, 2, 3]] modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index(drop=True) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) # Next we test when the final df will only have 1 row, but starts with multiple column # partitions. data = [list(range(100)), list(range(100, 200))] modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index(drop=True) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) # Next we test when the final df will have multiple row partitions. 
data = np.random.choice(650, 650, replace=False).reshape((65, 10)) modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index(drop=True) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) old_nptns = NPartitions.get() NPartitions.put(24) try: # Next we test when there's only one row per partition. data = np.random.choice(650, 650, replace=False).reshape((65, 10)) modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index( drop=True ) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) # And again, when there's more than one column partition. data = np.random.choice(6500, 6500, replace=False).reshape((65, 100)) modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index( drop=True ) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) # Additionally, we should test when we have a number of partitions # that doesn't divide cleanly into our desired number of partitions. # In this case, we start with 17 partitions, and want 2. 
NPartitions.put(21) data = np.random.choice(6500, 6500, replace=False).reshape((65, 100)) modin_df = pd.concat([pd.DataFrame(row).T for row in data]).reset_index( drop=True ) pandas_df = pandas.DataFrame(data) eval_general(modin_df, pandas_df, lambda df: df.sort_values(by=0)) finally: NPartitions.put(old_nptns) def test_sort_values_with_duplicates(): modin_df = pd.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) key = modin_df.columns[0] modin_result = modin_df.sort_values(key, inplace=False) pandas_result = pandas_df.sort_values(key, inplace=False) df_equals(modin_result, pandas_result) modin_df.sort_values(key, inplace=True) pandas_df.sort_values(key, inplace=True) df_equals(modin_df, pandas_df) def test_sort_values_with_string_index(): modin_df = pd.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) pandas_df = pandas.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) key = modin_df.columns[0] modin_result = modin_df.sort_values(key, inplace=False) pandas_result = pandas_df.sort_values(key, inplace=False) df_equals(modin_result, pandas_result) modin_df.sort_values(key, inplace=True) pandas_df.sort_values(key, inplace=True) df_equals(modin_df, pandas_df) @pytest.mark.skipif( StorageFormat.get() != "Pandas", reason="We only need to test this case where sort does not default to pandas.", ) @pytest.mark.parametrize("ascending", [True, False], ids=["True", "False"]) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_position): pandas_df = pandas.DataFrame( np.random.rand(1000, 100), columns=[f"col {i}" for i in range(100)] ) # Need to ensure that one of the partitions has all NA values except for one row pandas_df.iloc[340:] = np.nan pandas_df.iloc[-1] = -4.0 modin_df = pd.DataFrame(pandas_df) eval_general( modin_df, pandas_df, lambda df: df.sort_values( "col 3", 
ascending=ascending, na_position=na_position ), ) @pytest.mark.skipif( Engine.get() not in ("Ray", "Unidist", "Dask"), reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): modin_df = pd.DataFrame( np.random.rand(1000, 100), columns=[f"col {i}" for i in range(100)] ) sort_key = modin_df.columns[modin_df._query_compiler._modin_frame.column_widths[0]] eval_general(modin_df, modin_df._to_pandas(), lambda df: df.sort_values(sort_key)) def test_where(): columns = list("abcdefghij") frame_data = random_state.randn(100, 10) modin_df, pandas_df = create_test_dfs(frame_data, columns=columns) pandas_cond_df = pandas_df % 5 < 2 modin_cond_df = modin_df % 5 < 2 pandas_result = pandas_df.where(pandas_cond_df, -pandas_df) modin_result = modin_df.where(modin_cond_df, -modin_df) assert all((to_pandas(modin_result) == pandas_result).all()) # test case when other is Series other_data = random_state.randn(len(pandas_df)) modin_other, pandas_other = pd.Series(other_data), pandas.Series(other_data) pandas_result = pandas_df.where(pandas_cond_df, pandas_other, axis=0) modin_result = modin_df.where(modin_cond_df, modin_other, axis=0) df_equals(modin_result, pandas_result) # Test that we choose the right values to replace when `other` == `True` # everywhere. 
other_data = np.full(shape=pandas_df.shape, fill_value=True) modin_other, pandas_other = create_test_dfs(other_data, columns=columns) pandas_result = pandas_df.where(pandas_cond_df, pandas_other) modin_result = modin_df.where(modin_cond_df, modin_other) df_equals(modin_result, pandas_result) other = pandas_df.loc[3] pandas_result = pandas_df.where(pandas_cond_df, other, axis=1) modin_result = modin_df.where(modin_cond_df, other, axis=1) assert all((to_pandas(modin_result) == pandas_result).all()) other = pandas_df["e"] pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) modin_result = modin_df.where(modin_cond_df, other, axis=0) assert all((to_pandas(modin_result) == pandas_result).all()) pandas_result = pandas_df.where(pandas_df < 2, True) modin_result = modin_df.where(modin_df < 2, True) assert all((to_pandas(modin_result) == pandas_result).all()) def test_where_different_axis_order(): # Test `where` when `cond`, `df`, and `other` each have columns and index # in different orders. 
data = test_data["float_nan_data"] pandas_df = pandas.DataFrame(data) pandas_cond_df = pandas_df % 5 < 2 pandas_cond_df = pandas_cond_df.reindex( columns=pandas_df.columns[::-1], index=pandas_df.index[::-1] ) pandas_other_df = -pandas_df pandas_other_df = pandas_other_df.reindex( columns=pandas_df.columns[-1:].append(pandas_df.columns[:-1]), index=pandas_df.index[-1:].append(pandas_df.index[:-1]), ) modin_df = pd.DataFrame(pandas_df) modin_cond_df = pd.DataFrame(pandas_cond_df) modin_other_df = pd.DataFrame(pandas_other_df) pandas_result = pandas_df.where(pandas_cond_df, pandas_other_df) modin_result = modin_df.where(modin_cond_df, modin_other_df) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("align_axis", ["index", "columns"]) @pytest.mark.parametrize("keep_shape", [False, True]) @pytest.mark.parametrize("keep_equal", [False, True]) def test_compare(align_axis, keep_shape, keep_equal): kwargs = { "align_axis": align_axis, "keep_shape": keep_shape, "keep_equal": keep_equal, } frame_data1 = random_state.randn(100, 10) frame_data2 = random_state.randn(100, 10) pandas_df = pandas.DataFrame(frame_data1, columns=list("abcdefghij")) pandas_df2 = pandas.DataFrame(frame_data2, columns=list("abcdefghij")) modin_df = pd.DataFrame(frame_data1, columns=list("abcdefghij")) modin_df2 = pd.DataFrame(frame_data2, columns=list("abcdefghij")) modin_result = modin_df.compare(modin_df2, **kwargs) pandas_result = pandas_df.compare(pandas_df2, **kwargs) assert to_pandas(modin_result).equals(pandas_result) modin_result = modin_df2.compare(modin_df, **kwargs) pandas_result = pandas_df2.compare(pandas_df, **kwargs) assert to_pandas(modin_result).equals(pandas_result) series_data1 = ["a", "b", "c", "d", "e"] series_data2 = ["a", "a", "c", "b", "e"] pandas_series1 = pandas.Series(series_data1) pandas_series2 = pandas.Series(series_data2) modin_series1 = pd.Series(series_data1) modin_series2 = pd.Series(series_data2) modin_result = modin_series1.compare(modin_series2, 
**kwargs) pandas_result = pandas_series1.compare(pandas_series2, **kwargs) assert to_pandas(modin_result).equals(pandas_result) modin_result = modin_series2.compare(modin_series1, **kwargs) pandas_result = pandas_series2.compare(pandas_series1, **kwargs) assert to_pandas(modin_result).equals(pandas_result) @pytest.mark.parametrize( "params", [ {"ascending": True}, {"normalize": True}, pytest.param( {"sort": False}, marks=( pytest.mark.xfail( reason="Known issue with sort=False in `groupby()` " + "(https://github.com/modin-project/modin/issues/3571)", strict=True, ) if Engine.get() in ("Python", "Ray", "Dask", "Unidist") and StorageFormat.get() != "Base" else [] ), ), ], ) def test_value_counts(params): data = [[4, 1, 3, 2], [2, 5, 6, 5], [4, 3, 3, 5]] columns = ["col1", "col2", "col3", "col4"] eval_general( *create_test_dfs(data, columns=columns), lambda df: df["col1"].value_counts(**params), ) def test_value_counts_with_nulls(): data = [[5, 6, None, 7, 7], [None, None, 5, 8]] eval_general(*create_test_dfs(data), lambda df: df[0].value_counts(dropna=False)) def test_value_counts_with_multiindex(): data = [[1, 2, 2, 4]] index = pd.MultiIndex.from_arrays( arrays=[["a", "a", "b", "b"], [1, 2, 1, 2]], names=("l1", "l2") ) eval_general( *create_test_dfs(data, index=index), lambda df: df[0].value_counts(), ) ================================================ FILE: modin/tests/pandas/dataframe/test_map_metadata.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. from decimal import Decimal import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import MinRowPartitionSize, NPartitions, StorageFormat from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.pandas.testing import assert_index_equal, assert_series_equal from modin.tests.pandas.utils import ( RAND_HIGH, RAND_LOW, arg_keys, axis_keys, axis_values, bool_arg_keys, bool_arg_values, create_test_dfs, default_to_pandas_ignore_string, df_equals, df_is_empty, eval_general, indices_keys, indices_values, name_contains, numeric_dfs, random_state, sort_if_range_partitioning, test_data, test_data_keys, test_data_values, test_data_with_duplicates_keys, test_data_with_duplicates_values, test_func_keys, test_func_values, ) from modin.tests.test_utils import ( current_execution_is_native, df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from modin.utils import get_current_execution NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)


def eval_insert(modin_df, pandas_df, **kwargs):
    """
    Run ``DataFrame.insert`` on both frames through ``eval_general``.

    Parameters
    ----------
    modin_df : modin.pandas.DataFrame
    pandas_df : pandas.DataFrame
    **kwargs : dict
        Arguments for ``insert``. ``col`` is accepted as an alias of ``column``
        when ``column`` itself is not given. ``loc`` defaults to ``0`` and
        ``column`` defaults to ``"New column"``.
    """
    if "col" in kwargs and "column" not in kwargs:
        kwargs["column"] = kwargs.pop("col")
    # Caller-supplied values override the defaults.
    _kwargs = {"loc": 0, "column": "New column"}
    _kwargs.update(kwargs)

    eval_general(
        modin_df,
        pandas_df,
        operation=lambda df, **kwargs: df.insert(**kwargs),
        __inplace__=True,
        **_kwargs,
    )


def test_indexing():
    """Compare ``__getitem__`` and ``.loc`` results between modin and pandas."""
    data = dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9])
    index = ["a", "b", "c"]
    modin_df = pd.DataFrame(data, index=index)
    pandas_df = pandas.DataFrame(data, index=index)

    df_equals(modin_df, pandas_df)

    # Column selection: scalar label, single-element list, reordered list.
    for key in ("b", ["b"], ["b", "a"]):
        df_equals(modin_df[key], pandas_df[key])

    # Label-based selection. A tuple key is what ``df.loc[rows, cols]`` passes,
    # and ``slice(None)`` is what ``:`` passes.
    loc_keys = (
        "b",
        ["b"],
        ["b", "a"],
        (["b", "a"], ["a", "c"]),
        (slice(None), ["a", "c"]),
        (slice(None), ["c"]),
        [],
    )
    for key in loc_keys:
        df_equals(modin_df.loc[key], pandas_df.loc[key])


def test_empty_df():
    """Check empty frames built in several ways, then column assignment into them."""
    # Only an index is given.
    df = pd.DataFrame(index=["a", "b"])
    df_is_empty(df)
    assert_index_equal(df.index, pd.Index(["a", "b"]))
    assert len(df.columns) == 0

    # Only columns are given.
    df = pd.DataFrame(columns=["a", "b"])
    df_is_empty(df)
    assert len(df.index) == 0
    assert_index_equal(df.columns, pd.Index(["a", "b"]))

    # Nothing is given.
    df = pd.DataFrame()
    df_is_empty(df)
    assert len(df.index) == 0
    assert len(df.columns) == 0

    # Assigning a list of ints into an empty frame.
    df = pd.DataFrame()
    pd_df = pandas.DataFrame()
    df["a"] = [1, 2, 3, 4, 5]
    pd_df["a"] = [1, 2, 3, 4, 5]
    df_equals(df, pd_df)

    # Assigning a list of strings into an empty frame.
    df = pd.DataFrame()
    pd_df = pandas.DataFrame()
    df["a"] = list("ABCDEF")
    pd_df["a"] = list("ABCDEF")
    df_equals(df, pd_df)

    # Assigning a Series of the matching library into an empty frame.
    df = pd.DataFrame()
    pd_df = pandas.DataFrame()
    df["a"] = pd.Series([1, 2, 3, 4, 5])
    pd_df["a"] = pandas.Series([1, 2, 3, 4, 5])
    df_equals(df, pd_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_abs(request, data):
    """``abs`` must match pandas, or raise the same exception type as pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    try:
        pandas_result = pandas_df.abs()
    except Exception as err:
        with pytest.raises(type(err)):
            modin_df.abs()
    else:
        modin_result = modin_df.abs()
        df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("axis", [None, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_add_prefix(data, axis):
    """Compare columns and dtypes after ``add_prefix`` between modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    test_prefix = "TEST"
    new_modin_df = modin_df.add_prefix(test_prefix, axis=axis)
    new_pandas_df = pandas_df.add_prefix(test_prefix, axis=axis)
    df_equals(new_modin_df.columns, new_pandas_df.columns)
    # TODO(https://github.com/modin-project/modin/issues/3804):
    # make df_equals always check dtypes.
    df_equals(new_modin_df.dtypes, new_pandas_df.dtypes)


@pytest.mark.parametrize("axis", [None, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_add_suffix(data, axis):
    """Compare columns after ``add_suffix`` between modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    test_suffix = "TEST"
    new_modin_df = modin_df.add_suffix(test_suffix, axis=axis)
    new_pandas_df = pandas_df.add_suffix(test_suffix, axis=axis)

    df_equals(new_modin_df.columns, new_pandas_df.columns)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys)
@pytest.mark.parametrize(
    "na_action", [None, "ignore"], ids=["no_na_action", "ignore_na"]
)
def test_applymap(data, testfunc, na_action):
    """``applymap`` rejects a non-callable and otherwise matches pandas."""
    modin_df, pandas_df = create_test_dfs(data)

    # A non-callable argument is expected to raise ValueError.
    with pytest.raises(ValueError):
        x = 2
        modin_df.applymap(x)

    eval_general(modin_df, pandas_df, lambda df: df.applymap(testfunc, na_action))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys)
def test_applymap_numeric(request, data, testfunc):
    """Run ``applymap`` only for test ids that ``name_contains`` matches against ``numeric_dfs``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    if name_contains(request.node.name, numeric_dfs):
        try:
            pandas_result = pandas_df.applymap(testfunc)
        except Exception as err:
            with pytest.raises(type(err)):
                modin_df.applymap(testfunc)
        else:
            modin_result = modin_df.applymap(testfunc)
            df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_at(data):
    """Compare ``.at`` reads (frame and row Series) and writes with pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    key1 = modin_df.columns[0]
    # Scalar
    df_equals(modin_df.at[0, key1], pandas_df.at[0, key1])

    # Series
    df_equals(modin_df.loc[0].at[key1], pandas_df.loc[0].at[key1])

    # Write Item
    modin_df_copy = modin_df.copy()
    pandas_df_copy = pandas_df.copy()
    modin_df_copy.at[1, key1] = modin_df.at[0, key1]
    pandas_df_copy.at[1, key1] = pandas_df.at[0, key1]
    df_equals(modin_df_copy, pandas_df_copy)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_axes(data):
    """Each entry of ``axes`` must equal the corresponding pandas axis."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    for modin_axis, pd_axis in zip(modin_df.axes, pandas_df.axes):
        assert np.array_equal(modin_axis, pd_axis)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_copy(data):
    """Check object identity and equality for deep and shallow copies."""
    # Only modin objects are compared here; no pandas frame is needed.
    modin_df = pd.DataFrame(data)
    new_modin_df = modin_df.copy(deep=True)

    # A deep copy must not share the frame or its metadata objects.
    assert new_modin_df is not modin_df
    assert new_modin_df.index is not modin_df.index
    assert new_modin_df.columns is not modin_df.columns
    assert new_modin_df.dtypes is not modin_df.dtypes

    # The partition comparison is skipped for BaseOnPython and native execution.
    if get_current_execution() != "BaseOnPython" and not current_execution_is_native():
        assert np.array_equal(
            new_modin_df._query_compiler._modin_frame._partitions,
            modin_df._query_compiler._modin_frame._partitions,
        )
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(deep=False)

    assert modin_df_cp is not modin_df
    assert modin_df_cp.index is modin_df.index
    assert modin_df_cp.columns is modin_df.columns
    # FIXME: we're different from pandas here as modin doesn't copy dtypes for a shallow copy
    # https://github.com/modin-project/modin/issues/5602
    # assert modin_df_cp.dtypes is not modin_df.dtypes

    # After writing to the original, it is still expected to equal the shallow copy.
    modin_df[modin_df.columns[0]] = 0
    df_equals(modin_df, modin_df_cp)


def test_copy_empty_dataframe():
    """Copying a zero-row slice must keep the dtypes of the source frame."""
    df = pd.DataFrame(range(3))
    res = df[:0].copy()
    assert res.dtypes.equals(df.dtypes)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_dtypes(data):
    """``dtypes`` must match pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.dtypes, pandas_df.dtypes)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("key", indices_values, ids=indices_keys)
def test_get(data, key):
    """``get`` must match pandas with and without an explicit default."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    df_equals(modin_df.get(key), pandas_df.get(key))
    df_equals(
        modin_df.get(key, default="default"), pandas_df.get(key, default="default")
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "dummy_na", bool_arg_values, ids=arg_keys("dummy_na", bool_arg_keys)
)
@pytest.mark.parametrize(
    "drop_first", bool_arg_values, ids=arg_keys("drop_first", bool_arg_keys)
)
def test_get_dummies(request, data, dummy_na, drop_first):
    """``get_dummies`` must match pandas, or raise the same exception type."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    try:
        pandas_result = pandas.get_dummies(
            pandas_df, dummy_na=dummy_na, drop_first=drop_first
        )
    except Exception as err:
        with pytest.raises(type(err)):
            pd.get_dummies(modin_df, dummy_na=dummy_na, drop_first=drop_first)
    else:
        modin_result = pd.get_dummies(
            modin_df, dummy_na=dummy_na, drop_first=drop_first
        )
        df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_isna(data):
    """``isna`` must match pandas."""
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    pandas_result = pandas_df.isna()
    modin_result = modin_df.isna()

    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_isnull(data):
    """``isnull`` must match pandas."""
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    pandas_result = pandas_df.isnull()
    modin_result = modin_df.isnull()

    df_equals(modin_result, pandas_result)


def test_astype():
    """Compare ``astype`` with scalar, dict and invalid dtype specifications."""
    td = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]]
    modin_df = pd.DataFrame(td.values, index=td.index, columns=td.columns)
    expected_df = pandas.DataFrame(td.values, index=td.index, columns=td.columns)

    dtype_dict = {"col1": np.int32, "index": np.int64, "col3": str}
    dtype_specs = (
        np.int32,
        np.float64,
        str,
        "Float64",  # pandas nullable dtype
        "category",
        dtype_dict,
    )
    for dtype in dtype_specs:
        modin_df_casted = modin_df.astype(dtype)
        expected_df_casted = expected_df.astype(dtype)
        df_equals(modin_df_casted, expected_df_casted)

    # A frame built from index and columns only, with one cell written afterwards.
    modin_df = pd.DataFrame(index=["row1"], columns=["col1"])
    modin_df["col1"]["row1"] = 11
    modin_df_casted = modin_df.astype(int)
    expected_df = pandas.DataFrame(index=["row1"], columns=["col1"])
    expected_df["col1"]["row1"] = 11
    expected_df_casted = expected_df.astype(int)
    df_equals(modin_df_casted, expected_df_casted)

    with pytest.raises(KeyError):
        modin_df.astype({"not_exists": np.uint8})

    # The dtypes series must have a unique index.
    eval_general(
        modin_df,
        expected_df,
        lambda df: df.astype(
            pd.Series([str, str], index=["col1", "col1"])
            if isinstance(df, pd.DataFrame)
            else pandas.Series([str, str], index=["col1", "col1"])
        ),
        expected_exception=ValueError(
            "cannot reindex on an axis with duplicate labels"
        ),
    )


@pytest.mark.parametrize("errors", ["raise", "ignore"])
def test_astype_errors(errors):
    """Compare ``astype("int", errors=...)`` on data containing a non-numeric string."""
    data = {"a": ["a", 2, -1]}
    modin_df, pandas_df = create_test_dfs(data)
    expected_exception = None
    if errors == "raise":
        # Marks the test as an expected failure and stops it here.
        pytest.xfail(reason="https://github.com/modin-project/modin/issues/7025")
    eval_general(
        modin_df,
        pandas_df,
        lambda df: df.astype("int", errors=errors),
        # https://github.com/modin-project/modin/issues/5962
        comparator_kwargs={"check_dtypes": errors != "ignore"},
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("has_dtypes", [False, True])
def test_astype_copy(has_dtypes):
    """Check ``astype(copy=False)`` with and without a dtypes cache."""
    data = [1]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
    if not has_dtypes:
        modin_df._query_compiler.set_frame_dtypes_cache(None)
    eval_general(modin_df, pandas_df, lambda df: df.astype(str, copy=False))

    # trivial case where copying can be avoided, behavior should match pandas
    s1 = pd.Series([1, 2])
    if not has_dtypes:
        # NOTE(review): this resets the dtypes cache of ``modin_df``, not of ``s1``,
        # so ``s1`` keeps its dtypes cache for both parameter values. Confirm
        # whether ``s1`` was intended before changing it; the assertion below
        # relies on ``s2`` sharing data with ``s1``.
        modin_df._query_compiler.set_frame_dtypes_cache(None)
    s2 = s1.astype("int64", copy=False)
    s2[0] = 10
    df_equals(s1, s2)


@pytest.mark.parametrize("dtypes_are_dict", [True, False])
def test_astype_dict_or_series_multiple_column_partitions(dtypes_are_dict):
    # Test astype with a dtypes dict that is complex in that:
    # - It applies to columns spanning multiple column partitions
    # - Within a partition frame df:
    #   - dtypes.index is not a subset of df.columns
    #   - df.columns is not a subset of dtypes.index

    modin_df, pandas_df = create_test_dfs(test_data["int_data"])
    if dtypes_are_dict:
        new_dtypes = {}
    else:
        new_dtypes = pandas.Series()
    for i, column in enumerate(pandas_df.columns):
        if i % 3 == 1:
            new_dtypes[column] = "string"
        elif i % 3 == 2:
            new_dtypes[column] = float
eval_general(modin_df, pandas_df, lambda df: df.astype(new_dtypes)) def test_astype_category(): modin_df = pd.DataFrame( {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} ) pandas_df = pandas.DataFrame( {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} ) modin_result = modin_df.astype({"col1": "category"}) pandas_result = pandas_df.astype({"col1": "category"}) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) modin_result = modin_df.astype("category") pandas_result = pandas_df.astype("category") df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) dtype = pd.CategoricalDtype(categories=["A", "B"]) modin_result = modin_df.astype({"col1": dtype}) pandas_result = pandas_df.astype({"col1": dtype}) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) dtype = pd.CategoricalDtype(categories=["A", "B"]) modin_result = modin_df.astype(dtype) pandas_result = pandas_df.astype(dtype) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) def test_astype_category_large(): series_length = 10_000 modin_df = pd.DataFrame( { "col1": ["str{0}".format(i) for i in range(0, series_length)], "col2": [i for i in range(0, series_length)], } ) pandas_df = pandas.DataFrame( { "col1": ["str{0}".format(i) for i in range(0, series_length)], "col2": [i for i in range(0, series_length)], } ) modin_result = modin_df.astype({"col1": "category"}) pandas_result = pandas_df.astype({"col1": "category"}) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) modin_result = modin_df.astype("category") pandas_result = pandas_df.astype("category") df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) dtype = pd.CategoricalDtype(categories=["str0", "str1"]) modin_result = modin_df.astype({"col1": dtype}) pandas_result = 
pandas_df.astype({"col1": dtype}) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) dtype = pd.CategoricalDtype(categories=["str0", "str1"]) modin_result = modin_df.astype(dtype) pandas_result = pandas_df.astype(dtype) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) def test_astype_int64_to_astype_category_github_issue_6259(): eval_general( *create_test_dfs( {"c0": [0, 1, 2, 3, 4], "par": ["foo", "boo", "bar", "foo", "boo"]}, index=["a", "b", "c", "d", "e"], ), lambda df: df["c0"].astype("Int64").astype("category"), ) @pytest.mark.skipif( get_current_execution() == "BaseOnPython" or current_execution_is_native(), reason="BaseOnPython and NativeQueryCompiler don't have proxy categories", ) class TestCategoricalProxyDtype: """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class.""" @staticmethod def _get_lazy_proxy(): """ Build a dataframe containing a column that has a proxy type and return this proxy together with an original dtype that this proxy is emulating. 
Returns ------- (LazyProxyCategoricalDtype, pandas.CategoricalDtype, modin.pandas.DataFrame) """ nchunks = 3 pandas_df = pandas.DataFrame({"a": [1, 1, 2, 2, 3, 2], "b": [1, 2, 3, 4, 5, 6]}) original_dtype = pandas_df.astype({"a": "category"}).dtypes["a"] chunks = split_result_of_axis_func_pandas( axis=0, num_splits=nchunks, result=pandas_df, min_block_size=MinRowPartitionSize.get(), length_list=[2, 2, 2], ) if StorageFormat.get() == "Pandas": df = pd.concat([pd.DataFrame(chunk) for chunk in chunks]) assert df._query_compiler._modin_frame._partitions.shape == (nchunks, 1) df = df.astype({"a": "category"}) return df.dtypes["a"], original_dtype, df else: raise NotImplementedError() def test_update_proxy(self): """Verify that ``LazyProxyCategoricalDtype._update_proxy`` method works as expected.""" lazy_proxy, _, _ = self._get_lazy_proxy() new_parent = pd.DataFrame({"a": [10, 20, 30]})._query_compiler._modin_frame assert isinstance(lazy_proxy, LazyProxyCategoricalDtype) # When we try to create a new proxy from the same arguments it should return itself assert ( lazy_proxy._update_proxy(lazy_proxy._parent, lazy_proxy._column_name) is lazy_proxy ) # When any of the arguments is changing we should create a new proxy proxy_with_new_column = lazy_proxy._update_proxy( lazy_proxy._parent, "other_column" ) assert proxy_with_new_column is not lazy_proxy and isinstance( proxy_with_new_column, LazyProxyCategoricalDtype ) # When any of the arguments is changing we should create a new proxy proxy_with_new_parent = lazy_proxy._update_proxy( new_parent, lazy_proxy._column_name ) assert proxy_with_new_parent is not lazy_proxy and isinstance( proxy_with_new_parent, LazyProxyCategoricalDtype ) lazy_proxy.categories # trigger materialization # `._update_proxy` now should produce pandas Categoricals instead of a proxy as it already has materialized data assert ( type(lazy_proxy._update_proxy(lazy_proxy._parent, lazy_proxy._column_name)) == pandas.CategoricalDtype ) def 
test_update_proxy_implicit(self): """ Verify that a lazy proxy correctly updates its parent when passed from one parent to another. """ lazy_proxy, _, parent = self._get_lazy_proxy() parent_frame = parent._query_compiler._modin_frame if StorageFormat.get() == "Pandas": assert lazy_proxy._parent is parent_frame else: raise NotImplementedError( f"The test is not implemented for {StorageFormat.get()} storage format" ) # Making a copy of the dataframe, the new proxy should now start pointing to the new parent new_parent = parent.copy() new_parent_frame = new_parent._query_compiler._modin_frame new_lazy_proxy = new_parent_frame.dtypes[lazy_proxy._column_name] if StorageFormat.get() == "Pandas": # Make sure that the old proxy still pointing to the old parent assert lazy_proxy._parent is parent_frame assert new_lazy_proxy._parent is new_parent_frame else: raise NotImplementedError( f"The test is not implemented for {StorageFormat.get()} storage format" ) def test_if_proxy_lazy(self): """Verify that proxy is able to pass simple comparison checks without triggering materialization.""" lazy_proxy, actual_dtype, _ = self._get_lazy_proxy() assert isinstance(lazy_proxy, LazyProxyCategoricalDtype) assert not lazy_proxy._is_materialized assert lazy_proxy == "category" assert isinstance(lazy_proxy, pd.CategoricalDtype) assert isinstance(lazy_proxy, pandas.CategoricalDtype) assert str(lazy_proxy) == "category" assert str(lazy_proxy) == str(actual_dtype) assert not lazy_proxy.ordered assert not lazy_proxy._is_materialized # Further, there are all checks that materialize categories assert lazy_proxy == actual_dtype assert actual_dtype == lazy_proxy assert repr(lazy_proxy) == repr(actual_dtype) assert lazy_proxy.categories.equals(actual_dtype.categories) assert lazy_proxy._is_materialized def test_proxy_as_dtype(self): """Verify that proxy can be used as an actual dtype.""" lazy_proxy, actual_dtype, _ = self._get_lazy_proxy() assert isinstance(lazy_proxy, LazyProxyCategoricalDtype) 
assert not lazy_proxy._is_materialized modin_df2, pandas_df2 = create_test_dfs({"c": [2, 2, 3, 4, 5, 6]}) eval_general( (modin_df2, lazy_proxy), (pandas_df2, actual_dtype), lambda args: args[0].astype({"c": args[1]}), ) def test_proxy_with_pandas_constructor(self): """Verify that users still can use pandas' constructor using `type(cat)(...)` notation.""" lazy_proxy, _, _ = self._get_lazy_proxy() assert isinstance(lazy_proxy, LazyProxyCategoricalDtype) new_cat_values = pandas.Index([3, 4, 5]) new_category_dtype = type(lazy_proxy)(categories=new_cat_values, ordered=True) assert not lazy_proxy._is_materialized assert new_category_dtype._is_materialized assert new_category_dtype.categories.equals(new_cat_values) assert new_category_dtype.ordered def test_infer_objects_single_partition(): data = {"a": ["s", 2, 3]} modin_df = pd.DataFrame(data).iloc[1:] pandas_df = pandas.DataFrame(data).iloc[1:] modin_result = modin_df.infer_objects() pandas_result = pandas_df.infer_objects() df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) @pytest.mark.parametrize( "infer_objects", bool_arg_values, ids=arg_keys("infer_objects", bool_arg_keys) ) @pytest.mark.parametrize( "convert_string", bool_arg_values, ids=arg_keys("convert_string", bool_arg_keys) ) @pytest.mark.parametrize( "convert_integer", bool_arg_values, ids=arg_keys("convert_integer", bool_arg_keys) ) @pytest.mark.parametrize( "convert_boolean", bool_arg_values, ids=arg_keys("convert_boolean", bool_arg_keys) ) @pytest.mark.parametrize( "convert_floating", bool_arg_values, ids=arg_keys("convert_floating", bool_arg_keys) ) @pytest.mark.exclude_in_sanity def test_convert_dtypes_single_partition( infer_objects, convert_string, convert_integer, convert_boolean, convert_floating ): # Sanity check, copied from pandas documentation: # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.convert_dtypes.html data = { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), "b": 
pd.Series(["x", "y", "z"], dtype=np.dtype("O")), "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), } kwargs = { "infer_objects": infer_objects, "convert_string": convert_string, "convert_integer": convert_integer, "convert_boolean": convert_boolean, "convert_floating": convert_floating, } modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = modin_df.convert_dtypes(**kwargs) pandas_result = pandas_df.convert_dtypes(**kwargs) assert modin_result.dtypes.equals(pandas_result.dtypes) @pytest.mark.parametrize("dtype_backend", ["numpy_nullable", "pyarrow"]) def test_convert_dtypes_dtype_backend(dtype_backend): data = { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), } def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_general( *create_test_dfs(data), lambda df: df.convert_dtypes(dtype_backend=dtype_backend), comparator=comparator, ) @pytest.mark.skipif( current_execution_is_native(), reason="NativeQueryCompiler does not contain partitions.", ) def test_convert_dtypes_multiple_row_partitions(): # Column 0 should have string dtype modin_part1 = pd.DataFrame(["a"]).convert_dtypes() # Column 0 should have an int dtype modin_part2 = pd.DataFrame([1]).convert_dtypes() modin_df = pd.concat([modin_part1, modin_part2]) if StorageFormat.get() == "Pandas": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) pandas_df = pandas.DataFrame(["a", 1], index=[0, 0]) # The initial dataframes should be the same df_equals(modin_df, 
pandas_df) # TODO(https://github.com/modin-project/modin/pull/3805): delete # this assert once df_equals checks dtypes assert modin_df.dtypes.equals(pandas_df.dtypes) modin_result = modin_df.convert_dtypes() pandas_result = pandas_df.convert_dtypes() df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) def test_convert_dtypes_5653(): modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) if StorageFormat.get() == "Pandas": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 assert modin_df.dtypes.iloc[0] == "string" @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) @pytest.mark.exclude_in_sanity def test_clip(request, data, axis, bound_type): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) if name_contains(request.node.name, numeric_dfs): ind_len = ( len(modin_df.index) if not pandas.DataFrame()._get_axis_number(axis) else len(modin_df.columns) ) # set bounds lower, upper = np.sort(random_state.randint(RAND_LOW, RAND_HIGH, 2)) # test only upper scalar bound modin_result = modin_df.clip(None, upper, axis=axis) pandas_result = pandas_df.clip(None, upper, axis=axis) df_equals(modin_result, pandas_result) # test lower and upper scalar bound modin_result = modin_df.clip(lower, upper, axis=axis) pandas_result = pandas_df.clip(lower, upper, axis=axis) df_equals(modin_result, pandas_result) lower = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) upper = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) if bound_type == "series": modin_lower = pd.Series(lower) pandas_lower = pandas.Series(lower) modin_upper = pd.Series(upper) pandas_upper = 
pandas.Series(upper) else: modin_lower = pandas_lower = lower modin_upper = pandas_upper = upper # test lower and upper list bound on each column modin_result = modin_df.clip(modin_lower, modin_upper, axis=axis) pandas_result = pandas_df.clip(pandas_lower, pandas_upper, axis=axis) df_equals(modin_result, pandas_result) # test only upper list bound on each column modin_result = modin_df.clip(np.nan, modin_upper, axis=axis) pandas_result = pandas_df.clip(np.nan, pandas_upper, axis=axis) df_equals(modin_result, pandas_result) with pytest.raises(ValueError): modin_df.clip(lower=[1, 2, 3], axis=None) def test_clip_4485(): modin_result = pd.DataFrame([1]).clip([3]) pandas_result = pandas.DataFrame([1]).clip([3]) df_equals(modin_result, pandas_result) def test_drop(): frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} simple = pandas.DataFrame(frame_data) modin_simple = pd.DataFrame(frame_data) df_equals(modin_simple.drop("A", axis=1), simple[["B"]]) df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]]) df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) pytest.raises(KeyError, modin_simple.drop, 5) pytest.raises(KeyError, modin_simple.drop, "C", axis=1) pytest.raises(KeyError, modin_simple.drop, [1, 5]) pytest.raises(KeyError, modin_simple.drop, ["A", "C"], axis=1) # errors = 'ignore' df_equals(modin_simple.drop(5, errors="ignore"), simple) df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]) df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple) df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]) # non-unique nu_df = pandas.DataFrame( zip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"] ) modin_nu_df = pd.DataFrame(nu_df) df_equals(modin_nu_df.drop("a", axis=1), nu_df[["b"]]) df_equals(modin_nu_df.drop("b", axis="columns"), nu_df["a"]) df_equals(modin_nu_df.drop([]), nu_df) nu_df = 
nu_df.set_index(pandas.Index(["X", "Y", "X"])) nu_df.columns = list("abc") modin_nu_df = pd.DataFrame(nu_df) df_equals(modin_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) df_equals(modin_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) # inplace cache issue frame_data = random_state.randn(10, 3) df = pandas.DataFrame(frame_data, columns=list("abc")) modin_df = pd.DataFrame(frame_data, columns=list("abc")) expected = df[~(df.b > 0)] modin_df.drop(labels=df[df.b > 0].index, inplace=True) df_equals(modin_df, expected) midx = pd.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) df = pd.DataFrame( index=midx, columns=["big", "small"], data=[ [45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8], [320, 250], [1, 0.8], [0.3, 0.2], ], ) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(df) ): df.drop(index="length", level=1) def test_drop_api_equivalence(): # equivalence of the labels/axis and index/columns API's frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] modin_df = pd.DataFrame(frame_data, index=["a", "b", "c"], columns=["d", "e", "f"]) modin_df1 = modin_df.drop("a") modin_df2 = modin_df.drop(index="a") df_equals(modin_df1, modin_df2) modin_df1 = modin_df.drop("d", axis=1) modin_df2 = modin_df.drop(columns="d") df_equals(modin_df1, modin_df2) modin_df1 = modin_df.drop(labels="e", axis=1) modin_df2 = modin_df.drop(columns="e") df_equals(modin_df1, modin_df2) modin_df1 = modin_df.drop(["a"], axis=0) modin_df2 = modin_df.drop(index=["a"]) df_equals(modin_df1, modin_df2) modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1) modin_df2 = modin_df.drop(index=["a"], columns=["d"]) df_equals(modin_df1, modin_df2) with pytest.raises(ValueError): modin_df.drop(labels="a", index="b") with pytest.raises(ValueError): modin_df.drop(labels="a", columns="b") with pytest.raises(ValueError): modin_df.drop(axis=1) 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_drop_transpose(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = modin_df.T.drop(columns=[0, 1, 2]) pandas_result = pandas_df.T.drop(columns=[0, 1, 2]) df_equals(modin_result, pandas_result) modin_result = modin_df.T.drop(index=["col3", "col1"]) pandas_result = pandas_df.T.drop(index=["col3", "col1"]) df_equals(modin_result, pandas_result) modin_result = modin_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) pandas_result = pandas_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) df_equals(modin_result, pandas_result) def test_droplevel(): df = ( pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) .set_index([0, 1]) .rename_axis(["a", "b"]) ) df.columns = pd.MultiIndex.from_tuples( [("c", "e"), ("d", "f")], names=["level_1", "level_2"] ) df.droplevel("a") df.droplevel("level_2", axis=1) @pytest.mark.parametrize( "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys ) @pytest.mark.parametrize( "keep", ["last", "first", False], ids=["last", "first", "False"] ) @pytest.mark.parametrize( "subset", [None, "col1", "name", ("col1", "col3"), ["col1", "col3", "col7"]], ids=["None", "string", "name", "tuple", "list"], ) @pytest.mark.parametrize("ignore_index", [True, False], ids=["True", "False"]) @pytest.mark.exclude_in_sanity def test_drop_duplicates(data, keep, subset, ignore_index): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) try: pandas_df.drop_duplicates( keep=keep, inplace=False, subset=subset, ignore_index=ignore_index ) except Exception as err: with pytest.raises(type(err)): modin_df.drop_duplicates( keep=keep, inplace=False, subset=subset, ignore_index=ignore_index ) else: sort_if_range_partitioning( pandas_df.drop_duplicates( keep=keep, inplace=False, subset=subset, ignore_index=ignore_index ), modin_df.drop_duplicates( keep=keep, inplace=False, subset=subset, ignore_index=ignore_index ), ) 
try: pandas_df.drop_duplicates( keep=keep, inplace=True, subset=subset, ignore_index=ignore_index ) except Exception as err: with pytest.raises(type(err)): modin_df.drop_duplicates( keep=keep, inplace=True, subset=subset, ignore_index=ignore_index ) else: modin_df.drop_duplicates( keep=keep, inplace=True, subset=subset, ignore_index=ignore_index ) sort_if_range_partitioning(modin_df, pandas_df) def test_drop_duplicates_with_missing_index_values(): data = { "columns": ["value", "time", "id"], "index": [ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, ], "data": [ ["3", 1279213398000.0, 88.0], ["3", 1279204682000.0, 88.0], ["0", 1245772835000.0, 448.0], ["0", 1270564258000.0, 32.0], ["0", 1267106669000.0, 118.0], ["7", 1300621123000.0, 5.0], ["0", 1251130752000.0, 957.0], ["0", 1311683506000.0, 62.0], ["9", 1283692698000.0, 89.0], ["9", 1270234253000.0, 64.0], ["0", 1285088818000.0, 50.0], ["0", 1218212725000.0, 695.0], ["2", 1383933968000.0, 348.0], ["0", 1368227625000.0, 257.0], ["1", 1454514093000.0, 446.0], ["1", 1428497427000.0, 134.0], ["1", 1459184936000.0, 568.0], ["1", 1502293302000.0, 599.0], ["1", 1491833358000.0, 829.0], ["1", 1485431534000.0, 806.0], ["8", 1351800505000.0, 101.0], ["0", 1357247721000.0, 916.0], ["0", 1335804423000.0, 370.0], ["24", 1327547726000.0, 720.0], ["0", 1332334140000.0, 415.0], ["0", 1309543100000.0, 30.0], ["18", 1309541141000.0, 30.0], ["0", 1298979435000.0, 48.0], ["14", 1276098160000.0, 59.0], ["0", 1233936302000.0, 109.0], ], } pandas_df = pandas.DataFrame( data["data"], index=data["index"], columns=data["columns"] ) modin_df = pd.DataFrame(data["data"], index=data["index"], columns=data["columns"]) modin_result = modin_df.sort_values(["id", "time"]).drop_duplicates(["id"]) pandas_result = pandas_df.sort_values(["id", "time"]).drop_duplicates(["id"]) sort_if_range_partitioning(modin_result, pandas_result) def test_drop_duplicates_after_sort(): data = [ 
{"value": 1, "time": 2}, {"value": 1, "time": 1}, {"value": 2, "time": 1}, {"value": 2, "time": 2}, ] modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = modin_df.sort_values(["value", "time"]).drop_duplicates(["value"]) pandas_result = pandas_df.sort_values(["value", "time"]).drop_duplicates(["value"]) sort_if_range_partitioning(modin_result, pandas_result) def test_drop_duplicates_with_repeated_index_values(): # This tests for issue #4467: https://github.com/modin-project/modin/issues/4467 data = [[0], [1], [0]] index = [0, 0, 0] modin_df, pandas_df = create_test_dfs(data, index=index) eval_general( modin_df, pandas_df, lambda df: df.drop_duplicates(), comparator=sort_if_range_partitioning, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"]) def test_dropna(data, axis, how): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) with pytest.raises(ValueError): modin_df.dropna(axis=axis, how="invalid") with pytest.raises(TypeError): modin_df.dropna(axis=axis, how=None, thresh=None) with pytest.raises(KeyError): modin_df.dropna(axis=axis, subset=["NotExists"], how=how) modin_result = modin_df.dropna(axis=axis, how=how) pandas_result = pandas_df.dropna(axis=axis, how=how) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dropna_inplace(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_result = pandas_df.dropna() modin_df.dropna(inplace=True) df_equals(modin_df, pandas_result) modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_df.dropna(thresh=2, inplace=True) modin_df.dropna(thresh=2, inplace=True) df_equals(modin_df, pandas_df) modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_df.dropna(axis=1, how="any", inplace=True) modin_df.dropna(axis=1, 
how="any", inplace=True) df_equals(modin_df, pandas_df) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dropna_multiple_axes(data): modin_df = pd.DataFrame(data) with pytest.raises(TypeError): modin_df.dropna(how="all", axis=[0, 1]) with pytest.raises(TypeError): modin_df.dropna(how="all", axis=(0, 1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dropna_subset(request, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) if "empty_data" not in request.node.name: column_subset = modin_df.columns[0:2] df_equals( modin_df.dropna(how="all", subset=column_subset), pandas_df.dropna(how="all", subset=column_subset), ) df_equals( modin_df.dropna(how="any", subset=column_subset), pandas_df.dropna(how="any", subset=column_subset), ) row_subset = modin_df.index[0:2] df_equals( modin_df.dropna(how="all", axis=1, subset=row_subset), pandas_df.dropna(how="all", axis=1, subset=row_subset), ) df_equals( modin_df.dropna(how="any", axis=1, subset=row_subset), pandas_df.dropna(how="any", axis=1, subset=row_subset), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis,subset", [(0, list("EF")), (1, [4, 5])]) def test_dropna_subset_error(data, axis, subset): eval_general( *create_test_dfs(data), lambda df: df.dropna(axis=axis, subset=subset), expected_exception=KeyError(["E", "F"]), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("astype", ["category", "int32", "float"]) def test_insert_dtypes(data, astype, request): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) # categories with NaN works incorrect for now if astype == "category" and pandas_df.iloc[:, 0].isnull().any(): return expected_exception = None if "int32-float_nan_data" in request.node.callspec.id: pytest.xfail(reason="https://github.com/modin-project/modin/issues/7026") eval_insert( modin_df, pandas_df, col="TypeSaver", 
value=lambda df: df.iloc[:, 0].astype(astype), expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("loc", [-3, 0, 3]) def test_insert_loc(data, loc): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) expected_exception = None if loc == -3: expected_exception = ValueError("unbounded slice") eval_insert( modin_df, pandas_df, loc=loc, value=lambda df: df.iloc[:, 0], expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_insert(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_insert( modin_df, pandas_df, col="Duplicate", value=lambda df: df[df.columns[0]] ) eval_insert(modin_df, pandas_df, col="Scalar", value=100) eval_insert( pd.DataFrame(columns=list("ab")), pandas.DataFrame(columns=list("ab")), col="Series insert", value=lambda df: df[df.columns[0]], ) eval_insert( modin_df, pandas_df, col="DataFrame insert", value=lambda df: df[[df.columns[0]]], ) eval_insert( modin_df, pandas_df, col="Different indices", value=lambda df: df[[df.columns[0]]].set_index(df.index[::-1]), ) eval_insert( modin_df, pandas_df, col="2d list insert", value=lambda df: [[1, 2]] * len(df), ) # Bad inserts eval_insert( modin_df, pandas_df, col="Bad Column", value=lambda df: df, expected_exception=ValueError( f"Expected a one-dimensional object, got a DataFrame with {len(pandas_df.columns)} columns instead." 
), ) eval_insert( modin_df, pandas_df, col="Too Short", value=lambda df: list(df[df.columns[0]])[:-1], expected_exception=ValueError( f"Length of values ({len(pandas_df)-1}) does not match length of index ({len(pandas_df)})" ), ) eval_insert( modin_df, pandas_df, col=lambda df: df.columns[0], value=lambda df: df[df.columns[0]], expected_exception=ValueError("cannot insert 2d list insert, already exists"), ) eval_insert( modin_df, pandas_df, loc=lambda df: len(df.columns) + 100, col="Bad Loc", value=100, expected_exception=IndexError( f"index {len(pandas_df.columns) + 100} is out of bounds for axis 0 with size {len(pandas_df.columns)}" ), ) def test_insert_4407(): data = {"col1": [1, 2, 3], "col2": [2, 3, 4]} modin_df, pandas_df = create_test_dfs(data) def comparator(df1, df2): assert_series_equal(df1.dtypes, df2.dtypes, check_index=False) return df_equals(df1, df2) for idx, value in enumerate( (pandas_df.to_numpy(), np.array([[1]] * 3), np.array([[1, 2, 3], [4, 5, 6]])) ): expected_exception = None if idx == 0: expected_exception = ValueError( "Expected a 1D array, got an array with shape (3, 2)" ) elif idx == 2: # FIXME: https://github.com/modin-project/modin/issues/7080 expected_exception = False eval_insert( modin_df, pandas_df, loc=0, col=f"test_col{idx}", value=value, comparator=lambda df1, df2: comparator(df1, df2), expected_exception=expected_exception, ) def test_insert_modin_array(): from modin.numpy import array data = {"col1": [1, 2, 3], "col2": [2, 3, 4]} modin_df1, modin_df2 = pd.DataFrame(data), pd.DataFrame(data) np_value = np.array([7, 7, 7]) md_np_value = array(np_value) modin_df1.insert(1, "new_col", np_value) modin_df2.insert(1, "new_col", md_np_value) df_equals(modin_df1, modin_df2) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_ndim(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) assert modin_df.ndim == pandas_df.ndim @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
def test_notna(data):
    """Check that ``notna`` gives the same result for Modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.notna(), pandas_df.notna())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_notnull(data):
    """Check that ``notnull`` gives the same result for Modin and pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.notnull(), pandas_df.notnull())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_round(data):
    """Check ``round`` with the default and with one decimal place."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.round(), pandas_df.round())
    df_equals(modin_df.round(1), pandas_df.round(1))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
def test_set_axis(data, axis):
    """Check ``set_axis`` against pandas and that relabelling changes the frame."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    x = pandas.DataFrame()._get_axis_number(axis)
    index = modin_df.columns if x else modin_df.index
    labels = ["{0}_{1}".format(index[i], i) for i in range(modin_df.shape[x])]

    eval_general(
        modin_df, pandas_df, lambda df: df.set_axis(labels, axis=axis, copy=True)
    )

    modin_df_copy = modin_df.copy()
    modin_df = modin_df.set_axis(labels, axis=axis, copy=False)

    # Check that the copy and original are different.
    # `pytest.raises` fails the test when `df_equals` does NOT raise, and unlike
    # a bare `assert False` it is not stripped when Python runs with `-O`.
    with pytest.raises(AssertionError):
        df_equals(modin_df, modin_df_copy)

    pandas_df = pandas_df.set_axis(labels, axis=axis)
    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("drop", bool_arg_values, ids=arg_keys("drop", bool_arg_keys))
@pytest.mark.parametrize(
    "append", bool_arg_values, ids=arg_keys("append", bool_arg_keys)
)
def test_set_index(request, data, drop, append):
    """Check ``set_index`` on the first column, both out-of-place and in-place."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # The first-column key does not exist for the empty dataset.
    if "empty_data" not in request.node.name:
        key = modin_df.columns[0]
        modin_result = modin_df.set_index(key, drop=drop, append=append, inplace=False)
        pandas_result = pandas_df.set_index(
            key, drop=drop, append=append, inplace=False
        )
        df_equals(modin_result, pandas_result)

        modin_df_copy = modin_df.copy()
        modin_df.set_index(key, drop=drop, append=append, inplace=True)

        # Check that the copy and original are different.
        # `pytest.raises` fails the test when `df_equals` does NOT raise, and
        # unlike a bare `assert False` it is not stripped under `-O`.
        with pytest.raises(AssertionError):
            df_equals(modin_df, modin_df_copy)

        pandas_df.set_index(key, drop=drop, append=append, inplace=True)
        df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_shape(data):
    """Check that ``shape`` matches pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    assert modin_df.shape == pandas_df.shape


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_size(data):
    """Check that ``size`` matches pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    assert modin_df.size == pandas_df.size


def test_squeeze():
    """Check ``squeeze`` for frames of several shapes and for a row taken with ``iloc``."""
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [0, 0, 0, 0],
    }
    frame_data_2 = {"col1": [0, 1, 2, 3]}
    frame_data_3 = {
        "col1": [0],
        "col2": [4],
        "col3": [8],
        "col4": [12],
        "col5": [0],
    }
    frame_data_4 = {"col1": [2]}
    frame_data_5 = {"col1": ["string"]}
    # Different data for different cases
    pandas_df = pandas.DataFrame(frame_data).squeeze()
    modin_df = pd.DataFrame(frame_data).squeeze()
    df_equals(modin_df, pandas_df)

    pandas_df_2 = pandas.DataFrame(frame_data_2).squeeze()
    modin_df_2 = pd.DataFrame(frame_data_2).squeeze()
    df_equals(modin_df_2, pandas_df_2)

    pandas_df_3 = pandas.DataFrame(frame_data_3).squeeze()
    modin_df_3 = pd.DataFrame(frame_data_3).squeeze()
    df_equals(modin_df_3, pandas_df_3)

    pandas_df_4 = pandas.DataFrame(frame_data_4).squeeze()
    modin_df_4 = pd.DataFrame(frame_data_4).squeeze()
    df_equals(modin_df_4, pandas_df_4)

    pandas_df_5 = pandas.DataFrame(frame_data_5).squeeze()
    modin_df_5 = pd.DataFrame(frame_data_5).squeeze()
    df_equals(modin_df_5, pandas_df_5)

    data = [
        [
            pd.Timestamp("2019-01-02"),
            pd.Timestamp("2019-01-03"),
            pd.Timestamp("2019-01-04"),
            pd.Timestamp("2019-01-05"),
        ],
        [1, 1, 1, 2],
    ]
    df = pd.DataFrame(data, index=["date", "value"]).T
    pf = pandas.DataFrame(data, index=["date", "value"]).T
    df.set_index("date", inplace=True)
    pf.set_index("date", inplace=True)
    df_equals(df.iloc[0], pf.iloc[0])


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_transpose(data):
    """Check ``T``/``transpose`` and several operations applied to a transposed frame."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    df_equals(modin_df.T, pandas_df.T)
    df_equals(modin_df.transpose(), pandas_df.transpose())

    # Test for map across full axis for select indices
    df_equals(modin_df.T.dropna(), pandas_df.T.dropna())
    # Test for map across full axis
    df_equals(modin_df.T.nunique(), pandas_df.T.nunique())
    # Test for map across blocks
    df_equals(modin_df.T.notna(), pandas_df.T.notna())


@pytest.mark.parametrize(
    "data, other_data",
    [
        ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}),
        ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}),
        (
            {"A": ["a", "b", "c"], "B": ["x", "y", "z"]},
            {"B": ["d", "e", "f", "g", "h", "i"]},
        ),
        ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}),
    ],
)
@pytest.mark.parametrize("errors", ["raise", "ignore"])
def test_update(data, other_data, errors):
    """Check in-place ``update`` against pandas for both ``errors`` modes."""
    modin_df, pandas_df = create_test_dfs(data)
    other_modin_df, other_pandas_df = create_test_dfs(other_data)

    expected_exception = None
    if errors == "raise":
        expected_exception = ValueError("Data overlaps.")
    eval_general(
        modin_df,
        pandas_df,
        # Each library is updated with the `other` frame of its own kind.
        lambda df: (
            df.update(other_modin_df, errors=errors)
            if isinstance(df, pd.DataFrame)
            else df.update(other_pandas_df, errors=errors)
        ),
        __inplace__=True,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___neg__(request, data):
    """Check ``__neg__``: same result as pandas, or the same exception type."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    try:
        pandas_result = pandas_df.__neg__()
    except Exception as err:
        with pytest.raises(type(err)):
            modin_df.__neg__()
    else:
        modin_result = modin_df.__neg__()
        df_equals(modin_result, pandas_result)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___invert__(data, request): expected_exception = None if "float_nan_data" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7081 expected_exception = False eval_general( *create_test_dfs(data), lambda df: ~df, expected_exception=expected_exception ) def test___invert___bool(): data = [False] modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = ~modin_df pandas_result = ~pandas_df df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___delitem__(request, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) if "empty_data" not in request.node.name: key = pandas_df.columns[0] modin_df = modin_df.copy() pandas_df = pandas_df.copy() modin_df.__delitem__(key) pandas_df.__delitem__(key) df_equals(modin_df, pandas_df) # Issue 2027 last_label = pandas_df.iloc[:, -1].name modin_df.__delitem__(last_label) pandas_df.__delitem__(last_label) df_equals(modin_df, pandas_df) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___nonzero__(data): modin_df = pd.DataFrame(data) with pytest.raises(ValueError): # Always raises ValueError modin_df.__nonzero__() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___abs__(request, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) try: pandas_result = abs(pandas_df) except Exception as err: with pytest.raises(type(err)): abs(modin_df) else: modin_result = abs(modin_df) df_equals(modin_result, pandas_result) def test___round__(): data = test_data_values[0] eval_general(pd.DataFrame(data), pandas.DataFrame(data), lambda df: df.__round__()) @pytest.mark.parametrize( "get_index", [ pytest.param(lambda idx: None, id="None_idx"), pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), pytest.param(lambda idx: idx, id="Equal_idx"), 
pytest.param(lambda idx: idx[::-1], id="Reversed_idx"), ], ) @pytest.mark.parametrize( "get_columns", [ pytest.param(lambda idx: None, id="None_idx"), pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), pytest.param(lambda idx: idx, id="Equal_idx"), pytest.param(lambda idx: idx[::-1], id="Reversed_idx"), ], ) @pytest.mark.parametrize("dtype", [None, "str"]) @pytest.mark.exclude_in_sanity def test_constructor_from_modin_series(get_index, get_columns, dtype): modin_df, pandas_df = create_test_dfs(test_data_values[0]) modin_data = {f"new_col{i}": modin_df.iloc[:, i] for i in range(modin_df.shape[1])} pandas_data = { f"new_col{i}": pandas_df.iloc[:, i] for i in range(pandas_df.shape[1]) } index = get_index(modin_df.index) columns = get_columns(list(modin_data.keys())) new_modin = pd.DataFrame(modin_data, index=index, columns=columns, dtype=dtype) new_pandas = pandas.DataFrame( pandas_data, index=index, columns=columns, dtype=dtype ) df_equals(new_modin, new_pandas) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_constructor(data): pandas_df = pandas.DataFrame(data) modin_df = pd.DataFrame(data) df_equals(pandas_df, modin_df) pandas_df = pandas.DataFrame({k: pandas.Series(v) for k, v in data.items()}) modin_df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()}) df_equals(pandas_df, modin_df) def test_pyarrow_constructor(): pa = pytest.importorskip("pyarrow") data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] df_equals(*create_test_dfs(data, dtype=pd.ArrowDtype(pa.decimal128(3, scale=2)))) @pytest.mark.parametrize( "data", [ np.arange(1, 10000, dtype=np.float32), [ pd.Series([1, 2, 3], dtype="int32"), pandas.Series([4, 5, 6], dtype="int64"), np.array([7, 8, 9], dtype=np.float32), ], pandas.Categorical([1, 2, 3, 4, 5]), ], ) def test_constructor_dtypes(data): modin_df, pandas_df = create_test_dfs(data) df_equals(modin_df, pandas_df) def test_constructor_columns_and_index(): modin_df = pd.DataFrame( [[1, 1, 
10], [2, 4, 20], [3, 7, 30]], index=[1, 2, 3], columns=["id", "max_speed", "health"], ) pandas_df = pandas.DataFrame( [[1, 1, 10], [2, 4, 20], [3, 7, 30]], index=[1, 2, 3], columns=["id", "max_speed", "health"], ) df_equals(modin_df, pandas_df) df_equals(pd.DataFrame(modin_df), pandas.DataFrame(pandas_df)) df_equals( pd.DataFrame(modin_df, columns=["max_speed", "health"]), pandas.DataFrame(pandas_df, columns=["max_speed", "health"]), ) df_equals( pd.DataFrame(modin_df, index=[1, 2]), pandas.DataFrame(pandas_df, index=[1, 2]), ) df_equals( pd.DataFrame(modin_df, index=[1, 2], columns=["health"]), pandas.DataFrame(pandas_df, index=[1, 2], columns=["health"]), ) df_equals( pd.DataFrame(modin_df.iloc[:, 0], index=[1, 2, 3]), pandas.DataFrame(pandas_df.iloc[:, 0], index=[1, 2, 3]), ) df_equals( pd.DataFrame(modin_df.iloc[:, 0], columns=["NO_EXIST"]), pandas.DataFrame(pandas_df.iloc[:, 0], columns=["NO_EXIST"]), ) with pytest.raises(NotImplementedError): pd.DataFrame(modin_df, index=[1, 2, 99999]) with pytest.raises(NotImplementedError): pd.DataFrame(modin_df, columns=["NO_EXIST"]) def test_constructor_from_index(): data = pd.Index([1, 2, 3], name="pricing_date") modin_df, pandas_df = create_test_dfs(data) df_equals(modin_df, pandas_df) def test_insert_datelike_string_issue_7371(): # When a new value is inserted into a frame, we call pandas.api.types.pandas_dtype(value) to # extract the dtype of an object like a pandas Series or numpy array. When a scalar value is passed, # this usually raises a TypeError, so we construct a local pandas Series from the object and # extract the dtype from there. # When the passed value is a date-like string, pandas will instead raise a ValueError because # it tries to parse it as a numpy structured dtype. After fixing GH#7371, we now catch # ValueError in addition to TypeError to handle this case. 
modin_df = pd.DataFrame({"a": [0]}) modin_df["c"] = "2020-01-01" pandas_df = pandas.DataFrame({"a": [0]}) pandas_df["c"] = "2020-01-01" df_equals(modin_df, pandas_df) ================================================ FILE: modin/tests/pandas/dataframe/test_pickle.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import pickle import numpy as np import pytest import modin.pandas as pd from modin.config import PersistentPickle from modin.tests.pandas.utils import create_test_dfs, df_equals @pytest.fixture def modin_df_non_empty(): return pd.DataFrame({"col1": np.arange(1000), "col2": np.arange(2000, 3000)}) @pytest.fixture def modin_df_empty(): return pd.DataFrame() @pytest.fixture def modin_column(modin_df_non_empty): return modin_df_non_empty["col1"] @pytest.fixture(params=[True, False]) def persistent(request): old = PersistentPickle.get() PersistentPickle.put(request.param) yield request.param PersistentPickle.put(old) @pytest.mark.parametrize("modin_df_name", ["modin_df_non_empty", "modin_df_empty"]) def test_dataframe_pickle(request, modin_df_name): modin_df = request.getfixturevalue(modin_df_name) other = pickle.loads(pickle.dumps(modin_df)) df_equals(modin_df, other) def test__reduce__(): # `DataFrame.__reduce__` will be called implicitly when lambda expressions are # pre-processed for the distributed engine. 
dataframe_data = ["Major League Baseball", "National Basketball Association"] abbr_md, abbr_pd = create_test_dfs(dataframe_data, index=["MLB", "NBA"]) dataframe_data = { "name": ["Mariners", "Lakers"] * 500, "league_abbreviation": ["MLB", "NBA"] * 500, } teams_md, teams_pd = create_test_dfs(dataframe_data) result_md = ( teams_md.set_index("name") .league_abbreviation.apply(lambda abbr: abbr_md[0].loc[abbr]) .rename("league") ) result_pd = ( teams_pd.set_index("name") .league_abbreviation.apply(lambda abbr: abbr_pd[0].loc[abbr]) .rename("league") ) df_equals(result_md, result_pd) def test_column_pickle(modin_column, modin_df_non_empty, persistent): dmp = pickle.dumps(modin_column) other = pickle.loads(dmp) df_equals(modin_column.to_frame(), other.to_frame()) # make sure we don't pickle the whole frame if doing persistent storage if persistent: assert len(dmp) < len(pickle.dumps(modin_df_non_empty)) ================================================ FILE: modin/tests/pandas/dataframe/test_reduce.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions from modin.tests.pandas.utils import ( arg_keys, axis_keys, axis_values, bool_arg_keys, bool_arg_values, create_test_dfs, default_to_pandas_ignore_string, df_equals, df_equals_with_non_stable_indices, eval_general, int_arg_keys, int_arg_values, test_data, test_data_diff_dtype, test_data_keys, test_data_large_categorical_dataframe, test_data_values, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) @pytest.mark.parametrize("method", ["all", "any"]) @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_all_any(data, axis, skipna, is_transposed, method): eval_general( *create_test_dfs(data), lambda df: getattr((df.T if is_transposed else df), method)( axis=axis, skipna=skipna, bool_only=None ), ) @pytest.mark.parametrize("method", ["all", "any"]) @pytest.mark.parametrize( "bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys) ) def test_all_any_specific(bool_only, method): eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: getattr(df, method)(bool_only=bool_only), ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( "data", [test_data["float_nan_data"], test_data_large_categorical_dataframe] ) def test_count(data, axis): eval_general( *create_test_dfs(data), lambda df: df.count(axis=axis), ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) @pytest.mark.parametrize("dropna", [True, False]) def test_nunique(data, axis, dropna): eval_general( *create_test_dfs(data), lambda df: df.nunique(axis=axis, dropna=dropna), ) @pytest.mark.parametrize("numeric_only", [False, True]) def test_count_specific(numeric_only): eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: df.count(numeric_only=numeric_only), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_count_dtypes(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_general( modin_df, pandas_df, lambda df: df.isna().count(axis=0), ) @pytest.mark.parametrize("percentiles", [None, 0.10, 0.11, 0.44, 0.78, 0.99]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_describe(data, percentiles): if percentiles is not None: percentiles = [percentiles] eval_general( *create_test_dfs(data), lambda df: df.describe(percentiles=percentiles), ) @pytest.mark.parametrize("has_numeric_column", [False, True]) def test_2195(has_numeric_column): data = { "categorical": pd.Categorical(["d"] * 10**2), "date": [np.datetime64("2000-01-01")] * 10**2, } if has_numeric_column: data.update({"numeric": [5] * 10**2}) modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) eval_general( modin_df, pandas_df, lambda df: df.describe(), ) # Issue: https://github.com/modin-project/modin/issues/4641 def test_describe_column_partition_has_different_index(): pandas_df = pandas.DataFrame(test_data["int_data"]) # We add a string column to test the case where partitions with mixed data # types have different 'describe' rows, which causes an index mismatch. 
pandas_df["string_column"] = "abc" modin_df = pd.DataFrame(pandas_df) eval_general(modin_df, pandas_df, lambda df: df.describe(include="all")) @pytest.mark.parametrize( "exclude,include", [ ([np.float64], None), (np.float64, None), (None, [np.timedelta64, np.datetime64, np.object_, np.bool_]), (None, "all"), (None, np.number), ], ) def test_describe_specific(exclude, include): eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: df.drop("str_col", axis=1).describe( exclude=exclude, include=include ), ) @pytest.mark.parametrize("data", [test_data["int_data"]]) def test_describe_str(data): modin_df = pd.DataFrame(data).applymap(str) pandas_df = pandas.DataFrame(data).applymap(str) try: df_equals(modin_df.describe(), pandas_df.describe()) except AssertionError: # We have to do this because we choose the highest count slightly differently # than pandas. Because there is no true guarantee which one will be first, # If they don't match, make sure that the `freq` is the same at least. 
df_equals( modin_df.describe().loc[["count", "unique", "freq"]], pandas_df.describe().loc[["count", "unique", "freq"]], ) def test_describe_dtypes(): data = { "col1": list("abc"), "col2": list("abc"), "col3": list("abc"), "col4": [1, 2, 3], } eval_general(*create_test_dfs(data), lambda df: df.describe()) @pytest.mark.parametrize("method", ["idxmin", "idxmax"]) @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_idxmin_idxmax(data, axis, skipna, is_transposed, method): eval_general( *create_test_dfs(data), lambda df: getattr((df.T if is_transposed else df), method)( axis=axis, skipna=skipna ), ) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin_idxmax_string_columns(axis): # https://github.com/modin-project/modin/issues/7093 modin_df, pandas_df = create_test_dfs([["a", "b"]]) eval_general(modin_df, pandas_df, lambda df: df.idxmax(axis=axis)) eval_general(modin_df, pandas_df, lambda df: df.idxmin(axis=axis)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_last_valid_index(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) assert modin_df.last_valid_index() == pandas_df.last_valid_index() @pytest.mark.parametrize("index", bool_arg_values, ids=arg_keys("index", bool_arg_keys)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_memory_usage(data, index): eval_general(*create_test_dfs(data), lambda df: df.memory_usage(index=index)) @pytest.mark.parametrize("method", ["min", "max", "mean"]) @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def 
test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method): eval_general( *create_test_dfs(data), lambda df: getattr((df.T if is_transposed else df), method)( axis=axis, skipna=skipna, numeric_only=numeric_only ), ) @pytest.mark.parametrize("method", ["prod", "product"]) @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_prod( data, axis, skipna, is_transposed, method, ): eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)( axis=axis, skipna=skipna, ), ) # test for issue #1953 arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] modin_df = pd.DataFrame( [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays ) pandas_df = pandas.DataFrame( [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays ) modin_result = modin_df.prod() pandas_result = pandas_df.prod() df_equals(modin_result, pandas_result) @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_sum(data, axis, skipna, is_transposed, request): eval_general( *create_test_dfs(data), lambda df: (df.T if is_transposed else df).sum( axis=axis, skipna=skipna, ), ) # test for issue #1953 arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] modin_df = pd.DataFrame( [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays ) pandas_df = pandas.DataFrame( [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays ) modin_result = modin_df.sum() pandas_result = pandas_df.sum() df_equals(modin_result, pandas_result) @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_dtype_consistency(dtype): 
# test for issue #6781 res_dtype = pd.DataFrame([1, 2, 3, 4], dtype=dtype).sum().dtype assert res_dtype == pandas.api.types.pandas_dtype(dtype) @pytest.mark.parametrize("fn", ["prod", "sum"]) @pytest.mark.parametrize("numeric_only", [False, True]) @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) def test_sum_prod_specific(fn, min_count, numeric_only): expected_exception = None if not numeric_only and fn == "prod": # FIXME: https://github.com/modin-project/modin/issues/7029 expected_exception = False elif not numeric_only and fn == "sum": expected_exception = TypeError('can only concatenate str (not "int") to str') if numeric_only and fn == "sum": pytest.xfail(reason="https://github.com/modin-project/modin/issues/7029") if min_count == 5 and not numeric_only: pytest.xfail(reason="https://github.com/modin-project/modin/issues/7029") eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: getattr(df, fn)(min_count=min_count, numeric_only=numeric_only), expected_exception=expected_exception, ) @pytest.mark.parametrize("backend", [None, "pyarrow"]) def test_sum_prod_min_count(backend): md_df, pd_df = create_test_dfs(test_data["float_nan_data"], backend=backend) eval_general(md_df, pd_df, lambda df: df.prod(min_count=len(pd_df) + 1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sum_single_column(data): modin_df = pd.DataFrame(data).iloc[:, [0]] pandas_df = pandas.DataFrame(data).iloc[:, [0]] df_equals(modin_df.sum(), pandas_df.sum()) df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1)) def test_sum_datetime64(): pd_ser = pandas.date_range(start="1/1/2018", end="1/08/2018") modin_df, pandas_df = create_test_dfs({"A": pd_ser, "B": [1, 2, 3, 4, 5, 6, 7, 8]}) eval_general( modin_df, pandas_df, lambda df: df.sum(), expected_exception=TypeError( "'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'" ), ) def test_min_datetime64(): pd_ser = 
pandas.date_range(start="1/1/2018", end="1/08/2018") modin_df, pandas_df = create_test_dfs({"A": pd_ser, "B": [1, 2, 3, 4, 5, 6, 7, 8]}) eval_general( modin_df, pandas_df, lambda df: df.min(), ) eval_general( modin_df, pandas_df, lambda df: df.min(axis=1), # pandas raises: `TypeError: '<=' not supported between instances of 'Timestamp' and 'int'` # while modin raises quite general: `TypeError("Cannot compare Numeric and Non-Numeric Types")` expected_exception=False, ) @pytest.mark.parametrize( "fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"] ) @pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize("numeric_only", [False, True]) def test_reduce_specific(fn, numeric_only, axis): expected_exception = None if not numeric_only: if fn in ("max", "min"): if axis == 0: operator = ">=" if fn == "max" else "<=" expected_exception = TypeError( f"'{operator}' not supported between instances of 'str' and 'float'" ) else: # FIXME: https://github.com/modin-project/modin/issues/7030 expected_exception = False elif fn in ("skew", "kurt", "sem", "std", "var", "median", "mean"): # FIXME: https://github.com/modin-project/modin/issues/7030 expected_exception = False eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis), expected_exception=expected_exception, ) @pytest.mark.parametrize("subset_len", [1, 2]) @pytest.mark.parametrize("sort", bool_arg_values, ids=bool_arg_keys) @pytest.mark.parametrize("normalize", bool_arg_values, ids=bool_arg_keys) @pytest.mark.parametrize("dropna", bool_arg_values, ids=bool_arg_keys) @pytest.mark.parametrize("ascending", [False, True]) def test_value_counts(subset_len, sort, normalize, dropna, ascending): def comparator(md_res, pd_res): if subset_len == 1: # 'pandas.DataFrame.value_counts' always returns frames with MultiIndex, # even when 'subset_len == 1' it returns MultiIndex with 'nlevels == 1'. 
# This behavior is expensive to mimic, so Modin 'value_counts' returns frame # with non-multi index in that case. That's why we flatten indices here. assert md_res.index.nlevels == pd_res.index.nlevels == 1 for df in [md_res, pd_res]: df.index = df.index.get_level_values(0) if sort: # We sort indices for the result because of: # https://github.com/modin-project/modin/issues/1650 df_equals_with_non_stable_indices(md_res, pd_res) else: df_equals(md_res.sort_index(), pd_res.sort_index()) data = test_data_values[0] md_df, pd_df = create_test_dfs(data) # We're picking columns with different index signs to involve columns from different partitions subset = [pd_df.columns[-i if i % 2 else i] for i in range(subset_len)] eval_general( md_df, pd_df, lambda df: df.value_counts( subset=subset, sort=sort, normalize=normalize, dropna=dropna, ascending=ascending, ), comparator=comparator, ) def test_value_counts_categorical(): # from issue #3571 data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000) random_state = np.random.RandomState(seed=42) random_state.shuffle(data) modin_df, pandas_df = create_test_dfs( {"col1": data, "col2": data}, dtype="category" ) eval_general( modin_df, pandas_df, lambda df: df.value_counts(), comparator=df_equals, ) ================================================ FILE: modin/tests/pandas/dataframe/test_udf.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import matplotlib import numpy as np import pandas import pytest from pandas.core.dtypes.common import is_list_like import modin.pandas as pd from modin.config import MinRowPartitionSize, NPartitions from modin.core.storage_formats.pandas.query_compiler_caster import ( _assert_casting_functions_wrap_same_implementation, ) from modin.tests.pandas.utils import ( UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS, agg_func_except_keys, agg_func_except_values, agg_func_keys, agg_func_values, arg_keys, bool_arg_keys, bool_arg_values, create_test_dfs, default_to_pandas_ignore_string, df_equals, eval_general, query_func_keys, query_func_values, random_state, test_data, test_data_keys, test_data_values, udf_func_keys, udf_func_values, ) from modin.tests.test_utils import ( current_execution_is_native, warns_that_defaulting_to_pandas_if, ) from modin.utils import get_current_execution NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. # TODO(https://github.com/modin-project/modin/issues/3655): catch all instances # of defaulting to pandas. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_agg_dict(): md_df, pd_df = create_test_dfs(test_data_values[0]) agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")} eval_general(md_df, pd_df, lambda df: df.agg(agg_dict)) agg_dict = { "new_col1": (pd_df.columns[0], "sum"), "new_col2": (pd_df.columns[-1], "count"), } eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict)) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "func", agg_func_values + agg_func_except_values, ids=agg_func_keys + agg_func_except_keys, ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply(axis, func, op, request): expected_exception = None if "sum sum" in request.node.callspec.id: expected_exception = pandas.errors.SpecificationError( "Function names must be unique if there is no new column names assigned" ) elif "should raise AssertionError" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7031 expected_exception = False eval_general( *create_test_dfs(test_data["float_nan_data"]), lambda df: getattr(df, op)(func, axis), expected_exception=expected_exception, ) @pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize( "func", agg_func_values + agg_func_except_values, ids=agg_func_keys + agg_func_except_keys, ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply_axis_names(axis, func, op, request): expected_exception = None if "sum sum" in request.node.callspec.id: expected_exception = pandas.errors.SpecificationError( "Function names must be unique if there is no new column names assigned" ) elif "should raise AssertionError" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7031 expected_exception = False eval_general( *create_test_dfs(test_data["int_data"]), lambda df: getattr(df, op)(func, axis), expected_exception=expected_exception, ) def test_aggregate_alias(): 
_assert_casting_functions_wrap_same_implementation( pd.DataFrame.agg, pd.DataFrame.aggregate ) def test_aggregate_error_checking(): modin_df = pd.DataFrame(test_data["float_nan_data"]) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_df.aggregate({modin_df.columns[0]: "sum", modin_df.columns[1]: "mean"}) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_df.aggregate("arcsin") @pytest.mark.parametrize( "func", agg_func_values + agg_func_except_values, ids=agg_func_keys + agg_func_except_keys, ) def test_apply_key_error(func): if not (is_list_like(func) or callable(func) or isinstance(func, str)): pytest.xfail( reason="Because index materialization is expensive Modin first" + "checks the validity of the function itself and only then the engine level" + "checks the validity of the indices. Pandas order of such checks is reversed," + "so we get different errors when both (function and index) are invalid." ) eval_general( *create_test_dfs(test_data["int_data"]), lambda df: df.apply({"row": func}, axis=1), expected_exception=KeyError("Column(s) ['row'] do not exist"), ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", ["kurt", "count", "sum", "mean", "all", "any"]) def test_apply_text_func(data, func, axis): func_kwargs = {"axis": axis} rows_number = len(next(iter(data.values()))) # length of the first data column level_0 = np.random.choice([0, 1, 2], rows_number) level_1 = np.random.choice([3, 4, 5], rows_number) index = pd.MultiIndex.from_arrays([level_0, level_1]) eval_general( *create_test_dfs(data, index=index), lambda df, *args, **kwargs: df.apply(func, *args, **kwargs), **func_kwargs, ) @pytest.mark.parametrize( "column", ["A", ["A", "C"]], ids=arg_keys("column", ["A", ["A", "C"]]) ) @pytest.mark.parametrize( "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys) ) def 
test_explode_single_partition(column, ignore_index): # This test data has two columns where some items are lists that # explode() should expand. In some rows, the columns have list-like # elements that must be expanded, and in others, they have empty lists # or items that aren't list-like at all. data = { "A": [[0, 1, 2], "foo", [], [3, 4]], "B": 1, "C": [["a", "b", "c"], np.nan, [], ["d", "e"]], } eval_general( *create_test_dfs(data), lambda df: df.explode(column, ignore_index=ignore_index), ) @pytest.mark.parametrize( "column", ["A", ["A", "C"]], ids=arg_keys("column", ["A", ["A", "C"]]) ) @pytest.mark.parametrize( "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys) ) def test_explode_all_partitions(column, ignore_index): # Test explode with enough rows to fill all partitions. explode should # expand every row in the input data into two rows. It's especially # important that the input data has list-like elements that must be # expanded at the boundaries of the partitions, e.g. at row 31. 
num_rows = NPartitions.get() * MinRowPartitionSize.get() data = {"A": [[3, 4]] * num_rows, "C": [["a", "b"]] * num_rows} eval_general( *create_test_dfs(data), lambda df: df.explode(column, ignore_index=ignore_index), ) @pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize("args", [(1,), ("_A",)]) def test_apply_args(axis, args): def apply_func(series, y): try: return series + y except TypeError: return series.map(str) + str(y) eval_general( *create_test_dfs(test_data["int_data"]), lambda df: df.apply(apply_func, axis=axis, args=args), ) def test_apply_metadata(): def add(a, b, c): return a + b + c data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]} modin_df = pd.DataFrame(data) modin_df["add"] = modin_df.apply( lambda row: add(row["A"], row["B"], row["C"]), axis=1 ) pandas_df = pandas.DataFrame(data) pandas_df["add"] = pandas_df.apply( lambda row: add(row["A"], row["B"], row["C"]), axis=1 ) df_equals(modin_df, pandas_df) @pytest.mark.parametrize("func", udf_func_values, ids=udf_func_keys) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_apply_udf(data, func): eval_general( *create_test_dfs(data), lambda df, *args, **kwargs: df.apply(func, *args, **kwargs), other=lambda df: df, ) def test_apply_dict_4828(): data = [[2, 4], [1, 3]] modin_df1, pandas_df1 = create_test_dfs(data) eval_general( modin_df1, pandas_df1, lambda df: df.apply({0: (lambda x: x**2)}), ) eval_general( modin_df1, pandas_df1, lambda df: df.apply({0: (lambda x: x**2)}, axis=1), ) # several partitions along axis 0 modin_df2, pandas_df2 = create_test_dfs(data, index=[2, 3]) modin_df3 = pd.concat([modin_df1, modin_df2], axis=0) pandas_df3 = pandas.concat([pandas_df1, pandas_df2], axis=0) eval_general( modin_df3, pandas_df3, lambda df: df.apply({0: (lambda x: x**2)}), ) eval_general( modin_df3, pandas_df3, lambda df: df.apply({0: (lambda x: x**2)}, axis=1), ) # several partitions along axis 1 modin_df4, pandas_df4 = create_test_dfs(data, 
columns=[2, 3]) modin_df5 = pd.concat([modin_df1, modin_df4], axis=1) pandas_df5 = pandas.concat([pandas_df1, pandas_df4], axis=1) eval_general( modin_df5, pandas_df5, lambda df: df.apply({0: (lambda x: x**2)}), ) eval_general( modin_df5, pandas_df5, lambda df: df.apply({0: (lambda x: x**2)}, axis=1), ) def test_apply_modin_func_4635(): data = [1] modin_df, pandas_df = create_test_dfs(data) df_equals(modin_df.apply(pd.Series.sum), pandas_df.apply(pandas.Series.sum)) data = {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]} modin_df, pandas_df = create_test_dfs(data) modin_df = modin_df.set_index(["a"]) pandas_df = pandas_df.set_index(["a"]) df_equals( modin_df.groupby("a", group_keys=False).apply(pd.DataFrame.sample, n=1), pandas_df.groupby("a", group_keys=False).apply(pandas.DataFrame.sample, n=1), ) @pytest.mark.parametrize( "apply_function", ( lambda df, function: function(df), lambda df, function: df.apply(function, axis=0), lambda df, function: df.apply(function, axis=1), ), ) @pytest.mark.parametrize("function", UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS) def test_apply_unary_numpy_universal_function_issue_7645(function, apply_function): eval_general( *create_test_dfs(test_data["float_nan_data"]), lambda df: apply_function(df, function), ) def test_eval_df_use_case(): frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) # test eval for series results tmp_pandas = df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") tmp_modin = modin_df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") assert isinstance(tmp_modin, pd.Series) df_equals(tmp_modin, tmp_pandas) # Test not inplace assignments tmp_pandas = df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas") tmp_modin = modin_df.eval( "e = arctan2(sin(a), b)", engine="python", parser="pandas" ) df_equals(tmp_modin, tmp_pandas) # Test inplace assignments df.eval("e = arctan2(sin(a), b)", 
engine="python", parser="pandas", inplace=True) modin_df.eval( "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True ) # TODO: Use a series equality validator. df_equals(modin_df, df) def test_eval_df_arithmetic_subexpression(): frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) modin_df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) # TODO: Use a series equality validator. df_equals(modin_df, df) def test_eval_groupby_transform(): # see #5511 for details df = pd.DataFrame({"num": range(1, 1001), "group": ["A"] * 500 + ["B"] * 500}) assert df.eval("num.groupby(group).transform('min')").unique().tolist() == [1, 501] def test_eval_scalar(): # see #4477 for details df = pd.DataFrame([[2]]) assert df.eval("1") == 1 @pytest.mark.parametrize("engine", ("numexpr", "python")) def test_eval_not_inplace_does_not_change_input_dataframe(engine): snow_df, pandas_df = create_test_dfs({"a": [1, 2, 3]}) original_pandas = pandas_df.copy() snow_result = snow_df.eval("b = a + 1", inplace=False, engine=engine) pandas_result = pandas_df.eval("b = a + 1", inplace=False, engine=engine) df_equals(snow_df, original_pandas) df_equals(pandas_df, original_pandas) df_equals(snow_result, pandas_result) TEST_VAR = 2 @pytest.mark.parametrize("method", ["query", "eval"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("local_var", [2]) @pytest.mark.parametrize("engine", ["python", "numexpr"]) def test_eval_and_query_with_local_and_global_var(method, data, engine, local_var): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) op = "+" if method == "eval" else "<" for expr in (f"col1 {op} @local_var", f"col1 {op} @TEST_VAR"): df_equals( getattr(modin_df, method)(expr, engine=engine), getattr(pandas_df, method)(expr, 
engine=engine), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_filter(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} df_equals(modin_df.filter(items=by["items"]), pandas_df.filter(items=by["items"])) df_equals( modin_df.filter(regex=by["regex"], axis=0), pandas_df.filter(regex=by["regex"], axis=0), ) df_equals( modin_df.filter(regex=by["regex"], axis=1), pandas_df.filter(regex=by["regex"], axis=1), ) df_equals(modin_df.filter(like=by["like"]), pandas_df.filter(like=by["like"])) with pytest.raises(TypeError): modin_df.filter(items=by["items"], regex=by["regex"]) with pytest.raises(TypeError): modin_df.filter() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pipe(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) n = len(modin_df.index) a, b, c = 2 % n, 0, 3 % n col = modin_df.columns[3 % len(modin_df.columns)] def h(x): return x.drop(columns=[col]) def g(x, arg1=0): for _ in range(arg1): x = (pd if isinstance(x, pd.DataFrame) else pandas).concat((x, x)) return x def f(x, arg2=0, arg3=0): return x.drop([arg2, arg3]) df_equals( f(g(h(modin_df), arg1=a), arg2=b, arg3=c), (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), ) df_equals( (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("funcs", query_func_values, ids=query_func_keys) @pytest.mark.parametrize("engine", ["python", "numexpr"]) def test_query(data, funcs, engine): if get_current_execution() == "BaseOnPython" and funcs != "col3 > col4": pytest.xfail( reason="In this case, we are faced with the problem of handling empty data frames - #4934" ) modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) try: pandas_result = pandas_df.query(funcs, 
engine=engine) except Exception as err: with pytest.raises(type(err)): modin_df.query(funcs, engine=engine) else: modin_result = modin_df.query(funcs, engine=engine) # `dtypes` must be evaluated after `query` so we need to check cache assert modin_result._query_compiler.frame_has_dtypes_cache df_equals(modin_result, pandas_result) df_equals(modin_result.dtypes, pandas_result.dtypes) def test_query_named_index(): eval_general( *(df.set_index("col1") for df in create_test_dfs(test_data["int_data"])), lambda df: df.query("col1 % 2 == 0 | col3 % 2 == 1"), ) def test_query_named_multiindex(): eval_general( *( df.set_index(["col1", "col3"]) for df in create_test_dfs(test_data["int_data"]) ), lambda df: df.query("col1 % 2 == 1 | col3 % 2 == 1"), ) def test_query_multiindex_without_names(): def make_df(without_index): new_df = without_index.set_index(["col1", "col3"]) new_df.index.names = [None, None] return new_df eval_general( *(make_df(df) for df in create_test_dfs(test_data["int_data"])), lambda df: df.query("ilevel_0 % 2 == 0 | ilevel_1 % 2 == 1 | col4 % 2 == 1"), ) def test_empty_query(): modin_df = pd.DataFrame([1, 2, 3, 4, 5]) with pytest.raises(ValueError): modin_df.query("") @pytest.mark.parametrize("engine", ["python", "numexpr"]) def test_query_after_insert(engine): modin_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) modin_df["z"] = modin_df.eval("x / y") modin_df = modin_df.query("z >= 0", engine=engine) modin_result = modin_df.reset_index(drop=True) modin_result.columns = ["a", "b", "c"] pandas_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) pandas_df["z"] = pandas_df.eval("x / y") pandas_df = pandas_df.query("z >= 0", engine=engine) pandas_result = pandas_df.reset_index(drop=True) pandas_result.columns = ["a", "b", "c"] df_equals(modin_result, pandas_result) df_equals(modin_df, pandas_df) @pytest.mark.parametrize("engine", ["python", "numexpr"]) def test_query_with_element_access_issue_4580(engine): pdf = 
pandas.DataFrame({"a": [0, 1, 2]}) # get two row partitions by concatenating df = pd.concat([pd.DataFrame(pdf[:1]), pd.DataFrame(pdf[1:])]) eval_general(df, pdf, lambda df: df.query("a == a[0]", engine=engine)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "func", [lambda x: x + 1, [np.sqrt, np.exp]], ids=["lambda", "list_udfs"] ) def test_transform(data, func, request): if "list_udfs" in request.node.callspec.id: pytest.xfail(reason="https://github.com/modin-project/modin/issues/6998") eval_general(*create_test_dfs(data), lambda df: df.transform(func)) ================================================ FILE: modin/tests/pandas/dataframe/test_window.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import random

import matplotlib
import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.config import NPartitions
from modin.tests.pandas.utils import (
    arg_keys,
    axis_keys,
    axis_values,
    bool_arg_keys,
    bool_arg_values,
    create_test_dfs,
    df_equals,
    eval_general,
    int_arg_keys,
    int_arg_values,
    is_native_shallow_copy,
    name_contains,
    no_numeric_dfs,
    quantiles_keys,
    quantiles_values,
    random_state,
    test_data,
    test_data_keys,
    test_data_values,
    test_data_with_duplicates_keys,
    test_data_with_duplicates_values,
)

NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("method", ["cumprod", "cummin", "cummax", "cumsum"])
def test_cumprod_cummin_cummax_cumsum(axis, skipna, method):
    """Compare the cumulative methods on the float-with-NaN data."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: getattr(df, method)(axis=axis, skipna=skipna),
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("method", ["cumprod", "cummin", "cummax", "cumsum"])
def test_cumprod_cummin_cummax_cumsum_transposed(axis, method):
    """Compare the cumulative methods on transposed integer frames."""
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: getattr(df.T, method)(axis=axis),
    )


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_cummax_int_and_float(axis, method):
    """Compare ``cummin``/``cummax`` on a frame mixing an int and a float column."""
    data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]}
    eval_general(*create_test_dfs(data), lambda df: getattr(df, method)(axis=axis))


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
    "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys)
)
def test_diff(axis, periods):
    """Compare ``diff`` for every axis/periods combination."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: df.diff(axis=axis, periods=periods),
    )


def test_diff_with_datetime_types():
    """Compare ``diff`` on datetime data and on the resulting timedelta data."""
    pandas_df = pandas.DataFrame(
        [[1, 2.0, 3], [4, 5.0, 6], [7, np.nan, 9], [10, 11.3, 12], [13, 14.5, 15]]
    )
    data = pandas.date_range("2018-01-01", periods=5, freq="h").values
    pandas_df = pandas.concat([pandas_df, pandas.Series(data)], axis=1)
    modin_df = pd.DataFrame(pandas_df)

    # Test `diff` with datetime type.
    pandas_result = pandas_df.diff()
    modin_result = modin_df.diff()
    df_equals(modin_result, pandas_result)

    # Test `diff` with timedelta type.
    td_pandas_result = pandas_result.diff()
    td_modin_result = modin_result.diff()
    df_equals(td_modin_result, td_pandas_result)


def test_diff_error_handling():
    """Check the errors raised by ``diff`` for bad ``periods`` and object data."""
    df = pd.DataFrame([["a", "b", "c"]], columns=["col 0", "col 1", "col 2"])
    # NOTE(review): the type repr inside this pattern was missing (it looks like
    # it was stripped as markup); restored on the assumption that the message
    # embeds ``type(periods)`` - confirm against the implementation.
    with pytest.raises(
        ValueError, match="periods must be an int. got <class 'str'> instead"
    ):
        df.diff(axis=0, periods="1")

    with pytest.raises(TypeError, match="unsupported operand type for -: got object"):
        df.diff()


@pytest.mark.parametrize("axis", ["rows", "columns"])
def test_diff_transposed(axis):
    """Compare ``diff`` on transposed integer frames."""
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.T.diff(axis=axis),
    )


@pytest.mark.parametrize(
    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
)
@pytest.mark.parametrize(
    "keep", ["last", "first", False], ids=["last", "first", "False"]
)
def test_duplicated(data, keep):
    """Compare ``duplicated`` on all columns and on a random column subset."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    pandas_result = pandas_df.duplicated(keep=keep)
    modin_result = modin_df.duplicated(keep=keep)
    df_equals(modin_result, pandas_result)

    # The subset is drawn with the unseeded stdlib generator, so it differs
    # between runs.
    subset = random.sample(
        list(pandas_df.columns), random.randint(1, len(pandas_df.columns))
    )
    pandas_result = pandas_df.duplicated(keep=keep, subset=subset)
    modin_result = modin_df.duplicated(keep=keep, subset=subset)

    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ffill(data):
    """Compare ``ffill`` with default arguments."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.ffill(), pandas_df.ffill())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "method",
    ["backfill", "bfill", "pad", "ffill", None],
    ids=["backfill", "bfill", "pad", "ffill", "None"],
)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize("limit", int_arg_values, ids=int_arg_keys)
def test_fillna(data, method, axis, limit):
    """Compare ``fillna`` results or raised exception types (axis 1 is skipped)."""
    # We are not testing when axis is over rows until pandas-17399 gets fixed.
    if axis != 1 and axis != "columns":
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)

        try:
            pandas_result = pandas_df.fillna(0, method=method, axis=axis, limit=limit)
        except Exception as err:
            with pytest.raises(type(err)):
                modin_df.fillna(0, method=method, axis=axis, limit=limit)
        else:
            modin_result = modin_df.fillna(0, method=method, axis=axis, limit=limit)
            df_equals(modin_result, pandas_result)


def test_fillna_sanity():
    """Compare ``fillna`` on object, datetime and tz-aware datetime data."""
    # with different dtype
    frame_data = [
        ["a", "a", np.nan, "a"],
        ["b", "b", np.nan, "b"],
        ["c", "c", np.nan, "c"],
    ]
    df = pandas.DataFrame(frame_data)

    result = df.fillna({2: "foo"})
    modin_df = pd.DataFrame(frame_data).fillna({2: "foo"})

    df_equals(modin_df, result)

    modin_df = pd.DataFrame(df)
    df.fillna({2: "foo"}, inplace=True)
    modin_df.fillna({2: "foo"}, inplace=True)
    df_equals(modin_df, result)

    frame_data = {
        "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")],
        "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT],
    }
    df = pandas.DataFrame(frame_data)
    result = df.fillna(value={"Date": df["Date2"]})
    modin_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]})
    df_equals(modin_df, result)

    frame_data = {"A": [pandas.Timestamp("2012-11-11 00:00:00+01:00"), pandas.NaT]}
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    df_equals(modin_df.fillna(method="pad"), df.fillna(method="pad"))

    frame_data = {"A": [pandas.NaT, pandas.Timestamp("2012-11-11 00:00:00+01:00")]}
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data).fillna(method="bfill")
    df_equals(modin_df, df.fillna(method="bfill"))


def test_fillna_downcast():
    """Compare ``fillna(..., downcast="infer")`` for scalar and dict values."""
    # infer int64 from float64
    frame_data = {"a": [1.0, np.nan]}
    df = pandas.DataFrame(frame_data)
    result = df.fillna(0, downcast="infer")
    modin_df = pd.DataFrame(frame_data).fillna(0, downcast="infer")
    df_equals(modin_df, result)

    # infer int64 from float64 when fillna value is a dict
    df = pandas.DataFrame(frame_data)
    result = df.fillna({"a": 0}, downcast="infer")
    modin_df = pd.DataFrame(frame_data).fillna({"a": 0}, downcast="infer")
    df_equals(modin_df, result)


def test_fillna_4660():
    """Compare filling one column from another column of the same frame (#4660)."""
    eval_general(
        *create_test_dfs({"a": ["a"], "b": ["b"], "c": [pd.NA]}, index=["row1"]),
        lambda df: df["c"].fillna(df["b"]),
    )


@pytest.mark.xfail(
    condition=is_native_shallow_copy(),
    reason="native pandas backend does not deep copy inputs by default",
    strict=True,
)
def test_fillna_inplace():
    """Check in-place ``fillna`` and that modin does not alias the source frame."""
    frame_data = random_state.randn(10, 4)
    df = pandas.DataFrame(frame_data)
    df[1][:4] = np.nan
    df[3][-4:] = np.nan

    modin_df = pd.DataFrame(df)
    df.fillna(value=0, inplace=True)
    # The modin frame must still differ from the pandas frame that was just
    # filled in place.
    with pytest.raises(AssertionError):
        df_equals(modin_df, df)

    modin_df.fillna(value=0, inplace=True)
    df_equals(modin_df, df)

    modin_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True)
    assert modin_df is None

    df[1][:4] = np.nan
    df[3][-4:] = np.nan
    modin_df = pd.DataFrame(df)
    df.fillna(method="ffill", inplace=True)
    with pytest.raises(AssertionError):
        df_equals(modin_df, df)

    modin_df.fillna(method="ffill", inplace=True)
    df_equals(modin_df, df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("limit", [1, 2, 0.5, -1, -2, 1.5])
def test_frame_fillna_limit(data, limit):
    """Compare ``fillna`` with ``limit`` for method, dict, Series and frame values."""
    pandas_df = pandas.DataFrame(data)

    replace_pandas_series = pandas_df.columns.to_series().sample(frac=1)
    replace_dict = replace_pandas_series.to_dict()
    replace_pandas_df = pandas.DataFrame(
        {col: pandas_df.index.to_series() for col in pandas_df.columns},
        index=pandas_df.index,
    ).sample(frac=1)
    replace_modin_series = pd.Series(replace_pandas_series)
    replace_modin_df = pd.DataFrame(replace_pandas_df)

    index = pandas_df.index
    result = pandas_df[:2].reindex(index)
    modin_df = pd.DataFrame(result)

    # Float limits are a fraction of the frame length; negative limits count
    # back from the frame length.
    if isinstance(limit, float):
        limit = int(len(modin_df) * limit)
    if limit is not None and limit < 0:
        limit = len(modin_df) + limit

    def check_fills(modin_frame, pandas_frame, method):
        # Method-based fill first, then the three value-based fills.
        df_equals(
            modin_frame.fillna(method=method, limit=limit),
            pandas_frame.fillna(method=method, limit=limit),
        )
        df_equals(
            modin_frame.fillna(replace_dict, limit=limit),
            pandas_frame.fillna(replace_dict, limit=limit),
        )
        df_equals(
            modin_frame.fillna(replace_modin_series, limit=limit),
            pandas_frame.fillna(replace_pandas_series, limit=limit),
        )
        df_equals(
            modin_frame.fillna(replace_modin_df, limit=limit),
            pandas_frame.fillna(replace_pandas_df, limit=limit),
        )

    check_fills(modin_df, result, "pad")

    result = pandas_df[-2:].reindex(index)
    modin_df = pd.DataFrame(result)
    check_fills(modin_df, result, "backfill")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_frame_pad_backfill_limit(data):
    """Compare ``pad``/``backfill`` fills with ``limit=2``."""
    pandas_df = pandas.DataFrame(data)

    index = pandas_df.index

    result = pandas_df[:2].reindex(index)
    modin_df = pd.DataFrame(result)
    df_equals(
        modin_df.fillna(method="pad", limit=2), result.fillna(method="pad", limit=2)
    )

    result = pandas_df[-2:].reindex(index)
    modin_df = pd.DataFrame(result)
    df_equals(
        modin_df.fillna(method="backfill", limit=2),
        result.fillna(method="backfill", limit=2),
    )


def test_fillna_dtype_conversion():
    """Compare ``fillna`` with values of a dtype different from the frame's."""
    # make sure that fillna on an empty frame works
    df = pandas.DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
    modin_df = pd.DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
    df_equals(modin_df.fillna("nan"), df.fillna("nan"))

    frame_data = {"A": [1, np.nan], "B": [1.0, 2.0]}
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    for v in ["", 1, np.nan, 1.0]:
        df_equals(modin_df.fillna(v), df.fillna(v))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_fillna_skip_certain_blocks(data):
    """Compare ``fillna(np.nan)``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # don't try to fill boolean, int blocks
    df_equals(modin_df.fillna(np.nan), pandas_df.fillna(np.nan))


def test_fillna_dict_series():
    """Compare ``fillna`` with dict values and with a Series value."""
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5}))

    df_equals(
        modin_df.fillna({"a": 0, "b": 5, "d": 7}),
        df.fillna({"a": 0, "b": 5, "d": 7}),
    )

    # Series treated same as dict
    df_equals(modin_df.fillna(modin_df.max()), df.fillna(df.max()))


def test_fillna_dataframe():
    """Compare ``fillna`` with a frame whose index and columns only partly overlap."""
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    df = pandas.DataFrame(frame_data, index=list("VWXYZ"))
    modin_df = pd.DataFrame(frame_data, index=list("VWXYZ"))

    # df2 may have different index and columns
    df2 = pandas.DataFrame(
        {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5},
        index=list("VWXuZ"),
    )
    modin_df2 = pd.DataFrame(df2)

    # only those columns and indices which are shared get filled
    df_equals(modin_df.fillna(modin_df2), df.fillna(df2))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_fillna_columns(data):
    """Compare ``fillna(method="ffill", axis=1)``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    df_equals(
        modin_df.fillna(method="ffill", axis=1),
        pandas_df.fillna(method="ffill", axis=1),
    )

    # NOTE(review): this assertion is identical to the one above; it was
    # possibly meant to cover a different method (e.g. "bfill") - confirm
    # before changing it.
    df_equals(
        modin_df.fillna(method="ffill", axis=1),
        pandas_df.fillna(method="ffill", axis=1),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_fillna_invalid_method(data):
    """Check that a misspelled ``method`` raises ``ValueError``."""
    modin_df = pd.DataFrame(data)

    with pytest.raises(ValueError):
        modin_df.fillna(method="ffil")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_fillna_invalid_value(data):
    """Check that list, tuple and frame-into-series values raise ``TypeError``."""
    modin_df = pd.DataFrame(data)
    # list
    pytest.raises(TypeError, modin_df.fillna, [1, 2])
    # tuple
    pytest.raises(TypeError, modin_df.fillna, (1, 2))
    # frame with series
    pytest.raises(TypeError, modin_df.iloc[:, 0].fillna, modin_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_fillna_col_reordering(data):
    """Compare ``fillna(method="ffill")``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.fillna(method="ffill"), pandas_df.fillna(method="ffill"))


def test_fillna_datetime_columns():
    """Compare ``fillna("?")`` on frames that contain datetime columns."""
    frame_data = {
        "A": [-1, -2, np.nan],
        "B": pd.date_range("20130101", periods=3),
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    df_equals(modin_df.fillna("?"), df.fillna("?"))

    frame_data = {
        "A": [-1, -2, np.nan],
        "B": [
            pandas.Timestamp("2013-01-01"),
            pandas.Timestamp("2013-01-02"),
            pandas.NaT,
        ],
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    df_equals(modin_df.fillna("?"), df.fillna("?"))


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("method", ["median", "skew"])
def test_median_skew(axis, skipna, method):
    """Compare ``median``/``skew`` on the float-with-NaN data."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: getattr(df, method)(axis=axis, skipna=skipna),
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("method", ["median", "skew"])
def test_median_skew_transposed(axis, method):
    """Compare ``median``/``skew`` on transposed integer frames."""
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: getattr(df.T, method)(axis=axis),
    )


@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "sem"])
def test_median_skew_std_var_sem_1953(method):
    """Compare the listed reductions on a frame with a two-level index (#1953)."""
    # See #1953 for details
    arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
    data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
    modin_df = pd.DataFrame(data, index=arrays)
    pandas_df = pandas.DataFrame(data, index=arrays)

    eval_general(modin_df, pandas_df, lambda df: getattr(df, method)())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize("numeric_only", [False, True])
def test_mode(data, axis, numeric_only):
    """Compare ``mode`` results or the type of the raised exception."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    try:
        pandas_result = pandas_df.mode(axis=axis, numeric_only=numeric_only)
    except Exception as err:
        # Expect the same exception type pandas raised, matching the other
        # pandas-vs-modin error checks in this module.
        with pytest.raises(type(err)):
            modin_df.mode(axis=axis, numeric_only=numeric_only)
    else:
        modin_result = modin_df.mode(axis=axis, numeric_only=numeric_only)
        df_equals(modin_result, pandas_result)


def _population_test_dfs():
    """
    Build the modin/pandas frame pair shared by the nlargest/nsmallest tests.

    Returns
    -------
    tuple
        ``(modin_df, pandas_df)`` built from the same data and index.
    """
    data = {
        "population": [
            59000000,
            65000000,
            434000,
            434000,
            434000,
            337000,
            11300,
            11300,
            11300,
        ],
        "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
        "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
    }
    index = [
        "Italy",
        "France",
        "Malta",
        "Maldives",
        "Brunei",
        "Iceland",
        "Nauru",
        "Tuvalu",
        "Anguilla",
    ]
    modin_df = pd.DataFrame(data=data, index=index)
    pandas_df = pandas.DataFrame(data=data, index=index)
    return modin_df, pandas_df


def test_nlargest():
    """Compare ``nlargest(3, "population")``."""
    modin_df, pandas_df = _population_test_dfs()
    df_equals(modin_df.nlargest(3, "population"), pandas_df.nlargest(3, "population"))


def test_nsmallest():
    """Compare ``nsmallest`` for one column and for two columns with ``keep="all"``."""
    modin_df, pandas_df = _population_test_dfs()
    df_equals(
        modin_df.nsmallest(n=3, columns="population"),
        pandas_df.nsmallest(n=3, columns="population"),
    )
    df_equals(
        modin_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"),
        pandas_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize(
    "dropna", bool_arg_values, ids=arg_keys("dropna", bool_arg_keys)
)
def test_nunique(data, axis, dropna):
    """Compare ``nunique`` on the frame and on its transpose."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_result = modin_df.nunique(axis=axis, dropna=dropna)
    pandas_result = pandas_df.nunique(axis=axis, dropna=dropna)
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.T.nunique(axis=axis, dropna=dropna)
    pandas_result = pandas_df.T.nunique(axis=axis, dropna=dropna)
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys)
def test_quantile(request, data, q):
    """Compare ``quantile`` on the frame and on its transpose."""

    def check_quantile(modin_frame, pandas_frame):
        # Test cases whose name matches ``no_numeric_dfs`` are expected to raise.
        if name_contains(request.node.name, no_numeric_dfs):
            with pytest.raises(ValueError):
                modin_frame.quantile(q)
            return

        df_equals(modin_frame.quantile(q), pandas_frame.quantile(q))
        df_equals(modin_frame.quantile(q, axis=1), pandas_frame.quantile(q, axis=1))

        try:
            pandas_result = pandas_frame.quantile(q, axis=1, numeric_only=False)
        except Exception as err:
            with pytest.raises(type(err)):
                modin_frame.quantile(q, axis=1, numeric_only=False)
        else:
            modin_result = modin_frame.quantile(q, axis=1, numeric_only=False)
            df_equals(modin_result, pandas_result)

    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    check_quantile(modin_df, pandas_df)
    check_quantile(modin_df.T, pandas_df.T)


def test_quantile_7157():
    """Compare ``quantile(numeric_only=True)`` on mixed string/float columns."""
    # for details: https://github.com/modin-project/modin/issues/7157
    n_rows = 100
    n_fcols = 10
    n_mcols = 5

    df1_md, df1_pd = create_test_dfs(
        random_state.rand(n_rows, n_fcols),
        columns=[f"feat_{i}" for i in range(n_fcols)],
    )
    # NOTE(review): every iteration writes the same "test_string1" key, so this
    # dict ends up with a single column regardless of ``n_mcols`` - confirm
    # whether distinct column names were intended.
    df2_md, df2_pd = create_test_dfs(
        {
            "test_string1": ["test_string2" for _ in range(n_rows)]
            for _ in range(n_mcols)
        }
    )
    df3_md = pd.concat([df2_md, df1_md], axis=1)
    df3_pd = pandas.concat([df2_pd, df1_pd], axis=1)

    eval_general(df3_md, df3_pd, lambda df: df.quantile(0.25, numeric_only=True))
    eval_general(df3_md, df3_pd, lambda df: df.quantile((0.25,), numeric_only=True))
    eval_general(
        df3_md, df3_pd, lambda df: df.quantile((0.25, 0.75), numeric_only=True)
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize(
    "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"]
)
def test_rank_transposed(axis, na_option):
    """Compare ``rank`` with string axis names and each ``na_option``."""
    # NOTE(review): despite the name, the frame is not transposed here, unlike
    # the other ``*_transposed`` tests in this module - confirm intent.
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.rank(axis=axis, na_option=na_option),
    )


@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna, ddof):
    """Compare ``sem`` on the float-with-NaN data."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: df.sem(skipna=skipna, ddof=ddof),
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_int_only(axis, ddof):
    """Compare ``sem`` on the integer data with string axis names."""
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.sem(axis=axis, ddof=ddof),
    )


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("method", ["std", "var"])
def test_std_var(axis, skipna, method):
    """Compare ``std``/``var`` on the float-with-NaN data."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: getattr(df, method)(axis=axis, skipna=skipna),
    )
@pytest.mark.parametrize("axis", [0, 1, None])
def test_rank(axis):
    """Check ``rank`` per axis; ``axis=None`` is expected to raise ``ValueError``."""
    expected_exception = (
        ValueError("No axis named None for object type DataFrame")
        if axis is None
        else None
    )

    def rank_along_axis(df):
        return df.rank(axis=axis)

    modin_df, pandas_df = create_test_dfs(test_data["float_nan_data"])
    eval_general(
        modin_df,
        pandas_df,
        rank_along_axis,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
@pytest.mark.parametrize("method", ["std", "var"])
def test_std_var_transposed(axis, ddof, method):
    """Check ``std``/``var`` when they are called on the transposed integer frame."""

    def reduce_transposed(df):
        reducer = getattr(df.T, method)
        return reducer(axis=axis, ddof=ddof)

    frames = create_test_dfs(test_data["int_data"])
    eval_general(*frames, reduce_transposed)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_values(data):
    """Check that ``.values`` of a Modin frame equals that of the pandas frame."""
    actual = pd.DataFrame(data).values
    expected = pandas.DataFrame(data).values
    np.testing.assert_equal(actual, expected)


================================================
FILE: modin/tests/pandas/extensions/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
================================================
FILE: modin/tests/pandas/extensions/conftest.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.config import Backend, Engine, Execution, StorageFormat
from modin.core.execution.dispatching.factories import factories
from modin.core.execution.dispatching.factories.factories import BaseFactory, NativeIO
from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler
from modin.pandas.api.extensions.extensions import _NON_EXTENDABLE_ATTRIBUTES


class Test1QueryCompiler(NativeQueryCompiler):
    """Native query compiler that reports the test-only storage format and engine."""

    storage_format = property(lambda self: "Test1_Storage_Format")
    engine = property(lambda self: "Test1_Engine")


class Test1IO(NativeIO):
    """IO class whose query compiler class is ``Test1QueryCompiler``."""

    query_compiler_cls = Test1QueryCompiler


class Test1Factory(BaseFactory):
    """Factory that uses ``Test1IO`` as its IO class."""

    @classmethod
    def prepare(cls):
        """Set ``io_cls`` to ``Test1IO``."""
        cls.io_cls = Test1IO


@pytest.fixture
def Backend1():
    """
    Make the test-only backend "Backend1" available and return its name.

    The factory is attached to the ``factories`` module on every use; the config
    options and the backend itself are only registered when "Backend1" is not
    already among ``Backend.choices``.
    """
    factories.Test1_Storage_FormatOnTest1_EngineFactory = Test1Factory
    if "Backend1" not in Backend.choices:
        StorageFormat.add_option("Test1_storage_format")
        Engine.add_option("Test1_engine")
        Backend.register_backend(
            "Backend1",
            Execution(storage_format="Test1_Storage_Format", engine="Test1_Engine"),
        )
    return "Backend1"


@pytest.fixture(
    # sort the set of non-extendable attributes to make the sequence of test
    # cases deterministic for pytest-xdist.
    params=sorted(_NON_EXTENDABLE_ATTRIBUTES),
)
def non_extendable_attribute_name(request) -> str:
    """Yield each name from ``_NON_EXTENDABLE_ATTRIBUTES`` as a separate test case."""
    return request.param


================================================
FILE: modin/tests/pandas/extensions/test_api_reexport.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.


import pandas

import modin.pandas as pd


def test_extensions_does_not_overwrite_pandas_api():
    """Check that ``modin.pandas.api`` still mirrors ``pandas.api`` after the import."""
    # Ensure that importing modin.pandas.api.extensions does not overwrite our re-export
    # of pandas.api submodules.
    import modin.pandas.api.extensions as ext

    # Top-level submodules should remain the same
    assert set(pd.api.__all__) == set(pandas.api.__all__)

    # Methods we define, like ext.register_dataframe_accessor should be different
    assert (
        ext.register_dataframe_accessor
        is not pandas.api.extensions.register_dataframe_accessor
    )

    # Methods from other submodules, like pd.api.types.is_bool_dtype, should be the same
    assert pd.api.types.is_bool_dtype is pandas.api.types.is_bool_dtype


================================================
FILE: modin/tests/pandas/extensions/test_base_extensions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import re

import pytest

import modin.pandas as pd
from modin.pandas.api.extensions import register_base_accessor
from modin.tests.pandas.utils import df_equals


@pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series])
class TestOverrideMethodForOneBackend:
    """Extensions registered with ``backend=Backend1``, for DataFrame and Series."""

    def test_add_simple_method(self, Backend1, data_class):
        """A new method works on Backend1 and raises AttributeError on "pandas"."""
        expected_string_val = "Some string value"
        method_name = "new_method"
        modin_object = data_class([1, 2, 3]).set_backend(Backend1)

        @register_base_accessor(name=method_name, backend=Backend1)
        def my_method_implementation(self):
            return expected_string_val

        assert hasattr(data_class, method_name)
        assert getattr(modin_object, method_name)() == expected_string_val
        with pytest.raises(
            AttributeError,
            match=re.escape(
                f"{data_class.__name__} object has no attribute {method_name}"
            ),
        ):
            getattr(modin_object.set_backend("pandas"), method_name)()

    def test_add_non_method(self, Backend1, data_class):
        """A plain value can be registered and is visible only on Backend1."""
        expected_val = 4
        attribute_name = "four"
        register_base_accessor(name=attribute_name, backend=Backend1)(expected_val)

        assert data_class().set_backend(Backend1).four == expected_val
        assert not hasattr(data_class().set_backend("pandas"), attribute_name)

    def test_method_uses_existing_methods(self, Backend1, data_class):
        """An extension method can call the object's existing methods."""
        modin_object = data_class([1, 2, 3]).set_backend(Backend1)
        method_name = "self_accessor"
        expected_result = modin_object.sum() / modin_object.count()

        @register_base_accessor(name=method_name, backend=Backend1)
        def my_average(self):
            return self.sum() / self.count()

        # DataFrame results are compared with df_equals; Series results with ==.
        if data_class is pd.DataFrame:
            df_equals(modin_object.self_accessor(), expected_result)
        else:
            assert modin_object.self_accessor() == expected_result

    def test_override_existing_method(self, Backend1, data_class):
        """Registering under the existing name ``copy`` replaces it on Backend1."""
        modin_object = data_class([3, 2, 1])

        @register_base_accessor(name="copy", backend=Backend1)
        def my_copy(self, *args, **kwargs):
            return self + 1

        df_equals(modin_object.set_backend(Backend1).copy(), modin_object + 1)


@pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series])
@pytest.mark.parametrize("backend", ["pandas", "python_test"])
class TestOverrideMethodForAllBackends:
    """Extensions registered without a backend, checked on two backends."""

    def test_add_simple_method(self, backend, data_class):
        """A new method is callable through ``getattr`` and by attribute access."""
        expected_string_val = "Some string value"
        method_name = "new_method"

        @register_base_accessor(name=method_name)
        def my_method_implementation(self):
            return expected_string_val

        modin_object = data_class([1, 2, 3]).set_backend(backend)
        assert getattr(modin_object, method_name)() == expected_string_val
        assert modin_object.new_method() == expected_string_val

    def test_add_non_method(self, data_class, backend):
        """A plain value can be registered as an attribute."""
        expected_val = 4
        attribute_name = "four"
        register_base_accessor(name=attribute_name)(expected_val)
        assert data_class().set_backend(backend).four == expected_val

    def test_method_uses_existing_methods(self, data_class, backend):
        """An extension method can call the object's existing methods."""
        modin_object = data_class([1, 2, 3]).set_backend(backend)
        method_name = "self_accessor"
        expected_result = modin_object.sum() / modin_object.count()

        @register_base_accessor(name=method_name)
        def my_average(self):
            return self.sum() / self.count()

        # DataFrame results are compared with df_equals; Series results with ==.
        if data_class is pd.DataFrame:
            df_equals(modin_object.self_accessor(), expected_result)
        else:
            assert modin_object.self_accessor() == expected_result

    def test_override_existing_method(self, data_class, backend):
        """Registering under the existing name ``copy`` replaces it."""
        modin_object = data_class([3, 2, 1])

        @register_base_accessor(name="copy")
        def my_copy(self, *args, **kwargs):
            return self + 1

        df_equals(modin_object.set_backend(backend).copy(), modin_object + 1)


class TestDunders:
    """
    Make sure to test that we override special "dunder" methods like __len__
    correctly. python calls these methods with DataFrame.__len__(obj)
    rather than getattr(obj, "__len__")().
    source: https://docs.python.org/3/reference/datamodel.html#special-lookup
    """

    @pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series])
    def test_len(self, Backend1, data_class):
        """``len()`` and ``__len__()`` both use the override, only on Backend1."""

        @register_base_accessor(name="__len__", backend=Backend1)
        def always_get_1(self):
            return 1

        modin_object = data_class([1, 2, 3])
        assert len(modin_object) == 3
        backend_object = modin_object.set_backend(Backend1)
        assert len(backend_object) == 1
        assert backend_object.__len__() == 1


@pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series])
class TestProperty:
    """Extensions that are ``property`` objects: get, set and delete behavior."""

    def test_override_loc_for_one_backend(self, Backend1, data_class):
        """``loc`` can be replaced by a property for Backend1."""
        modin_object = data_class([1, 2, 3])

        @register_base_accessor(name="loc", backend=Backend1)
        @property
        def my_loc(self):
            return self.index[0]

        assert isinstance(modin_object.set_backend(Backend1).loc, int)
        assert modin_object.set_backend(Backend1).loc == 0

    @pytest.mark.parametrize("backend", ["pandas", "python_test"])
    def test_override_loc_for_all_backends(self, backend, data_class):
        """``loc`` can be replaced by a property registered with ``backend=None``."""

        @register_base_accessor(name="loc", backend=None)
        @property
        def my_loc(self):
            return self.index[0]

        modin_object = data_class([1, 2, 3])
        assert isinstance(modin_object.set_backend(backend).loc, int)
        assert modin_object.set_backend(backend).loc == 0

    def test_add_deletable_property(self, Backend1, data_class):
        """A property with getter, setter and deleter works on Backend1."""
        # register a public property `public_property_name` that is backed by
        # a private attribute `private_property_name`.
        public_property_name = "property_name"
        private_property_name = "_property_name"

        def get_property(self):
            return getattr(self, private_property_name)

        def set_property(self, value):
            setattr(self, private_property_name, value)

        def del_property(self):
            delattr(self, private_property_name)

        register_base_accessor(name=public_property_name, backend=Backend1)(
            property(fget=get_property, fset=set_property, fdel=del_property)
        )

        modin_object = data_class({"a": [1, 2, 3], "b": [4, 5, 6]})
        assert not hasattr(modin_object, public_property_name)
        backend_object = modin_object.set_backend(Backend1)
        setattr(backend_object, public_property_name, "value")
        assert getattr(backend_object, public_property_name) == "value"
        delattr(backend_object, public_property_name)
        # check that the deletion works.
        assert not hasattr(backend_object, private_property_name)

    @pytest.mark.parametrize("backend", ["pandas", "python_test"])
    def test_add_deletable_property_for_all_backends(self, data_class, backend):
        """A property with getter, setter and deleter works without a backend."""
        # register a public property `public_property_name` that is backed by
        # a private attribute `private_property_name`.
        public_property_name = "property_name"
        private_property_name = "_property_name"

        def get_property(self):
            return getattr(self, private_property_name)

        def set_property(self, value):
            setattr(self, private_property_name, value)

        def del_property(self):
            delattr(self, private_property_name)

        register_base_accessor(name=public_property_name)(
            property(fget=get_property, fset=set_property, fdel=del_property)
        )

        modin_object = data_class({"a": [1, 2, 3], "b": [4, 5, 6]}).set_backend(backend)
        setattr(modin_object, public_property_name, "value")
        assert getattr(modin_object, public_property_name) == "value"
        delattr(modin_object, public_property_name)
        # check that the deletion works.
        assert not hasattr(modin_object, private_property_name)

    def test_get_property_that_raises_attribute_error_on_get_modin_issue_7562(
        self, data_class
    ):
        """An AttributeError raised inside a property getter reaches the caller."""

        def get_property(self):
            raise AttributeError

        register_base_accessor(name="extension_property")(property(fget=get_property))
        modin_object = data_class()
        with pytest.raises(AttributeError):
            getattr(modin_object, "extension_property")

    def test_non_settable_extension_property(self, Backend1, data_class):
        """Setting a getter-only extension property raises AttributeError."""
        modin_object = data_class([0])
        property_name = "property_name"
        register_base_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: 4))
        )

        assert not hasattr(modin_object, property_name)
        backend_object = modin_object.set_backend(Backend1)
        assert getattr(backend_object, property_name) == 4
        with pytest.raises(AttributeError):
            setattr(backend_object, property_name, "value")

    def test_delete_non_deletable_extension_property(self, Backend1, data_class):
        """Deleting a getter-only extension property raises AttributeError."""
        modin_object = data_class([0])
        property_name = "property_name"
        register_base_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: "value"))
        )

        assert not hasattr(modin_object, property_name)
        backend_object = modin_object.set_backend(Backend1)
        assert hasattr(backend_object, property_name)
        with pytest.raises(AttributeError):
            delattr(backend_object, property_name)


@pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series])
def test_deleting_extension_that_is_not_property_raises_attribute_error(
    Backend1, data_class
):
    """``delattr`` on an extension that is a plain method raises AttributeError."""
    expected_string_val = "Some string value"
    method_name = "new_method"

    @register_base_accessor(name=method_name, backend=Backend1)
    def my_method_implementation(self):
        return expected_string_val

    modin_object = data_class([0]).set_backend(Backend1)
    assert hasattr(data_class, method_name)
    with pytest.raises(AttributeError):
        delattr(modin_object, method_name)


def test_disallowed_extensions(Backend1, non_extendable_attribute_name):
    """Registering under a reserved name raises ValueError with a fixed message."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            f"Cannot register an extension with the reserved name {non_extendable_attribute_name}."
        ),
    ):
        register_base_accessor(name=non_extendable_attribute_name, backend=Backend1)(
            "unused_value"
        )


================================================
FILE: modin/tests/pandas/extensions/test_dataframe_extensions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import re
from unittest import mock

import pandas
import pytest

import modin.pandas as pd
from modin.config import AutoSwitchBackend, Backend
from modin.config import context as config_context
from modin.pandas.api.extensions import register_dataframe_accessor

# The `__init__` stored under the `None` backend key, captured at import time so
# that the mocks in TestOverride__init__ can wrap it.
default___init__ = pd.DataFrame._extensions[None]["__init__"]


def test_dataframe_extension_simple_method(Backend1):
    """A new method registered for Backend1 is callable on a Backend1 frame."""
    expected_string_val = "Some string value"
    method_name = "new_method"
    df = pd.DataFrame([1, 2, 3]).set_backend(Backend1)

    @register_dataframe_accessor(name=method_name, backend=Backend1)
    def my_method_implementation(self):
        return expected_string_val

    assert hasattr(pd.DataFrame, method_name)
    assert df.new_method() == expected_string_val


def test_dataframe_extension_non_method(Backend1):
    """A plain value can be registered as a DataFrame attribute."""
    expected_val = 4
    attribute_name = "four"
    register_dataframe_accessor(name=attribute_name, backend=Backend1)(expected_val)
    df = pd.DataFrame([1, 2, 3]).set_backend(Backend1)

    assert df.four == expected_val


def test_dataframe_extension_accessing_existing_methods(Backend1):
    """An extension method can call the frame's existing methods."""
    df = pd.DataFrame([1, 2, 3]).set_backend(Backend1)
    method_name = "self_accessor"
    expected_result = df.sum() / df.count()

    @register_dataframe_accessor(name=method_name, backend=Backend1)
    def my_average(self):
        return self.sum() / self.count()

    assert df.self_accessor().equals(expected_result)


def test_dataframe_extension_overrides_existing_method(Backend1):
    """An extension named ``sort_values`` replaces the built-in one on Backend1."""
    df = pd.DataFrame([3, 2, 1])
    assert df.sort_values(0).iloc[0, 0] == 1

    @register_dataframe_accessor(name="sort_values", backend=Backend1)
    def my_sort_values(self):
        return self

    # The override returns the frame unsorted, so the first value stays 3.
    assert df.set_backend(Backend1).sort_values().iloc[0, 0] == 3


@pytest.mark.parametrize(
    "method_name",
    [
        "pow",
        "__pow__",
        "__ipow__",
    ],
)
def test_dataframe_extension_overrides_pow_github_issue_7495(method_name):
    """Each of ``pow``, ``__pow__`` and ``__ipow__`` can be overridden."""
    register_dataframe_accessor(method_name, backend="Pandas")(
        lambda *args, **kwargs: 4
    )
    assert getattr(pd.DataFrame([1]).set_backend("Pandas"), method_name)() == 4


def test_override_pow_and__pow__to_different_implementations():
    """``pow`` and ``__pow__`` keep separate overrides."""
    register_dataframe_accessor("pow", backend="Pandas")(
        lambda *args, **kwargs: "pow_result"
    )
    register_dataframe_accessor("__pow__", backend="Pandas")(
        lambda *args, **kwargs: "__pow___result"
    )
    df = pd.DataFrame([1]).set_backend("pandas")
    assert df.pow() == "pow_result"
    assert df.__pow__() == "__pow___result"


def test_dataframe_extension_method_uses_superclass_method(Backend1):
    """An override can delegate to the superclass ``sort_values`` via ``super``."""
    df = pd.DataFrame([3, 2, 1])
    assert df.sort_values(0).iloc[0, 0] == 1

    @register_dataframe_accessor(name="sort_values", backend=Backend1)
    def my_sort_values(self, by):
        return super(pd.DataFrame, self).sort_values(by=by, ascending=False)

    assert df.set_backend(Backend1).sort_values(by=0).iloc[0, 0] == 3


class TestOverride__init__:
    """Override ``__init__`` with a mock wrapping the default and check its calls."""

    def test_override_one_backend_and_pass_no_query_compilers(self):
        """Construct from data on the default backend and check both init calls."""
        default_backend = Backend.get()
        backend_init = mock.Mock(wraps=default___init__)
        register_dataframe_accessor(name="__init__", backend=default_backend)(
            backend_init
        )

        output_df = pd.DataFrame([1], index=["a"], columns=["b"])
        assert output_df.get_backend() == default_backend
        backend_init.assert_has_calls(
            [
                mock.call(output_df, [1], index=["a"], columns=["b"]),
                # There's a second, internal call to the dataframe constructor that
                # uses a different dataframe as `self`.
                mock.call(mock.ANY, query_compiler=output_df._query_compiler),
            ]
        )

    def test_override_one_backend_and_pass_query_compiler_kwarg(self):
        """Construct from ``query_compiler=`` and expect exactly one init call."""
        backend = "Pandas"
        backend_init = mock.Mock(wraps=default___init__)
        register_dataframe_accessor(name="__init__", backend=backend)(backend_init)

        with config_context(Backend=backend):
            input_df = pd.DataFrame()
        # Forget the calls made while building input_df.
        backend_init.reset_mock()
        output_df = pd.DataFrame(query_compiler=input_df._query_compiler)
        assert output_df.get_backend() == backend
        backend_init.assert_called_once_with(
            output_df, query_compiler=input_df._query_compiler
        )

    @pytest.mark.parametrize("input_backend", ["Python_Test", "Pandas"])
    def test_override_all_backends_and_pass_query_compiler_kwarg(self, input_backend):
        """Same as above with the override registered without a backend."""
        backend_init = mock.Mock(wraps=default___init__)
        register_dataframe_accessor(name="__init__")(backend_init)

        with config_context(Backend=input_backend):
            input_df = pd.DataFrame()
        # Forget the calls made while building input_df.
        backend_init.reset_mock()
        output_df = pd.DataFrame(query_compiler=input_df._query_compiler)
        assert output_df.get_backend() == input_backend
        backend_init.assert_called_once_with(
            output_df, query_compiler=input_df._query_compiler
        )


class TestDunders:
    """
    Make sure to test that we override special "dunder" methods like __len__
    correctly. python calls these methods with DataFrame.__len__(obj)
    rather than getattr(obj, "__len__")().
    source: https://docs.python.org/3/reference/datamodel.html#special-lookup
    """

    def test_len(self, Backend1):
        """``len()`` and ``__len__()`` both use the override, only on Backend1."""

        @register_dataframe_accessor(name="__len__", backend=Backend1)
        def always_get_1(self):
            return 1

        df = pd.DataFrame([1, 2, 3])
        assert len(df) == 3
        backend_df = df.set_backend(Backend1)
        assert len(backend_df) == 1
        assert backend_df.__len__() == 1

    def test_repr(self, Backend1):
        """``repr()`` and ``__repr__()`` both use the override, only on Backend1."""

        @register_dataframe_accessor(name="__repr__", backend=Backend1)
        def simple_repr(self) -> str:
            return "dataframe_string"

        df = pd.DataFrame([1, 2, 3])
        assert repr(df) == repr(df.modin.to_pandas())
        backend_df = df.set_backend(Backend1)
        assert repr(backend_df) == "dataframe_string"
        assert backend_df.__repr__() == "dataframe_string"


class TestProperty:
    """Extensions that are ``property`` objects: get, set and delete behavior."""

    def test_override_columns(self, Backend1):
        """``columns`` getter and setter can both be overridden for Backend1."""
        df = pd.DataFrame([["a", "b"]])

        def set_columns(self, new_columns):
            self._query_compiler.columns = [f"{v}_custom" for v in new_columns]

        register_dataframe_accessor(name="columns", backend=Backend1)(
            property(
                fget=(lambda self: self._query_compiler.columns[::-1]), fset=set_columns
            )
        )

        assert list(df.columns) == [0, 1]
        backend_df = df.set_backend(Backend1)
        # The overridden getter returns the columns reversed.
        assert list(backend_df.columns) == [1, 0]
        backend_df.columns = [2, 3]
        # The setter appends "_custom"; the getter then reverses the result.
        assert list(backend_df.columns) == [
            "3_custom",
            "2_custom",
        ]

    def test_search_for_missing_attribute_in_overridden_columns(self, Backend1):
        """
        Test a scenario where we override the columns getter, then search for a
        missing dataframe attribute. Modin should look in the dataframe's
        overridden columns for the attribute.
        """
        column_name = "column_name"
        column_getter = mock.Mock(wraps=(lambda self: self._query_compiler.columns))
        register_dataframe_accessor(name="columns", backend=Backend1)(
            property(fget=column_getter)
        )

        df = pd.DataFrame({column_name: ["a"]}).set_backend(Backend1)

        with pytest.raises(
            AttributeError,
            match="'DataFrame' object has no attribute 'non_existent_column'",
        ):
            getattr(df, "non_existent_column")
        column_getter.assert_called_once_with(df)

    def test_add_deletable_property(self, Backend1):
        """A property with getter, setter and deleter works on Backend1."""
        public_property_name = "property_name"
        private_property_name = "_property_name"

        # register a public property `public_property_name` that is backed by
        # a private attribute `private_property_name`.
        def get_property(self):
            return getattr(self, private_property_name)

        def set_property(self, value):
            setattr(self, private_property_name, value)

        def del_property(self):
            delattr(self, private_property_name)

        register_dataframe_accessor(name=public_property_name, backend=Backend1)(
            property(get_property, set_property, del_property)
        )

        df = pd.DataFrame([0])
        assert not hasattr(df, public_property_name)
        backend_df = df.set_backend(Backend1)
        setattr(backend_df, public_property_name, "value")
        assert hasattr(backend_df, private_property_name)
        assert getattr(backend_df, private_property_name) == "value"
        delattr(backend_df, public_property_name)
        # check that the deletion works.
        assert not hasattr(backend_df, private_property_name)

    def test_non_settable_extension_property(self, Backend1):
        """Setting a getter-only extension property raises AttributeError."""
        df = pd.DataFrame([0])
        property_name = "property_name"
        register_dataframe_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: 4))
        )

        assert not hasattr(df, property_name)
        backend_df = df.set_backend(Backend1)
        assert getattr(backend_df, property_name) == 4
        with pytest.raises(AttributeError):
            setattr(backend_df, property_name, "value")

    def test_delete_non_deletable_extension_property(self, Backend1):
        """Deleting a getter-only extension property raises AttributeError."""
        property_name = "property_name"
        register_dataframe_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: "value"))
        )

        df = pd.DataFrame([0])
        assert not hasattr(df, property_name)
        backend_df = df.set_backend(Backend1)
        assert hasattr(backend_df, property_name)
        with pytest.raises(AttributeError):
            delattr(backend_df, property_name)


def test_deleting_extension_that_is_not_property_raises_attribute_error(Backend1):
    """``delattr`` on an extension that is a plain method raises AttributeError."""
    expected_string_val = "Some string value"
    method_name = "new_method"

    @register_dataframe_accessor(name=method_name, backend=Backend1)
    def my_method_implementation(self):
        return expected_string_val

    df = pd.DataFrame([1, 2, 3]).set_backend(Backend1)
    assert hasattr(pd.DataFrame, method_name)
    assert df.new_method() == expected_string_val
    with pytest.raises(AttributeError):
        delattr(df, method_name)


def test_disallowed_extensions(Backend1, non_extendable_attribute_name):
    """Registering under a reserved name raises ValueError with a fixed message."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            f"Cannot register an extension with the reserved name {non_extendable_attribute_name}."
        ),
    ):
        register_dataframe_accessor(
            name=non_extendable_attribute_name, backend=Backend1
        )("unused_value")


def test_correct_backend_with_pin(Backend1):
    # Ensures that the correct implementation is used when dispatching an operation on a pinned
    # frame, as an earlier implementation used the wrong extension method while preserving the
    # correct backend.
    assert not AutoSwitchBackend.get()

    @register_dataframe_accessor(name="__repr__", backend=Backend1)
    def my_repr(self):
        return "fake_repr"

    with config_context(Backend="Python_Test"):
        df = pd.DataFrame([1])
        assert df.get_backend() == "Python_Test"
        assert repr(df) == repr(pandas.DataFrame([1]))
        df.set_backend(Backend1, inplace=True)
        df.pin_backend(inplace=True)
        assert df.get_backend() == Backend1
        assert repr(df) == "fake_repr"


def test_get_extension_from_dataframe_that_is_on_non_default_backend_when_auto_switch_is_false(
    Backend1,
):
    """A "Pandas" extension is used for a frame moved there while Backend1 is default."""
    assert not AutoSwitchBackend.get()
    with config_context(Backend=Backend1):
        pandas_df = pd.DataFrame([1, 2]).move_to("Pandas")
        register_dataframe_accessor("sum", backend="Pandas")(
            lambda df: "small_sum_result"
        )
        assert pandas_df.sum() == "small_sum_result"


================================================
FILE: modin/tests/pandas/extensions/test_groupby_extensions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from functools import cached_property

import pytest

import modin.pandas as pd
from modin.config import AutoSwitchBackend, Backend
from modin.config import context as config_context
from modin.pandas.api.extensions import (
    register_dataframe_groupby_accessor,
    register_series_groupby_accessor,
)
from modin.pandas.groupby import DataFrameGroupBy, SeriesGroupBy
from modin.tests.pandas.utils import default_to_pandas_ignore_string, df_equals
from modin.tests.test_utils import (
    current_execution_is_native,
    warns_that_defaulting_to_pandas_if,
)


@pytest.mark.parametrize(
    "get_groupby,register_accessor",
    (
        (lambda df: df.groupby("col0"), register_dataframe_groupby_accessor),
        (lambda df: df.groupby("col0")["col1"], register_series_groupby_accessor),
    ),
)
@config_context(Backend="Pandas")
@pytest.mark.parametrize("extension_backend", [None, "Pandas"])
@pytest.mark.parametrize("method_name", ["new_method", "sum"])
def test_add_simple_method(
    get_groupby, register_accessor, extension_backend, method_name
):
    """A method registered under a new or an existing name is used by the groupby."""
    expected_string_val = "expected_string_val"
    df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]})

    @register_accessor(method_name, backend=extension_backend)
    def new_method(self):
        return expected_string_val

    groupby = get_groupby(df)
    assert hasattr(groupby, method_name)
    assert getattr(groupby, method_name)() == expected_string_val


def test_dataframe_accessor_for_method_that_series_groupby_does_not_override():
    """
    Test sum(), a DataFrameGroupBy method that SeriesGroupBy inherits without
    overriding. Registering an extension method for DataFrameGroupBy should
    override sum() behavior for both DataFrameGroupBy and SeriesGroupBy.
    """
    # Check that SeriesGroupBy inherits sum() from DataFrameGroupBy, with the only
    # difference being that SeriesGroupBy's sum() is wrapped in a method for handling
    # extensions and casting.
    assert DataFrameGroupBy.sum is SeriesGroupBy.sum._wrapped_method_for_casting
    df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]})
    accessor_result = "test_result"
    register_dataframe_groupby_accessor("sum", backend=Backend.get())(
        lambda self, *args, **kwargs: accessor_result
    )
    groupby_sum_result = df.groupby("col0").sum()
    assert groupby_sum_result == accessor_result
    series_groupby_sum_result = df.groupby("col0")["col1"].sum()
    assert series_groupby_sum_result == accessor_result


@pytest.mark.filterwarnings(default_to_pandas_ignore_string)
def test_dataframe_accessor_for_method_that_series_groupby_overrides():
    """
    Test describe(), a DataFrameGroupBy method that SeriesGroupBy overrides.
    Registering an extension method for DataFrameGroupBy should not affect
    SeriesGroupBy's describe() method.
    """
    # Check that SeriesGroupBy overrides describe().
    assert (
        DataFrameGroupBy.describe
        is not SeriesGroupBy.describe._wrapped_method_for_casting
    )
    df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]})
    original_series_groupby_describe_result = df.groupby("col0")["col1"].describe()
    accessor_result = "test_result"
    register_dataframe_groupby_accessor("describe", backend=Backend.get())(
        lambda self, *args, **kwargs: accessor_result
    )
    groupby_describe_result = df.groupby("col0").describe()
    assert groupby_describe_result == accessor_result
    series_groupby_describe_result = df.groupby("col0")["col1"].describe()
    df_equals(series_groupby_describe_result, original_series_groupby_describe_result)


@pytest.mark.parametrize(
    "get_groupby,register_accessor",
    (
        (lambda df: df.groupby("col0"), register_dataframe_groupby_accessor),
        (lambda df: df.groupby("col0")["col1"], register_series_groupby_accessor),
    ),
)
class TestProperty:
    """Property extensions on DataFrameGroupBy and SeriesGroupBy objects."""

    @pytest.mark.parametrize("df_backend", ["Pandas", "Python_Test"])
    def test_add_read_only_property_for_all_backends(
        self, df_backend, get_groupby, register_accessor
    ):
        """A getter-only property can be read but neither deleted nor set."""
        expected_string_val = "expected_string_val"
        property_name = "new_property"

        # Use the parametrized registration function and groupby getter so that
        # the SeriesGroupBy case exercises its own accessor; the original always
        # registered on DataFrameGroupBy and ignored `register_accessor`.
        @register_accessor(property_name)
        @property
        def new_property(self):
            return expected_string_val

        with config_context(Backend=df_backend):
            df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]})
            assert get_groupby(df).new_property == expected_string_val

            with pytest.raises(AttributeError):
                del get_groupby(df).new_property

            with pytest.raises(AttributeError):
                get_groupby(df).new_property = "new_value"

    def test_override_ngroups_getter_for_one_backend(
        self, get_groupby, register_accessor
    ):
        """An ``ngroups`` override for "Pandas" is not used on "Python_Test"."""
        accessor_ngroups = -1
        property_name = "ngroups"

        @register_accessor(property_name, backend="Pandas")
        @property
        def ngroups(self):
            return accessor_ngroups

        pandas_df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]}).move_to(
            "pandas"
        )
        groupby = get_groupby(pandas_df)
        assert groupby.ngroups == accessor_ngroups

        # Check that the accessor doesn't work on the Python_Test backend.
        python_test_df = pandas_df.move_to("Python_Test")
        groupby = get_groupby(python_test_df)
        with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
            assert groupby.ngroups == 3

    def test_add_ngroups_setter_and_deleter_for_one_backend(
        self, get_groupby, register_accessor
    ):
        """``ngroups`` becomes settable and deletable on "Pandas" only."""

        def _get_ngroups(self):
            return self._ngroups

        def _delete_ngroups(self):
            delattr(self, "_ngroups")

        def _set_ngroups(self, value):
            self._ngroups = value

        register_accessor("ngroups", backend="Pandas")(
            property(fget=_get_ngroups, fset=_set_ngroups, fdel=_delete_ngroups)
        )

        python_test_df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]}).move_to(
            "python_test"
        )
        python_test_groupby = get_groupby(python_test_df)

        with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
            assert python_test_groupby.ngroups == 3

        with pytest.raises(AttributeError):
            python_test_groupby.ngroups = 4

        with pytest.raises(AttributeError):
            del python_test_groupby.ngroups

        pandas_groupby = get_groupby(python_test_df.move_to("Pandas"))

        # The getter reads `_ngroups`, which has not been set yet.
        assert not hasattr(pandas_groupby, "ngroups")

        pandas_groupby.ngroups = -1
        assert pandas_groupby.ngroups == -1

        # Deleting ngroups should delete the private attribute _ngroups.
        del pandas_groupby.ngroups

        # now getting ngroups should raise an AttributeError because the
        # private attribute _ngroups is missing.
        assert not hasattr(pandas_groupby, "ngroups")

    def test_add_deletable_property_for_one_backend(
        self, get_groupby, register_accessor
    ):
        """A property with getter, setter and deleter works on "Pandas" only."""
        public_property_name = "property_name"
        private_property_name = "_property_name"

        # register a public property `public_property_name` that is backed by
        # a private attribute `private_property_name`.
        def get_property(self):
            return getattr(self, private_property_name)

        def set_property(self, value):
            setattr(self, private_property_name, value)

        def del_property(self):
            # Note that deleting the public property deletes the private
            # attribute, not the public property itself.
            delattr(self, private_property_name)

        register_accessor(name=public_property_name, backend="Pandas")(
            property(get_property, set_property, del_property)
        )

        python_test_df = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]}).move_to(
            "python_test"
        )
        python_test_groupby = get_groupby(python_test_df)
        assert not hasattr(python_test_groupby, public_property_name)

        pandas_df = python_test_df.move_to("pandas")
        pandas_groupby = get_groupby(pandas_df)
        setattr(pandas_groupby, public_property_name, "value")
        assert getattr(pandas_groupby, public_property_name) == "value"
        delattr(pandas_groupby, public_property_name)
        assert not hasattr(pandas_groupby, private_property_name)

    @pytest.mark.filterwarnings(default_to_pandas_ignore_string)
    def test_override_cached_property(self, get_groupby, register_accessor):
        """A ``cached_property`` can be registered in place of ``groups``."""

        @cached_property
        def groups(self):
            return {"group": pd.Index(["test"])}

        register_accessor("groups", backend="Pandas")(groups)
        pandas_df = pd.DataFrame({"col0": [1], "col1": [2]}).move_to("pandas")
        assert get_groupby(pandas_df).groups == {"group": pd.Index(["test"])}


def test_deleting_extension_that_is_not_property_raises_attribute_error(): expected_string_val =
"Some string value" method_name = "new_method" @register_dataframe_groupby_accessor(name=method_name) def my_method_implementation(self): return expected_string_val groupby = pd.DataFrame({"col0": [1, 2, 3], "col1": [4, 5, 6]}).groupby("col0") assert hasattr(DataFrameGroupBy, method_name) assert getattr(groupby, method_name)() == expected_string_val with pytest.raises(AttributeError): delattr(groupby, method_name) @pytest.mark.skipif(Backend.get() == "Pandas", reason="already on pandas backend") def test_get_extension_from_dataframe_that_is_on_non_default_backend_when_auto_switch_is_false(): assert not AutoSwitchBackend.get() pandas_df = pd.DataFrame([1, 2]).move_to("Pandas") register_dataframe_groupby_accessor("sum", backend="Pandas")( lambda df: "small_sum_result" ) assert pandas_df.groupby(0).sum() == "small_sum_result" ================================================ FILE: modin/tests/pandas/extensions/test_pd_extensions.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import re
from types import FunctionType

import pandas
import pytest

import modin.pandas as pd
from modin.config import context as config_context
from modin.pandas.api.extensions import register_pd_accessor
from modin.tests.pandas.utils import df_equals, eval_general


# Parametrized over the names of every plain function in the `modin.pandas`
# namespace whose `__module__` is `modin.pandas.general`.
@pytest.fixture(
    params=sorted(
        key
        for key, value in pd.__dict__.items()
        if isinstance(value, FunctionType)
        and value.__module__ == pd.general.__name__
    )
)
def pd_general_function(request):
    """Yield the name of one `modin.pandas.general` function per test run."""
    return request.param


# Same as above, but for functions whose `__module__` is `modin.pandas.io`.
@pytest.fixture(
    params=sorted(
        key
        for key, value in pd.__dict__.items()
        if isinstance(value, FunctionType) and value.__module__ == pd.io.__name__
    )
)
def pd_io_function(request):
    """Yield the name of one `modin.pandas.io` function per test run."""
    return request.param


class TestRegisterForAllBackends:
    """Extensions registered without a `backend` argument."""

    def test_add_new_function(self):
        """A newly registered function is reachable as `pd.<name>`."""
        expected_string_val = "Some string value"
        method_name = "new_method"

        @register_pd_accessor(method_name)
        def my_method_implementation():
            return expected_string_val

        assert pd.new_method() == expected_string_val

    def test_add_new_non_method(self):
        """A non-callable value can be registered as a module attribute."""
        expected_val = 4
        attribute_name = "four"
        register_pd_accessor(attribute_name)(expected_val)
        assert pd.four == expected_val

    def test_override_io_function(self, pd_io_function):
        """Every IO function can be replaced by an extension."""
        sentinel = object()
        register_pd_accessor(pd_io_function)(lambda: sentinel)
        assert getattr(pd, pd_io_function)() == sentinel

    def test_override_general_function(self, pd_general_function):
        """Every general function can be replaced by an extension."""
        sentinel = object()
        register_pd_accessor(pd_general_function)(lambda: sentinel)
        assert getattr(pd, pd_general_function)() == sentinel


class TestRegisterForOneBackend:
    """Extensions registered for a single, explicitly named backend."""

    def test_add_new_function(self):
        """A backend-specific function exists only while that backend is active."""
        backend = "Pandas"
        expected_string_val = "Some string value"
        method_name = "new_method"

        @register_pd_accessor(method_name, backend=backend)
        def my_method_implementation():
            return expected_string_val

        with config_context(Backend=backend):
            assert getattr(pd, method_name)() == expected_string_val
        with config_context(Backend="Python_Test"):
            with pytest.raises(
                AttributeError,
                match=re.escape(
                    f"module 'modin.pandas' has no attribute {method_name}"
                ),
            ):
                getattr(pd, method_name)()

    def test_override_function(self):
        """Overriding `to_datetime` for one backend leaves other backends untouched."""
        backend = "Pandas"
        expected_string_val = "Some string value"

        @register_pd_accessor("to_datetime", backend=backend)
        def my_method_implementation(*args, **kwargs):
            return expected_string_val

        with config_context(Backend=backend):
            # Since there are no query compiler inputs to to_datetime(), use
            # the to_datetime() implementation for Backend.get()
            assert pd.to_datetime(1) == expected_string_val
        with config_context(Backend="Python_Test"):
            # There are no query compiler inputs to to_datetime(), and
            # the current Backend.get() does not have a to_datetime() extension,
            # so fall back to the default to_datetime() implementation, which
            # should return the same result as pandas.to_datetime().
            eval_general(pd, pandas, lambda lib: lib.to_datetime(1))

    def test_add_new_non_method(self):
        """A backend-specific non-callable attribute is hidden on other backends."""
        backend = "Pandas"
        expected_val = 4
        attribute_name = "four"
        register_pd_accessor(attribute_name, backend=backend)(expected_val)
        with config_context(Backend=backend):
            assert pd.four == expected_val
        with config_context(Backend="Python_Test"):
            assert not hasattr(pd, attribute_name)

    def test_to_datetime_dispatches_to_implementation_for_input(self):
        """The `to_datetime` implementation is chosen by the backend of its input."""

        @register_pd_accessor("to_datetime", backend="Pandas")
        def pandas_to_datetime(*args, **kwargs):
            return "pandas_to_datetime_result"

        with config_context(Backend="Pandas"):
            pandas_backend_series = pd.Series(1)
        with config_context(Backend="Python_Test"):
            # NOTE: despite the `_df` suffix this is a Series.
            python_backend_df = pd.Series(1)
        assert pd.to_datetime(pandas_backend_series) == "pandas_to_datetime_result"
        df_equals(
            pd.to_datetime(python_backend_df),
            pandas.to_datetime(python_backend_df._to_pandas()),
        )

    def test_concat_with_two_different_backends(self):
        """`concat` of mixed-backend inputs uses the override of the first input."""
        with config_context(Backend="Pandas"):
            modin_on_pandas_df = pd.DataFrame({"a": [1, 2, 3]})
        with config_context(Backend="Python_Test"):
            modin_on_python_df = pd.DataFrame({"a": [4, 5, 6]})

        @register_pd_accessor("concat", backend="Pandas")
        def pandas_concat(*args, **kwargs):
            return "pandas_concat_result"

        @register_pd_accessor("concat", backend="Python_Test")
        def python_concat(*args, **kwargs):
            return "python_concat_result"

        # If the backends are different, we dispatch to the concat() override
        # for the backend of the first argument.
        assert (
            pd.concat([modin_on_pandas_df, modin_on_python_df])
            == "pandas_concat_result"
        )
        # With inplace casting we need to reset the original dataframes
        modin_on_pandas_df.move_to("Pandas", inplace=True)
        modin_on_python_df.move_to("Python_Test", inplace=True)
        assert (
            pd.concat([modin_on_python_df, modin_on_pandas_df])
            == "python_concat_result"
        )

    def test_index_class_override(self):
        """A class (here `Index`) can be overridden for a single backend."""

        class FakeIndex:
            def __init__(self, _values):
                pass

            def fake_method(self) -> str:
                return "python_fake_index"

        register_pd_accessor("Index", backend="Python_Test")(FakeIndex)
        with config_context(Backend="Pandas"):
            # Should return an actual native pandas index object
            df_equals(pd.Index([1]).to_series(), pd.Series([1], index=[1]))
        with config_context(Backend="Python_Test"):
            # Should just return a string
            assert pd.Index([1]).fake_method() == "python_fake_index"


================================================
FILE: modin/tests/pandas/extensions/test_series_extensions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
# See the License for the specific language
# governing permissions and limitations under the License.

import re
from unittest import mock

import pytest

import modin.pandas as pd
from modin.config import Backend
from modin.config import context as config_context
from modin.pandas.api.extensions import register_series_accessor

# The `__init__` registered for all backends (key `None`), captured at import
# time so the tests below can wrap it in a mock.
default___init__ = pd.Series._extensions[None]["__init__"]


# NOTE: `Backend1` and `non_extendable_attribute_name` are fixtures that are
# not defined in this module.
def test_series_extension_simple_method(Backend1):
    """A new method registered for `Backend1` is callable on a series on that backend."""
    expected_string_val = "Some string value"
    method_name = "new_method"
    ser = pd.Series([1, 2, 3]).set_backend(Backend1)

    @register_series_accessor(name=method_name, backend=Backend1)
    def my_method_implementation(self):
        return expected_string_val

    assert hasattr(pd.Series, method_name)
    assert ser.new_method() == expected_string_val


def test_series_extension_non_method(Backend1):
    """A non-callable value can be registered as a series attribute."""
    expected_val = 4
    attribute_name = "four"
    register_series_accessor(name=attribute_name, backend=Backend1)(expected_val)
    ser = pd.Series([1, 2, 3]).set_backend(Backend1)
    assert ser.four == expected_val


def test_series_extension_accessing_existing_methods(Backend1):
    """An extension method may call existing Series methods on `self`."""
    ser = pd.Series([1, 2, 3]).set_backend(Backend1)
    method_name = "self_accessor"
    expected_result = ser.sum() / ser.count()

    @register_series_accessor(name=method_name, backend=Backend1)
    def my_average(self):
        return self.sum() / self.count()

    assert ser.self_accessor() == expected_result


def test_series_extension_overrides_existing_method(Backend1):
    """An extension can replace an existing method (`sort_values`) for one backend."""
    series = pd.Series([3, 2, 1])
    assert series.sort_values().iloc[0] == 1

    @register_series_accessor(name="sort_values", backend=Backend1)
    def my_sort_values(self):
        # Identity "sort": leaves the original order [3, 2, 1] untouched.
        return self

    assert series.set_backend(Backend1).sort_values().iloc[0] == 3


def test_series_extension_method_uses_superclass_method(Backend1):
    """An extension can delegate to the superclass implementation via `super()`."""
    series = pd.Series([3, 2, 1], name="name")
    assert series.sort_values().iloc[0] == 1

    @register_series_accessor(name="sort_values", backend=Backend1)
    def my_sort_values(self):
        return super(pd.Series, self).sort_values(by="name", ascending=False)

    assert series.set_backend(Backend1).sort_values().iloc[0] == 3


class TestOverride__init__:
    """Overriding `Series.__init__` through the extension system."""

    def test_override_one_backend_and_pass_no_query_compilers(self):
        """With plain data, the `__init__` for the current default backend is used."""
        default_backend = Backend.get()
        backend_init = mock.Mock(wraps=default___init__)
        register_series_accessor(name="__init__", backend=default_backend)(backend_init)
        output_series = pd.Series([1], index=["a"])
        assert output_series.get_backend() == default_backend
        backend_init.assert_has_calls(
            [
                mock.call(output_series, [1], index=["a"]),
            ]
        )

    def test_override_one_backend_and_pass_query_compiler_kwarg(self):
        """With `query_compiler=`, the `__init__` registered for "Pandas" is called."""
        backend_init = mock.Mock(wraps=default___init__)
        register_series_accessor(name="__init__", backend="Pandas")(backend_init)
        with config_context(Backend="Pandas"):
            input_series = pd.Series()
        # Forget the call made while building `input_series`.
        backend_init.reset_mock()
        output_series = pd.Series(query_compiler=input_series._query_compiler)
        assert output_series.get_backend() == "Pandas"
        backend_init.assert_called_once_with(
            output_series, query_compiler=input_series._query_compiler
        )

    @pytest.mark.parametrize("input_backend", ["Python_Test", "Pandas"])
    def test_override_all_backends_and_pass_query_compiler_kwarg(self, input_backend):
        """An `__init__` registered for all backends is used for any input backend."""
        backend_init = mock.Mock(wraps=default___init__)
        register_series_accessor(name="__init__")(backend_init)
        with config_context(Backend=input_backend):
            input_series = pd.Series()
        # Forget the call made while building `input_series`.
        backend_init.reset_mock()
        output_series = pd.Series(query_compiler=input_series._query_compiler)
        assert output_series.get_backend() == input_backend
        backend_init.assert_called_once_with(
            output_series, query_compiler=input_series._query_compiler
        )


class TestDunders:
    """
    Make sure to test that we override special "dunder" methods like __len__
    correctly. Python looks these methods up on the type, i.e. it calls
    Series.__len__(obj) rather than getattr(obj, "__len__")().
    source: https://docs.python.org/3/reference/datamodel.html#special-lookup
    """

    def test_len(self, Backend1):
        """Both `len(obj)` and `obj.__len__()` honor the extension."""

        @register_series_accessor(name="__len__", backend=Backend1)
        def always_get_1(self):
            return 1

        series = pd.Series([1, 2, 3])
        assert len(series) == 3
        backend_series = series.set_backend(Backend1)
        assert len(backend_series) == 1
        assert backend_series.__len__() == 1

    def test_repr(self, Backend1):
        """Both `repr(obj)` and `obj.__repr__()` honor the extension."""

        @register_series_accessor(name="__repr__", backend=Backend1)
        def simple_repr(self) -> str:
            return "series_string"

        series = pd.Series([1, 2, 3])
        assert repr(series) == repr(series.modin.to_pandas())
        backend_series = series.set_backend(Backend1)
        assert repr(backend_series) == "series_string"
        assert backend_series.__repr__() == "series_string"


class TestProperty:
    """Property extensions on Series."""

    def test_override_index(self, Backend1):
        """Override the `index` property with a custom getter and setter."""
        series = pd.Series(["a", "b"])

        def set_index(self, new_index):
            self._query_compiler.index = [f"{v}_custom" for v in new_index]

        # The getter returns the underlying index reversed.
        register_series_accessor(name="index", backend=Backend1)(
            property(fget=lambda self: self._query_compiler.index[::-1], fset=set_index)
        )
        assert list(series.index) == [0, 1]
        backend_series = series.set_backend(Backend1)
        assert list(backend_series.index) == [1, 0]
        backend_series.index = [2, 3]
        # Setter appends "_custom"; getter then reverses the stored values.
        assert list(backend_series.index) == ["3_custom", "2_custom"]

    def test_add_deletable_property(self, Backend1):
        """A get/set/delete property registered for `Backend1` only."""
        # register a public property `public_property_name` that is backed by
        # a private attribute `private_property_name`.
        public_property_name = "property_name"
        private_property_name = "_property_name"

        def get_property(self):
            return getattr(self, private_property_name)

        def set_property(self, value):
            setattr(self, private_property_name, value)

        def del_property(self):
            delattr(self, private_property_name)

        register_series_accessor(name=public_property_name, backend=Backend1)(
            property(get_property, set_property, del_property)
        )
        series = pd.Series([0])
        assert not hasattr(series, public_property_name)
        backend_series = series.set_backend(Backend1)
        setattr(backend_series, public_property_name, "value")
        assert hasattr(backend_series, private_property_name)
        assert getattr(backend_series, public_property_name) == "value"
        delattr(backend_series, public_property_name)
        # check that the deletion works.
        assert not hasattr(backend_series, private_property_name)

    def test_non_settable_extension_property(self, Backend1):
        """Assigning to a getter-only extension property raises AttributeError."""
        property_name = "property_name"
        register_series_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: 4))
        )
        series = pd.Series([0])
        assert not hasattr(series, property_name)
        backend_series = series.set_backend(Backend1)
        assert getattr(backend_series, property_name) == 4
        with pytest.raises(AttributeError):
            setattr(backend_series, property_name, "value")

    def test_delete_non_deletable_extension_property(self, Backend1):
        """Deleting a getter-only extension property raises AttributeError."""
        property_name = "property_name"
        register_series_accessor(name=property_name, backend=Backend1)(
            property(fget=(lambda self: "value"))
        )
        series = pd.Series([0])
        assert not hasattr(series, property_name)
        backend_series = series.set_backend(Backend1)
        with pytest.raises(AttributeError):
            delattr(backend_series, property_name)


def test_deleting_extension_that_is_not_property_raises_attribute_error(Backend1):
    """Deleting a plain-method extension from an instance raises AttributeError."""
    expected_string_val = "Some string value"
    method_name = "new_method"
    series = pd.Series([1, 2, 3]).set_backend(Backend1)

    @register_series_accessor(name=method_name, backend=Backend1)
    def my_method_implementation(self):
        return expected_string_val

    assert hasattr(pd.Series, method_name)
    assert series.new_method() == expected_string_val
    with pytest.raises(AttributeError):
        delattr(series, method_name)


def test_disallowed_extensions(Backend1, non_extendable_attribute_name):
    """Registering an extension under a reserved name raises ValueError."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            f"Cannot register an extension with the reserved name {non_extendable_attribute_name}."
        ),
    ):
        register_series_accessor(name=non_extendable_attribute_name, backend=Backend1)(
            "unused_value"
        )


def test_wrapped_extension(Backend1):
    """
    Tests using the extensions system to overwrite a method with a wrapped version
    of the original method obtained via getattr.

    Because the QueryCompilerCaster ABC automatically wraps all methods with a
    dispatch to the appropriate backend, we must use the __wrapped__ property of
    the originally-defined attribute to avoid infinite recursion.
    """
    original_item = pd.Series.item.__wrapped__

    @register_series_accessor(name="item", backend=Backend1)
    def item_implementation(self):
        return (original_item(self) + 2) * 5

    series = pd.Series([3])
    assert series.item() == 3
    # (3 + 2) * 5 == 25
    assert series.set_backend(Backend1).item() == 25


================================================
FILE: modin/tests/pandas/integrations/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
# See the License for the specific language
# governing permissions and limitations under the License.


================================================
FILE: modin/tests/pandas/integrations/test_lazy_import.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import lazy_import

# Declare pandas and pyarrow as lazy modules *before* modin is imported; the
# statement order is the point of this test, hence the E402 suppression below.
# NOTE(review): presumably `lazy_module` defers the real import until first
# attribute access -- confirm against the lazy_import documentation.
pandas = lazy_import.lazy_module("pandas")
pyarrow = lazy_import.lazy_module("pyarrow")

from modin import pandas as pd  # noqa: E402


def test_dataframe_constructor():
    """Smoke test: building a small Modin DataFrame must not raise."""
    pd.DataFrame({"col1": [1, 2, 3], "col2": list("abc")})


================================================
FILE: modin/tests/pandas/internals/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: modin/tests/pandas/internals/test_benchmark_mode.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import unittest.mock as mock import pytest import modin.pandas as pd from modin.config import Engine engine = Engine.get() # We have to explicitly mock subclass implementations of wait_partitions. if engine == "Ray": wait_method = ( "modin.core.execution.ray.implementations." + "pandas_on_ray.partitioning." + "PandasOnRayDataframePartitionManager.wait_partitions" ) elif engine == "Dask": wait_method = ( "modin.core.execution.dask.implementations." + "pandas_on_dask.partitioning." + "PandasOnDaskDataframePartitionManager.wait_partitions" ) elif engine == "Unidist": wait_method = ( "modin.core.execution.unidist.implementations." 
+ "pandas_on_unidist.partitioning." + "PandasOnUnidistDataframePartitionManager.wait_partitions" ) else: wait_method = ( "modin.core.dataframe.pandas.partitioning." + "partition_manager.PandasDataframePartitionManager.wait_partitions" ) @pytest.mark.parametrize("set_benchmark_mode", [False], indirect=True) def test_turn_off(set_benchmark_mode): df = pd.DataFrame([0]) with mock.patch(wait_method) as wait: df.dropna() wait.assert_not_called() @pytest.mark.parametrize("set_benchmark_mode", [True], indirect=True) def test_turn_on(set_benchmark_mode): df = pd.DataFrame([0]) with mock.patch(wait_method) as wait: df.dropna() wait.assert_called() ================================================ FILE: modin/tests/pandas/native_df_interoperability/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ This module contains tests for interoperability between Modin dataframes using "native" execution and Modin dataframes using other execution modes. """ ================================================ FILE: modin/tests/pandas/native_df_interoperability/conftest.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import itertools import pytest def _get_native_bool_descriptor(v: bool) -> str: return "native" if v else "default" @pytest.fixture( params=list(itertools.product([True, False], repeat=2)), ids=lambda param: "_".join(_get_native_bool_descriptor(v) for v in param), ) def df_mode_pair(request): return request.param ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_binary.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import matplotlib
import pytest

from modin.config import NPartitions
from modin.tests.pandas.native_df_interoperability.utils import (
    create_test_df_in_defined_mode,
    eval_general_interop,
)
from modin.tests.pandas.utils import (
    default_to_pandas_ignore_string,
    df_equals,
    test_data,
    test_data_keys,
    test_data_values,
)

NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")

# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, but some test modules, like this one,
# have too many such instances.
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)


# Each `other` factory receives the second frame and the axis name and builds
# the right-hand operand; `ids` below names the four factories in order.
@pytest.mark.parametrize(
    "other",
    [
        lambda df, axis: 4,
        lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]),
        lambda df, axis: {
            label: idx + 1
            for idx, label in enumerate(df.axes[0 if axis == "rows" else 1])
        },
        lambda df, axis: {
            label if idx % 2 else f"random_key{idx}": idx + 1
            for idx, label in enumerate(df.axes[0 if axis == "rows" else 1][::-1])
        },
    ],
    ids=[
        "scalar",
        "series_or_list",
        "dictionary_keys_equal_columns",
        "dictionary_keys_unequal_columns",
    ],
)
@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize(
    "op",
    [
        *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"),
        *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"),
    ],
)
@pytest.mark.parametrize("backend", [None, "pyarrow"])
def test_math_functions(other, axis, op, backend, df_mode_pair):
    """Run the named arithmetic method through `eval_general_interop` for every operand kind."""
    data = test_data["float_nan_data"]
    if (op == "floordiv" or op == "rfloordiv") and axis == "rows":
        # lambda == "series_or_list"
        pytest.xfail(reason="different behavior")
    if op == "rmod" and axis == "rows":
        # lambda == "series_or_list"
        pytest.xfail(reason="different behavior")
    if op in ("mod", "rmod") and backend == "pyarrow":
        pytest.skip(reason="These functions are not implemented in pandas itself")
    eval_general_interop(
        data,
        backend,
        lambda df1, df2: getattr(df1, op)(other(df2, axis), axis=axis),
        df_mode_pair,
    )


@pytest.mark.parametrize("other", [lambda df: 2, lambda df: df])
def test___divmod__(other, df_mode_pair):
    """`divmod(df, other)` with a scalar and with another frame."""
    data = test_data["float_nan_data"]
    eval_general_interop(
        data, None, lambda df1, df2: divmod(df1, other(df2)), df_mode_pair
    )


@pytest.mark.parametrize("other", ["as_left", 4])
@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_comparison(data, op, other, request, df_mode_pair):
    """Comparison methods against the second frame ("as_left") or a scalar."""

    def operation(df1, df2):
        return getattr(df1, op)(df2 if other == "as_left" else other)

    expected_exception = None
    # NOTE(review): `other` is only parametrized with "as_left" and 4 here, so
    # both `other == "a"` branches below look unreachable -- confirm whether
    # "a" was meant to be part of the parametrization.
    if "int_data" in request.node.callspec.id and other == "a":
        pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019")
    elif "float_nan_data" in request.node.callspec.id and other == "a":
        expected_exception = TypeError(
            "Invalid comparison between dtype=float64 and str"
        )
    eval_general_interop(
        data,
        None,
        operation,
        df_mode_pair,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize(
    "frame1_data,frame2_data,expected_pandas_equals",
    [
        pytest.param({}, {}, True, id="two_empty_dataframes"),
        pytest.param([[1]], [[0]], False, id="single_unequal_values"),
        pytest.param([[None]], [[None]], True, id="single_none_values"),
        pytest.param(
            [[1, 2], [3, 4]],
            [[1, 2], [3, 4]],
            True,
            id="equal_two_by_two_dataframes",
        ),
        pytest.param(
            [[1, 2], [3, 4]],
            [[5, 2], [3, 4]],
            False,
            id="unequal_two_by_two_dataframes",
        ),
    ],
)
def test_equals(frame1_data, frame2_data, expected_pandas_equals, df_mode_pair):
    """`equals` must agree with pandas, for a Modin and for a pandas right operand."""
    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
        frame1_data, native=df_mode_pair[0]
    )
    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
        frame2_data, native=df_mode_pair[1]
    )
    pandas_equals = pandas_df1.equals(pandas_df2)
    # Sanity-check the test data itself before comparing Modin against pandas.
    assert pandas_equals == expected_pandas_equals, (
        "Test expected pandas to say the dataframes were"
        + f"{'' if expected_pandas_equals else ' not'} equal, but they were"
        + f"{' not' if expected_pandas_equals else ''} equal."
    )
    assert modin_df1.equals(modin_df2) == pandas_equals
    assert modin_df1.equals(pandas_df2) == pandas_equals


@pytest.mark.parametrize("empty_operand", ["right", "left", "both"])
def test_empty_df(empty_operand, df_mode_pair):
    """Addition where the left, right, or both operands are empty frames."""
    modin_df, pandas_df = create_test_df_in_defined_mode(
        [0, 1, 2, 0, 1, 2], native=df_mode_pair[0]
    )
    modin_df_empty, pandas_df_empty = create_test_df_in_defined_mode(
        native=df_mode_pair[1]
    )
    if empty_operand == "right":
        modin_res = modin_df + modin_df_empty
        pandas_res = pandas_df + pandas_df_empty
    elif empty_operand == "left":
        modin_res = modin_df_empty + modin_df
        pandas_res = pandas_df_empty + pandas_df
    else:
        modin_res = modin_df_empty + modin_df_empty
        pandas_res = pandas_df_empty + pandas_df_empty
    df_equals(modin_res, pandas_res)


================================================
FILE: modin/tests/pandas/native_df_interoperability/test_compiler_caster.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import contextlib import json import logging from io import StringIO from types import MappingProxyType from typing import Iterator, Optional from unittest import mock import pandas import pytest from pytest import param import modin.pandas as pd from modin.config import context as config_context from modin.config.envvars import ( Backend, Engine, Execution, NativePandasMaxRows, NativePandasTransferThreshold, ) from modin.core.execution.dispatching.factories import factories from modin.core.execution.dispatching.factories.factories import BaseFactory from modin.core.io.io import BaseIO from modin.core.storage_formats.base.query_compiler import QCCoercionCost from modin.core.storage_formats.base.query_compiler_calculator import ( BackendCostCalculator, ) from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler from modin.core.storage_formats.pandas.query_compiler_caster import ( _GENERAL_EXTENSIONS, register_function_for_post_op_switch, register_function_for_pre_op_switch, ) from modin.logging import DEFAULT_LOGGER_NAME from modin.logging.metrics import add_metric_handler, clear_metric_handler from modin.pandas.api.extensions import register_pd_accessor from modin.tests.pandas.utils import ( create_test_dfs, default_to_pandas_ignore_string, df_equals, eval_general, ) # Some modin methods warn about defaulting to pandas at the API layer. That's # expected and not an error as it would be normally. pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) BIG_DATA_CLOUD_MIN_NUM_ROWS = 10 SMALL_DATA_NUM_ROWS = 5 class CalculatorTestQc(NativeQueryCompiler): """ A subclass of NativeQueryCompiler with simpler cost functions. We MAY eventually want to stop overriding the superclass's cost functions. 
""" @classmethod def move_to_me_cost(cls, other_qc, api_cls_name, operation, arguments): if isinstance(other_qc, cls): return QCCoercionCost.COST_ZERO return None def stay_cost(self, api_cls_name, operation, arguments): return QCCoercionCost.COST_ZERO def move_to_cost(self, other_qc_type, api_cls_name, operation, arguments): if isinstance(self, other_qc_type): return QCCoercionCost.COST_ZERO return None class CloudQC(CalculatorTestQc): "Represents a cloud-hosted query compiler" def get_backend(self): return "Cloud" @classmethod def max_cost(cls): return QCCoercionCost.COST_IMPOSSIBLE def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): assert op is not None assert api_cls_name in [ None, "_iLocIndexer", "_LocationIndexerBase", "Series", "DataFrame", "BasePandasDataset", ] return { CloudQC: QCCoercionCost.COST_ZERO, CloudQCHighSelf: QCCoercionCost.COST_LOW, ClusterQC: QCCoercionCost.COST_MEDIUM, DefaultQC: QCCoercionCost.COST_MEDIUM, LocalMachineQC: QCCoercionCost.COST_HIGH, PicoQC: QCCoercionCost.COST_IMPOSSIBLE, OmniscientEagerQC: None, OmniscientLazyQC: None, }.get(other_qc_cls) def stay_cost(self, api_cls_name, op, arguments): return QCCoercionCost.COST_ZERO class CloudQCHighSelf(CloudQC): def get_backend(self): return "Cloud_High_Self" def stay_cost(self, api_cls_name, op, arguments): return QCCoercionCost.COST_HIGH class ClusterQC(CalculatorTestQc): "Represents a local network cluster query compiler" def get_backend(self): return "Cluster" @classmethod def max_cost(cls): return QCCoercionCost.COST_HIGH def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): return { CloudQC: QCCoercionCost.COST_MEDIUM, CloudQCHighSelf: QCCoercionCost.COST_MEDIUM, ClusterQC: QCCoercionCost.COST_ZERO, DefaultQC: None, # cluster qc knows nothing about default qc LocalMachineQC: QCCoercionCost.COST_MEDIUM, PicoQC: QCCoercionCost.COST_HIGH, }.get(other_qc_cls) class LocalMachineQC(CalculatorTestQc): "Represents a local machine query compiler" def 
get_backend(self): return "Local_Machine" @classmethod def max_cost(cls): return QCCoercionCost.COST_MEDIUM def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): return { CloudQC: QCCoercionCost.COST_MEDIUM, CloudQCHighSelf: QCCoercionCost.COST_MEDIUM, ClusterQC: QCCoercionCost.COST_LOW, LocalMachineQC: QCCoercionCost.COST_ZERO, PicoQC: QCCoercionCost.COST_MEDIUM, }.get(other_qc_cls) class PicoQC(CalculatorTestQc): "Represents a query compiler with very few resources" def get_backend(self): return "Pico" @classmethod def max_cost(cls): return QCCoercionCost.COST_LOW def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): return { CloudQC: QCCoercionCost.COST_LOW, CloudQCHighSelf: QCCoercionCost.COST_LOW, ClusterQC: QCCoercionCost.COST_LOW, LocalMachineQC: QCCoercionCost.COST_LOW, PicoQC: QCCoercionCost.COST_ZERO, }.get(other_qc_cls) class AdversarialQC(CalculatorTestQc): "Represents a query compiler which returns non-sensical costs" def get_backend(self): return "Adversarial" def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): return { CloudQC: -1000, CloudQCHighSelf: -1000, ClusterQC: 10000, AdversarialQC: QCCoercionCost.COST_ZERO, }.get(other_qc_cls) class OmniscientEagerQC(CalculatorTestQc): "Represents a query compiler which knows a lot, and wants to steal work" def get_backend(self): return "Eager" # keep other workloads from getting my workload def move_to_cost(self, other_qc_cls, api_cls_name, op, arguments): if OmniscientEagerQC is other_qc_cls: return QCCoercionCost.COST_ZERO return QCCoercionCost.COST_IMPOSSIBLE # try to force other workloads to my engine @classmethod def move_to_me_cost(cls, other_qc, api_cls_name, operation, arguments): return QCCoercionCost.COST_ZERO class OmniscientLazyQC(CalculatorTestQc): "Represents a query compiler which knows a lot, and wants to avoid work" def get_backend(self): return "Lazy" # encorage other engines to take my workload def move_to_cost(self, other_qc_cls, api_cls_name, op, 
arguments): return QCCoercionCost.COST_ZERO # try to keep other workloads from getting my workload @classmethod def move_to_me_cost(cls, other_qc, api_cls_name, operation, arguments): if isinstance(other_qc, cls): return QCCoercionCost.COST_ZERO return QCCoercionCost.COST_IMPOSSIBLE class DefaultQC(CalculatorTestQc): "Represents a query compiler with no costing information" def get_backend(self): return "Test_Casting_Default" class DefaultQC2(CalculatorTestQc): "Represents a query compiler with no costing information, but different." def get_backend(self): return "Test_Casting_Default_2" class BaseTestAutoMover(NativeQueryCompiler): _MAX_SIZE_THIS_ENGINE_CAN_HANDLE = BIG_DATA_CLOUD_MIN_NUM_ROWS def __init__(self, pandas_frame): super().__init__(pandas_frame) class CloudForBigDataQC(BaseTestAutoMover): """Represents a cloud-hosted query compiler that prefers to stay on the cloud only for big data""" # Operations are more costly on this engine, even though it can handle larger datasets _MAX_SIZE_THIS_ENGINE_CAN_HANDLE = BIG_DATA_CLOUD_MIN_NUM_ROWS * 10 _OPERATION_INITIALIZATION_OVERHEAD = QCCoercionCost.COST_MEDIUM _OPERATION_PER_ROW_OVERHEAD = 10 def __init__(self, pandas_frame): super().__init__(pandas_frame) def stay_cost(self, api_cls_name, operation, arguments): if operation == "read_json": return QCCoercionCost.COST_IMPOSSIBLE return super().stay_cost(api_cls_name, operation, arguments) def get_backend(self) -> str: return "Big_Data_Cloud" @classmethod def max_cost(cls): return QCCoercionCost.COST_IMPOSSIBLE * 10 @classmethod def move_to_me_cost(cls, other_qc, api_cls_name, operation, arguments): if api_cls_name in ("DataFrame", "Series") and operation == "__init__": if (query_compiler := arguments.get("query_compiler")) is not None: # When we create a dataframe or series with a query compiler # input, we should not switch the resulting dataframe or series # to a different backend. 
return ( QCCoercionCost.COST_ZERO if isinstance(query_compiler, cls) else QCCoercionCost.COST_IMPOSSIBLE ) else: # Moving the in-memory __init__ inputs to the cloud is expensive. return QCCoercionCost.COST_HIGH return super().move_to_me_cost(other_qc, api_cls_name, operation, arguments) class LocalForSmallDataQC(BaseTestAutoMover): """Represents a local query compiler that prefers small data.""" # Operations are cheap on this engine for small data, but there is an upper bound _MAX_SIZE_THIS_ENGINE_CAN_HANDLE = BIG_DATA_CLOUD_MIN_NUM_ROWS _OPERATION_PER_ROW_OVERHEAD = 1 def __init__(self, pandas_frame): super().__init__(pandas_frame) def get_backend(self) -> str: return "Small_Data_Local" @classmethod def max_cost(cls): return QCCoercionCost.COST_IMPOSSIBLE * 10 def register_backend(name, qc): class TestCasterIO(BaseIO): _should_warn_on_default_to_pandas: bool = False query_compiler_cls = qc class TestCasterFactory(BaseFactory): @classmethod def prepare(cls): cls.io_cls = TestCasterIO TestCasterFactory.prepare() factory_name = f"{name}OnNativeFactory" setattr(factories, factory_name, TestCasterFactory) Engine.add_option(name) Backend.register_backend(name, Execution(name, "Native")) ALL_BACKENDS = { "Pico": PicoQC, "Cluster": ClusterQC, "Cloud": CloudQC, "Cloud_High_Self": CloudQCHighSelf, "Local_Machine": LocalMachineQC, "Adversarial": AdversarialQC, "Eager": OmniscientEagerQC, "Lazy": OmniscientLazyQC, "Test_Casting_Default": DefaultQC, "Test_Casting_Default_2": DefaultQC2, "Big_Data_Cloud": CloudForBigDataQC, "Small_Data_Local": LocalForSmallDataQC, } for backend, qc in ALL_BACKENDS.items(): register_backend(backend, qc) DEFAULT_TEST_BACKENDS = ( "Pico", "Cluster", "Cloud", "Cloud_High_Self", "Local_Machine", "Lazy", ) @pytest.fixture(autouse=True) def turn_on_auto_switch_backend(): with config_context(AutoSwitchBackend=True): yield @contextlib.contextmanager def backend_test_context( *, test_backend: Optional[str] = None, choices: Optional[tuple] = None ) -> 
Iterator[None]: if choices is None: # Consider only a select set custom-defined test backends by default for easier testing. # This is necessary because n-ary operations consider _all_ possible active backends, so # we may observe unexpected behavior if too many backends are activated at once. # If a QC is explicitly created for an inactive backend, the QC calculator should still # be able to accept it. choices = DEFAULT_TEST_BACKENDS if test_backend is None: test_backend = choices[0] old_default_backend = Backend.get() old_backend_choices = Backend.get_active_backends() try: Backend.set_active_backends(choices) Backend.put(test_backend) yield finally: Backend.set_active_backends(old_backend_choices) Backend.put(old_default_backend) @pytest.fixture() def cloud_df(): return pd.DataFrame(query_compiler=CloudQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def cloud_high_self_df(): return pd.DataFrame(query_compiler=CloudQCHighSelf(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def cluster_df(): return pd.DataFrame(query_compiler=ClusterQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def local_df(): return pd.DataFrame(query_compiler=LocalMachineQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def pico_df(): return pd.DataFrame(query_compiler=PicoQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def adversarial_df(): return pd.DataFrame(query_compiler=AdversarialQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def eager_df(): return pd.DataFrame(query_compiler=OmniscientEagerQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def lazy_df(): return pd.DataFrame(query_compiler=OmniscientLazyQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def default_df(): return pd.DataFrame(query_compiler=DefaultQC(pandas.DataFrame([0, 1, 2]))) @pytest.fixture() def default2_df(): return pd.DataFrame(query_compiler=DefaultQC2(pandas.DataFrame([0, 1, 2]))) def test_two_same_backend(pico_df): df3 = pd.concat([pico_df, pico_df], axis=1) assert pico_df.get_backend() == "Pico" 
assert df3.get_backend() == "Pico" def test_cast_to_second_backend_with_concat(pico_df, cluster_df, caplog): with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME): # We have to copy the input dataframes because of inplace merging df3 = pd.concat([pico_df.copy(), cluster_df.copy()], axis=1) assert pico_df.get_backend() == "Pico" assert cluster_df.get_backend() == "Cluster" assert df3.get_backend() == "Cluster" # result should be on cluster log_records = caplog.records assert len(log_records) == 1 assert log_records[0].name == DEFAULT_LOGGER_NAME assert log_records[0].levelno == logging.INFO assert log_records[0].message.startswith( "BackendCostCalculator results for pd.concat: " ) def test_cast_to_second_backend_with_concat_uses_second_backend_api_override( pico_df, cluster_df ): register_pd_accessor(name="concat", backend="Cluster")( lambda *args, **kwargs: "custom_concat_result" ) # copy dataframes for concat to allow for in-place merging assert ( pd.concat([pico_df.copy(), cluster_df.copy()], axis=1) == "custom_concat_result" ) assert pico_df.get_backend() == "Pico" assert cluster_df.get_backend() == "Cluster" def test_moving_pico_to_cluster_in_place_calls_set_backend_only_once_github_issue_7490( pico_df, cluster_df ): with mock.patch.object( pd.DataFrame, "set_backend", wraps=pico_df.set_backend ) as mock_set_backend: pico_df.set_backend(cluster_df.get_backend(), inplace=True) assert pico_df.get_backend() == "Cluster" mock_set_backend.assert_called_once_with("Cluster", inplace=True) def test_cast_to_second_backend_with___init__(pico_df, cluster_df): df3 = pd.DataFrame({"pico": pico_df.iloc[:, 0], "cluster": cluster_df.iloc[:, 0]}) assert ( pico_df.get_backend() == "Pico" ) # pico stays despite in-place casting by iloc assert cluster_df.get_backend() == "Cluster" assert df3.get_backend() == "Cluster" # result should be on cluster def test_cast_to_first_backend(pico_df, cluster_df): df3 = pd.concat([cluster_df, pico_df], axis=1) assert 
pico_df.get_backend() == "Cluster" # pico_df was cast in place by concat assert cluster_df.get_backend() == "Cluster" assert df3.get_backend() == cluster_df.get_backend() # result should be on cluster def test_cast_to_first_backend_with_concat_uses_first_backend_api_override( pico_df, cluster_df ): register_pd_accessor(name="concat", backend="Cluster")( lambda *args, **kwargs: "custom_concat_result" ) assert pd.concat([cluster_df, pico_df], axis=1) == "custom_concat_result" assert pico_df.get_backend() == "Cluster" # pico was cast in place by concat assert cluster_df.get_backend() == "Cluster" def test_cast_to_first_backend_with___init__(pico_df, cluster_df): df3 = pd.DataFrame( { "cluster": cluster_df.iloc[:, 0], "pico": pico_df.iloc[:, 0], } ) assert pico_df.get_backend() == "Pico" # Pico not cast in place by iloc assert cluster_df.get_backend() == "Cluster" assert df3.get_backend() == "Cluster" # result should be on cluster def test_self_cost_causes_move(cloud_high_self_df, cluster_df): """ Test that ``self_cost`` is being properly considered. Cost to stay on cloud_high_self is HIGH, but moving to cluster is MEDIUM. Cost to stay on cluster is ZERO, and moving to cloud_high_self is MEDIUM. With two dataframes, one on each backend, the total cost of using ``cloud_high_self`` as the final backend is: ``stay_cost(cloud_high_self) + move_cost(cluster->cloud_high_self)`` which is ``HIGH + MEDIUM``. The total cost of using ``cluster`` as the final backend is: ``stay_cost(cluster) + move_cost(cloud_high_self->cluster)`` which is ``ZERO + MEDIUM``. So we should select ``cluster``. 
""" result = pd.concat([cloud_high_self_df, cluster_df]) assert result.get_backend() == "Cluster" result = pd.concat([cluster_df, cloud_high_self_df]) assert result.get_backend() == "Cluster" @pytest.mark.parametrize( "df1, df2, df3, df4, expected_result_backend", [ # no-op ("cloud_df", "cloud_df", "cloud_df", "cloud_df", "Cloud"), # moving all dfs to cloud is 1250, moving to cluster is 1000 # regardless of how they are ordered ("pico_df", "local_df", "cluster_df", "cloud_df", "Cluster"), ("cloud_df", "local_df", "cluster_df", "pico_df", "Cluster"), ("cloud_df", "cluster_df", "local_df", "pico_df", "Cluster"), ("cloud_df", "cloud_df", "local_df", "pico_df", "Cloud"), # Still move everything to cloud ("pico_df", "pico_df", "pico_df", "cloud_df", "Cloud"), ("pico_df", "pico_df", "local_df", "cloud_df", "Cloud"), ], ) def test_mixed_dfs(df1, df2, df3, df4, expected_result_backend, request): df1 = request.getfixturevalue(df1) df2 = request.getfixturevalue(df2) df3 = request.getfixturevalue(df3) df4 = request.getfixturevalue(df4) if expected_result_backend is None: with pytest.raises(ValueError): pd.concat(axis=1, objs=[df1, df2, df3, df4]) else: result = pd.concat(axis=1, objs=[df1, df2, df3, df4]) assert result.get_backend() == expected_result_backend def test_adversarial_high(adversarial_df, cluster_df): with pytest.raises(ValueError): pd.concat([adversarial_df, cluster_df], axis=1) def test_adversarial_low(adversarial_df, cloud_df): with pytest.raises(ValueError): pd.concat([adversarial_df, cloud_df], axis=1) def test_two_two_qc_types_default_rhs(default_df, cluster_df): # none of the query compilers know about each other here # so we default to the caller df3 = pd.concat([default_df, cluster_df], axis=1) assert default_df.get_backend() == "Test_Casting_Default" assert ( cluster_df.get_backend() == "Test_Casting_Default" ) # in place cast to default by concat assert df3.get_backend() == default_df.get_backend() # should move to default def 
test_two_two_qc_types_default_lhs(default_df, cluster_df): # none of the query compilers know about each other here # so we default to the caller df3 = pd.concat([cluster_df, default_df], axis=1) assert default_df.get_backend() == "Cluster" # in place cast to Cluster by concat assert cluster_df.get_backend() == "Cluster" assert df3.get_backend() == cluster_df.get_backend() # should move to cluster def test_two_two_qc_types_default_2_rhs(default_df, cloud_df): # cloud knows a bit about costing; so we prefer moving to there df3 = pd.concat([default_df, cloud_df], axis=1) assert default_df.get_backend() == "Cloud" # inplace cast to Cloud by concat assert cloud_df.get_backend() == "Cloud" assert df3.get_backend() == cloud_df.get_backend() # should move to cloud def test_two_two_qc_types_default_2_lhs(default_df, cloud_df): # cloud knows a bit about costing; so we prefer moving to there df3 = pd.concat([cloud_df, default_df], axis=1) assert default_df.get_backend() == "Cloud" # inplace cast to Cloud by concat assert cloud_df.get_backend() == "Cloud" assert df3.get_backend() == cloud_df.get_backend() # should move to cloud def test_default_to_caller(default_df, default2_df): # No qc knows anything; default to caller df3 = pd.concat([default_df, default2_df], axis=1) assert df3.get_backend() == default_df.get_backend() # should stay on caller df3 = pd.concat([default2_df, default_df], axis=1) assert df3.get_backend() == default2_df.get_backend() # should stay on caller df3 = pd.concat([default_df, default_df], axis=1) assert df3.get_backend() == default_df.get_backend() # no change def test_no_qc_to_calculate(): calculator = BackendCostCalculator( operation_arguments=MappingProxyType({}), api_cls_name=None, operation="operation0", query_compilers=[], preop_switch=False, ) with pytest.raises(ValueError): calculator.calculate() def test_qc_default_self_cost(default_df, default2_df): assert ( default_df._query_compiler.move_to_cost( 
other_qc_type=type(default2_df._query_compiler), api_cls_name=None, operation="operation0", arguments=MappingProxyType({}), ) is None ) assert ( default_df._query_compiler.move_to_cost( other_qc_type=type(default_df._query_compiler), api_cls_name=None, operation="operation0", arguments=MappingProxyType({}), ) is QCCoercionCost.COST_ZERO ) def test_qc_casting_changed_operation(pico_df, cloud_df): pico_df1 = pico_df cloud_df1 = cloud_df native_cdf2 = cloud_df1._to_pandas() native_pdf2 = pico_df1._to_pandas() expected = native_cdf2 + native_pdf2 # test both directions df_cast_to_rhs = pico_df1 + cloud_df1 df_cast_to_lhs = cloud_df1 + pico_df1 assert df_cast_to_rhs._to_pandas().equals(expected) assert df_cast_to_lhs._to_pandas().equals(expected) def test_qc_mixed_loc(pico_df, cloud_df): pico_df1 = pico_df cloud_df1 = cloud_df assert pico_df1[pico_df1[0][0]][cloud_df1[0][1]] == 1 assert pico_df1[cloud_df1[0][0]][pico_df1[0][1]] == 1 assert cloud_df1[pico_df1[0][0]][pico_df1[0][1]] == 1 def test_merge_in_place(default_df, lazy_df, cloud_df): # lazy_df tries to pawn off work on other engines df = default_df.merge(lazy_df) assert df.get_backend() is default_df.get_backend() # Both arguments now have the same qc type assert lazy_df.get_backend() is default_df.get_backend() with config_context(BackendMergeCastInPlace=False): lazy_df = lazy_df.move_to("Lazy") cloud_df = cloud_df.move_to("Cloud") df = cloud_df.merge(lazy_df) assert df.get_backend() == cloud_df.get_backend() assert lazy_df.get_backend() == "Lazy" assert cloud_df.get_backend() == "Cloud" def test_information_asymmetry(default_df, cloud_df, eager_df, lazy_df): # normally, the default query compiler should be chosen # here, but since eager knows about default, but not # the other way around, eager has a special ability to # control the directionality of the cast. 
df = default_df.merge(eager_df) assert df.get_backend() == eager_df.get_backend() df = cloud_df.merge(eager_df) assert df.get_backend() == eager_df.get_backend() # lazy_df tries to pawn off work on other engines df = default_df.merge(lazy_df) assert df.get_backend() == default_df.get_backend() df = cloud_df.merge(lazy_df) assert df.get_backend() == cloud_df.get_backend() def test_setitem_in_place_with_self_switching_backend(cloud_df, local_df): local_df.iloc[1, 0] = cloud_df.iloc[1, 0] + local_df.iloc[1, 0] # compute happens in cloud, but we have to make sure that we propagate the # in-place update to the local_df df_equals( local_df, pandas.DataFrame( [ 0, 2, 2, ] ), ) assert local_df.get_backend() == "Local_Machine" assert cloud_df.get_backend() == "Cloud" @pytest.mark.parametrize("pin_local", [True, False], ids=["pinned", "unpinned"]) def test_switch_local_to_cloud_with_iloc___setitem__(local_df, cloud_df, pin_local): if pin_local: local_df = local_df.pin_backend() local_df.iloc[:, 0] = cloud_df.iloc[:, 0] + 1 expected_pandas = local_df._to_pandas() expected_pandas.iloc[:, 0] = cloud_df._to_pandas().iloc[:, 0] + 1 df_equals(local_df, expected_pandas) assert local_df.get_backend() == "Local_Machine" if pin_local else "Cloud" # This test should force the creation of a dataframe which # is too large for the backend and verify that it stays there # because there are no other options def test_single_backend_merge_no_good_options(): with backend_test_context( test_backend="Small_Data_Local", choices=["Small_Data_Local"], ): df1 = pd.DataFrame({"a": [1] * 100}) df1["two"] = pd.to_datetime(df1["a"]) assert df1.get_backend() == "Small_Data_Local" def test_stay_or_move_evaluation(cloud_high_self_df, default_df): default_cls = type(default_df._get_query_compiler()) cloud_cls = type(cloud_high_self_df._get_query_compiler()) empty_arguments = MappingProxyType({}) stay_cost = cloud_high_self_df._get_query_compiler().stay_cost( "Series", "myop", arguments=empty_arguments ) 
move_cost = cloud_high_self_df._get_query_compiler().move_to_cost( default_cls, "Series", "myop", arguments=empty_arguments ) if stay_cost > move_cost: df = cloud_high_self_df.move_to("Test_Casting_Default") else: assert False stay_cost = df._get_query_compiler().stay_cost( "Series", "myop", arguments=empty_arguments ) move_cost = df._get_query_compiler().move_to_cost( cloud_cls, "Series", "myop", arguments=empty_arguments ) assert stay_cost is not None assert move_cost is None def test_max_shape(cloud_df): # default implementation matches df.shape assert cloud_df.shape == cloud_df._query_compiler._max_shape() class TestSwitchBackendPostOpDependingOnDataSize: def test_read_json(self): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): big_json = json.dumps({"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS))}) small_json = json.dumps( {"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1))} ) assert pd.read_json(StringIO(big_json)).get_backend() == "Big_Data_Cloud" assert pd.read_json(StringIO(small_json)).get_backend() == "Big_Data_Cloud" register_function_for_post_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) assert pd.read_json(StringIO(big_json)).get_backend() == "Big_Data_Cloud" assert ( pd.read_json(StringIO(small_json)).get_backend() == "Small_Data_Local" ) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ) def test_read_json_logging_for_post_op_switch(self, caplog): register_function_for_post_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME): assert ( pd.read_json( StringIO( json.dumps( {"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1))} ) ) ).get_backend() == "Small_Data_Local" ) log_records = caplog.records assert len(log_records) == 2 assert log_records[0].name == DEFAULT_LOGGER_NAME assert log_records[0].levelno == 
logging.INFO assert log_records[0].message.startswith( "After modin.pandas function read_json, considered moving to backend Small_Data_Local with" ) assert log_records[1].name == DEFAULT_LOGGER_NAME assert log_records[1].levelno == logging.INFO assert log_records[1].message.startswith( "Chose to move to backend Small_Data_Local" ) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ) def test_read_json_logging_for_post_op_not_switch(self, caplog): register_function_for_post_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME): assert ( pd.read_json( StringIO( json.dumps({"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS))}) ) ).get_backend() == "Big_Data_Cloud" ) log_records = caplog.records assert len(log_records) == 2 assert log_records[0].name == DEFAULT_LOGGER_NAME assert log_records[0].levelno == logging.INFO assert log_records[0].message.startswith( "After modin.pandas function read_json, considered moving to backend Small_Data_Local with" ) assert log_records[1].name == DEFAULT_LOGGER_NAME assert log_records[1].levelno == logging.INFO assert log_records[1].message.startswith( "Chose not to switch backends after operation read_json" ) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ) def test_progress_bar_shows_modin_pandas_for_general_functions(self): """Test that progress bar messages show 'modin.pandas.read_json' instead of 'None.read_json' for general functions.""" with mock.patch("tqdm.auto.trange") as mock_trange: mock_trange.return_value = range(2) # Register a post-op switch for read_json (general function with class_name=None) register_function_for_post_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) # Create a small dataset that will trigger backend switch and show progress bar json_input = json.dumps( {"col0": 
list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1))} ) # This should trigger a backend switch and show progress bar result_df = pd.read_json(StringIO(json_input)) assert result_df.get_backend() == "Small_Data_Local" # Verify that trange was called with correct progress bar message mock_trange.assert_called_once() call_args = mock_trange.call_args desc = call_args[1]["desc"] # Get the 'desc' keyword argument assert desc.startswith( "Transfer: Big_Dat... → Small_D... | read_json ≃ (9, 1) " ) def test_agg(self): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): df = pd.DataFrame([[1, 2], [3, 4]]) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Big_Data_Cloud" register_function_for_post_op_switch( class_name="DataFrame", backend="Big_Data_Cloud", method="sum" ) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Small_Data_Local" def test_agg_pinned(self): # The operation in test_agg would naturally cause an automatic switch, but the # absence of AutoSwitchBackend or the presence of a pin on the frame prevent this # switch from happening. 
with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): register_function_for_post_op_switch( class_name="DataFrame", backend="Big_Data_Cloud", method="sum" ) # No pin or config, should switch df = pd.DataFrame([[1, 2], [3, 4]]) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Small_Data_Local" # config set to false, should not switch with config_context(AutoSwitchBackend=False): df = pd.DataFrame([[1, 2], [3, 4]]) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Big_Data_Cloud" # no config, but data is pinned df = pd.DataFrame([[1, 2], [3, 4]]).pin_backend() assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Big_Data_Cloud" # a frame-level pin remains valid across a transformation df_copy = df + 1 assert df_copy.get_backend() == "Big_Data_Cloud" assert df_copy.sum().get_backend() == "Big_Data_Cloud" # unpinning df allows a switch again df = df.unpin_backend() assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Small_Data_Local" df_copy = df + 1 assert df_copy.get_backend() == "Big_Data_Cloud" assert df_copy.sum().get_backend() == "Small_Data_Local" # check in-place pin/unpin operations df.pin_backend(inplace=True) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Big_Data_Cloud" df.unpin_backend(inplace=True) assert df.get_backend() == "Big_Data_Cloud" assert df.sum().get_backend() == "Small_Data_Local" @pytest.mark.parametrize( "num_groups, expected_backend", [ (BIG_DATA_CLOUD_MIN_NUM_ROWS - 1, "Small_Data_Local"), (BIG_DATA_CLOUD_MIN_NUM_ROWS, "Big_Data_Cloud"), ], ) @pytest.mark.parametrize( "groupby_class,operation", [ param( "DataFrameGroupBy", lambda df: df.groupby("col0").sum(), id="DataFrameGroupBy", ), param( "SeriesGroupBy", lambda df: df.groupby("col0")["col1"].sum(), id="SeriesGroupBy", ), ], ) def test_dataframe_groupby_agg_switches_for_small_result( self, 
num_groups, expected_backend, operation, groupby_class ): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): modin_df, pandas_df = create_test_dfs( { "col0": list(range(num_groups)), "col1": list(range(1, num_groups + 1)), } ) assert modin_df.get_backend() == "Big_Data_Cloud" assert operation(modin_df).get_backend() == "Big_Data_Cloud" register_function_for_post_op_switch( class_name=groupby_class, backend="Big_Data_Cloud", method="sum" ) assert modin_df.get_backend() == "Big_Data_Cloud" modin_result = operation(modin_df) pandas_result = operation(pandas_df) df_equals(modin_result, pandas_result) assert modin_result.get_backend() == expected_backend assert modin_df.get_backend() == "Big_Data_Cloud" @pytest.mark.parametrize( "groupby_class,operation", [ param( "DataFrameGroupBy", lambda groupby: groupby.sum(), id="DataFrameGroupBy", ), param( "SeriesGroupBy", lambda groupby: groupby["col1"].sum(), id="SeriesGroupBy", ), ], ) @pytest.mark.parametrize( "auto_switch_backend", [True, False], ids=lambda param: f"auto_switch_backend_{param}", ) def test_auto_switch_config_can_disable_groupby_agg_auto_switch( self, operation, groupby_class, auto_switch_backend, ): num_groups = BIG_DATA_CLOUD_MIN_NUM_ROWS - 1 with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ), config_context(AutoSwitchBackend=auto_switch_backend): modin_groupby, pandas_groupby = ( df.groupby("col0") for df in create_test_dfs( { "col0": list(range(num_groups)), "col1": list(range(1, num_groups + 1)), } ) ) assert modin_groupby.get_backend() == "Big_Data_Cloud" assert operation(modin_groupby).get_backend() == "Big_Data_Cloud" register_function_for_post_op_switch( class_name=groupby_class, backend="Big_Data_Cloud", method="sum" ) assert modin_groupby.get_backend() == "Big_Data_Cloud" modin_result = operation(modin_groupby) pandas_result = operation(pandas_groupby) df_equals(modin_result, 
pandas_result) assert modin_result.get_backend() == ( "Small_Data_Local" if auto_switch_backend else "Big_Data_Cloud" ) assert modin_groupby.get_backend() == "Big_Data_Cloud" @pytest.mark.parametrize( "groupby_class,groupby_operation,agg_operation", [ param( "DataFrameGroupBy", lambda df: df.groupby("col0"), lambda groupby: groupby.sum(), id="DataFrameGroupBy", ), param( "SeriesGroupBy", lambda df: df.groupby("col0")["col1"], lambda groupby: groupby.sum(), id="SeriesGroupBy", ), ], ) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ) def test_pinned_dataframe_prevents_groupby_backend_switch( self, groupby_class, groupby_operation, agg_operation ): """Test that pinning a DataFrame prevents groupby operations from switching backends.""" modin_df, pandas_df = create_test_dfs( { "col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1)), "col1": list(range(1, BIG_DATA_CLOUD_MIN_NUM_ROWS)), } ) assert modin_df.get_backend() == "Big_Data_Cloud" # Pin the DataFrame modin_df.pin_backend(inplace=True) assert modin_df.is_backend_pinned() # Create groupby object - should inherit pin status from dataframe modin_groupby = groupby_operation(modin_df) pandas_groupby = groupby_operation(pandas_df) assert modin_groupby.is_backend_pinned() # Inherited from DataFrame # Register a post-op switch that would normally move to Small_Data_Local register_function_for_post_op_switch( class_name=groupby_class, backend="Big_Data_Cloud", method="sum" ) # The operation should stay on Big_Data_Cloud due to inherited pinning modin_result = agg_operation(modin_groupby) pandas_result = agg_operation(pandas_groupby) df_equals(modin_result, pandas_result) assert modin_result.get_backend() == "Big_Data_Cloud" @pytest.mark.parametrize( "groupby_class,groupby_operation,agg_operation", [ param( "DataFrameGroupBy", lambda df: df.groupby("col0"), lambda groupby: groupby.sum(), id="DataFrameGroupBy", ), param( "SeriesGroupBy", lambda df: 
df.groupby("col0")["col1"], lambda groupby: groupby.sum(), id="SeriesGroupBy", ), ], ) @pytest.mark.parametrize("inplace", [True, False], ids=["inplace", "not_inplace"]) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ) def test_pinned_groupby_prevents_backend_switch( self, groupby_class, groupby_operation, agg_operation, inplace ): """Test that pinning a GroupBy object prevents operations from switching backends.""" modin_df, pandas_df = create_test_dfs( { "col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1)), "col1": list(range(1, BIG_DATA_CLOUD_MIN_NUM_ROWS)), } ) assert modin_df.get_backend() == "Big_Data_Cloud" # Create groupby object and pin it modin_groupby = groupby_operation(modin_df) pandas_groupby = groupby_operation(pandas_df) if inplace: modin_groupby.pin_backend(inplace=True) assert modin_groupby.is_backend_pinned() else: pinned_groupby = modin_groupby.pin_backend(inplace=False) assert not modin_groupby.is_backend_pinned() assert pinned_groupby.is_backend_pinned() modin_groupby = pinned_groupby # Register a post-op switch that would normally move to Small_Data_Local register_function_for_post_op_switch( class_name=groupby_class, backend="Big_Data_Cloud", method="sum" ) # The operation should stay on Big_Data_Cloud due to pinning modin_result = agg_operation(modin_groupby) pandas_result = agg_operation(pandas_groupby) df_equals(modin_result, pandas_result) assert modin_result.get_backend() == "Big_Data_Cloud" class TestSwitchBackendPreOp: @pytest.mark.parametrize( "data_size, expected_backend", [ param( BIG_DATA_CLOUD_MIN_NUM_ROWS - 1, "Small_Data_Local", id="small_data_should_move_to_small_engine", ), param( BIG_DATA_CLOUD_MIN_NUM_ROWS, "Big_Data_Cloud", id="big_data_should_stay_in_cloud", ), ], ) def test_describe_switches_depending_on_data_size( self, data_size, expected_backend ): # Mock the default describe() implementation so that we can check that we # are calling it with the correct backend 
as an input. We can't just inspect # the mock's call_args_list because call_args_list keeps a reference to the # input dataframe, whose backend may change in place. mock_describe = mock.Mock( wraps=pd.DataFrame._extensions[None]["describe"], side_effect=( # 1) Record the input backend lambda self, *args, **kwargs: setattr( mock_describe, "_last_input_backend", self.get_backend() ) # 2) Return mock.DEFAULT so that we fall back to the original # describe() implementation or mock.DEFAULT ), ) with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): df = pd.DataFrame(list(range(data_size))) with mock.patch.dict( pd.DataFrame._extensions[None], {"describe": mock_describe} ): # Before we register the post-op switch, the describe() method # should not trigger auto-switch. assert df.get_backend() == "Big_Data_Cloud" describe_result = df.describe() df_equals(describe_result, df._to_pandas().describe()) assert describe_result.get_backend() == "Big_Data_Cloud" assert df.get_backend() == "Big_Data_Cloud" mock_describe.assert_called_once() assert mock_describe._last_input_backend == "Big_Data_Cloud" mock_describe.reset_mock() register_function_for_pre_op_switch( class_name="DataFrame", backend="Big_Data_Cloud", method="describe" ) # Now that we've registered the pre-op switch, the describe() call # should trigger auto-switch. assert df.get_backend() == "Big_Data_Cloud" describe_result = df.describe() df_equals(describe_result, df._to_pandas().describe()) assert describe_result.get_backend() == expected_backend assert df.get_backend() == expected_backend mock_describe.assert_called_once() assert mock_describe._last_input_backend == expected_backend def test_read_json_with_extensions(self): json_input = json.dumps({"col0": [1]}) # Mock the read_json implementation for each backend so that we can check # that we are calling the correct implementation. 
Also, we have to make # the extension methods produce dataframes with the correct backends. pandas_read_json = mock.Mock( wraps=( lambda *args, **kwargs: _GENERAL_EXTENSIONS[None]["read_json"]( *args, **kwargs ).move_to("Small_Data_Local") ) ) pandas_read_json.__name__ = "read_json" cloud_read_json = mock.Mock( wraps=( lambda *args, **kwargs: _GENERAL_EXTENSIONS[None]["read_json"]( *args, **kwargs ).move_to("Big_Data_Cloud") ) ) cloud_read_json.__name__ = "read_json" register_pd_accessor("read_json", backend="Small_Data_Local")(pandas_read_json) register_pd_accessor("read_json", backend="Big_Data_Cloud")(cloud_read_json) with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): df = pd.read_json(StringIO(json_input)) assert df.get_backend() == "Big_Data_Cloud" pandas_read_json.assert_not_called() cloud_read_json.assert_called_once() register_function_for_pre_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) pandas_read_json.reset_mock() cloud_read_json.reset_mock() df = pd.read_json(StringIO(json_input)) assert df.get_backend() == "Small_Data_Local" pandas_read_json.assert_called_once() cloud_read_json.assert_not_called() def test_read_json_without_extensions(self): json_input = json.dumps({"col0": [1]}) with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): df = pd.read_json(StringIO(json_input)) assert df.get_backend() == "Big_Data_Cloud" register_function_for_pre_op_switch( class_name=None, backend="Big_Data_Cloud", method="read_json" ) df = pd.read_json(StringIO(json_input)) assert df.get_backend() == "Small_Data_Local" @pytest.mark.parametrize( "data_size, expected_backend", [ param( BIG_DATA_CLOUD_MIN_NUM_ROWS - 1, "Small_Data_Local", id="small_data_should_move_to_small_engine", ), param( BIG_DATA_CLOUD_MIN_NUM_ROWS, "Big_Data_Cloud", id="big_data_should_stay_in_cloud", ), ], ) def test_iloc_setitem_switches_depending_on_data_size( 
self, data_size, expected_backend ): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): md_df, pd_df = create_test_dfs(list(range(data_size))) assert md_df.get_backend() == "Big_Data_Cloud" eval_general( md_df, pd_df, lambda df: df.iloc.__setitem__((0, 0), -1), __inplace__=True, ) assert md_df.get_backend() == "Big_Data_Cloud" register_function_for_pre_op_switch( class_name="_iLocIndexer", backend="Big_Data_Cloud", method="__setitem__", ) eval_general( md_df, pd_df, lambda df: df.iloc.__setitem__((0, 0), 0), __inplace__=True, ) assert md_df.get_backend() == expected_backend def test_iloc_pinned(self): # The operation in test_iloc would naturally cause an automatic switch, but the # absence of AutoSwitchBackend or the presence of a pin on the frame prevent this # switch from happening. data_size = BIG_DATA_CLOUD_MIN_NUM_ROWS - 1 with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): register_function_for_pre_op_switch( class_name="_iLocIndexer", backend="Big_Data_Cloud", method="__setitem__", ) # No pin or config, should switch df = pd.DataFrame(list(range(data_size))) assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -1 assert df.get_backend() == "Small_Data_Local" # config set to false, should not switch with config_context(AutoSwitchBackend=False): df = pd.DataFrame(list(range(data_size))) assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -2 assert df.get_backend() == "Big_Data_Cloud" # no config, but data is pinned df = pd.DataFrame(list(range(data_size))).pin_backend() assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -3 assert df.get_backend() == "Big_Data_Cloud" # a frame-level pin remains valid across a transformation df_copy = df + 1 assert df_copy.get_backend() == "Big_Data_Cloud" df_copy.iloc[(0, 0)] = -4 assert df_copy.get_backend() == "Big_Data_Cloud" # unpinning df allows a switch again 
df.unpin_backend(inplace=True) assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -5 assert df.get_backend() == "Small_Data_Local" # An in-place set_backend operation clears the pin df.move_to("Big_Data_Cloud", inplace=True) # check in-place pin/unpin operations df.pin_backend(inplace=True) assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -6 assert df.get_backend() == "Big_Data_Cloud" df.unpin_backend(inplace=True) assert df.get_backend() == "Big_Data_Cloud" df.iloc[(0, 0)] = -7 assert df.get_backend() == "Small_Data_Local" @pytest.mark.parametrize( "args, kwargs, expected_backend", ( param((), {}, "Small_Data_Local", id="no_args_or_kwargs"), param(([1],), {}, "Small_Data_Local", id="small_list_data_in_arg"), param( (list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS)),), {}, "Small_Data_Local", id="big_list_data_in_arg", ), param((), {"data": [1]}, "Small_Data_Local", id="list_data_in_kwarg"), param( (), {"data": pandas.Series([1])}, "Small_Data_Local", id="series_data_in_kwarg", ), param( (), {"query_compiler": CloudForBigDataQC(pandas.DataFrame([0, 1, 2]))}, "Big_Data_Cloud", id="cloud_query_compiler_in_kwarg", ), param( (), {"query_compiler": LocalForSmallDataQC(pandas.DataFrame([0, 1, 2]))}, "Small_Data_Local", id="small_query_compiler_in_kwarg", ), ), ) @pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series]) def test___init___with_in_memory_data_uses_native_query_compiler( self, args, kwargs, expected_backend, data_class ): register_function_for_pre_op_switch( class_name=data_class.__name__, method="__init__", backend="Big_Data_Cloud", ) with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): assert data_class(*args, **kwargs).get_backend() == expected_backend @pytest.mark.parametrize("data_class", [pd.DataFrame, pd.Series]) @backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local") ) @pytest.mark.parametrize( 
"auto_switch_backend,expected_backend", [ (True, "Small_Data_Local"), (False, "Big_Data_Cloud"), ], ) def test_auto_switch_backend_disabled_prevents___init__auto_switch( self, auto_switch_backend, expected_backend, data_class ): register_function_for_pre_op_switch( class_name=data_class.__name__, method="__init__", backend="Big_Data_Cloud", ) with config_context(AutoSwitchBackend=auto_switch_backend): assert data_class([1, 2, 3]).get_backend() == expected_backend @pytest.mark.parametrize( "num_input_rows, expected_backend", [ param( BIG_DATA_CLOUD_MIN_NUM_ROWS - 1, "Small_Data_Local", ), (BIG_DATA_CLOUD_MIN_NUM_ROWS, "Big_Data_Cloud"), ], ) @pytest.mark.parametrize( "groupby_class,operation", [ param( "DataFrameGroupBy", lambda df: df.groupby("col0").apply(lambda x: x + 1), id="DataFrameGroupBy", ), param( "SeriesGroupBy", lambda df: df.groupby("col0")["col1"].apply(lambda x: x + 1), id="SeriesGroupBy", ), ], ) def test_groupby_apply_switches_for_small_input( self, num_input_rows, expected_backend, operation, groupby_class ): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): modin_df, pandas_df = create_test_dfs( { "col0": list(range(num_input_rows)), "col1": list(range(1, num_input_rows + 1)), } ) assert modin_df.get_backend() == "Big_Data_Cloud" assert operation(modin_df).get_backend() == "Big_Data_Cloud" register_function_for_pre_op_switch( class_name=groupby_class, backend="Big_Data_Cloud", method="apply" ) modin_result = operation(modin_df) pandas_result = operation(pandas_df) df_equals(modin_result, pandas_result) assert modin_result.get_backend() == expected_backend if groupby_class == "DataFrameGroupBy": assert modin_df.get_backend() == expected_backend # The original dataframe does not move with the SeriesGroupBy if groupby_class == "SeriesGroupBy": assert modin_df.get_backend() == "Big_Data_Cloud" def test_T_switches(self): # Ensure that calling df.T triggers a switch (GH#7653) with 
backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): modin_df, pandas_df = create_test_dfs( {"col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1))} ) assert modin_df.get_backend() == "Big_Data_Cloud" # Registering transpose should be sufficient to cause T to trigger a switch. register_function_for_pre_op_switch( class_name="DataFrame", backend="Big_Data_Cloud", method="transpose" ) modin_result = modin_df.T pandas_result = pandas_df.T df_equals(modin_result, pandas_result) assert modin_result.get_backend() == "Small_Data_Local" def test_concat_switch_point(self, pico_df, cloud_df, cloud_high_self_df): # When concat is a switch point, backends other than those present in arguments should be considered. with backend_test_context( test_backend="Cloud", choices=(*DEFAULT_TEST_BACKENDS, "Eager") ): register_function_for_pre_op_switch( class_name=None, backend="Cloud", method="concat" ) result = pd.concat([cloud_df, pico_df]) # concat causes in-place switching # the Eager backend will always steal everything assert pico_df.get_backend() == "Eager" assert cloud_df.get_backend() == "Eager" assert result.get_backend() == "Eager" pico_df.move_to("Pico", inplace=True) cloud_df.move_to("Cloud", inplace=True) with backend_test_context( test_backend="Cloud_High_Self", choices=("Cloud_High_Self", "Cloud") ): register_function_for_pre_op_switch( class_name=None, backend="Cloud_High_Self", method="concat" ) result = pd.concat([cloud_high_self_df, cloud_high_self_df]) assert cloud_high_self_df.get_backend() == "Cloud" assert result.get_backend() == "Cloud" @pytest.mark.parametrize("consider_all_backends", [True, False]) def test_consider_all_backends_flag( self, pico_df, cloud_df, cloud_high_self_df, consider_all_backends ): # When concat is a switch point, backends other than those present in arguments should be considered # if BackendJoinConsiderAllBackends is set. 
with backend_test_context( test_backend="Cloud", choices=(*DEFAULT_TEST_BACKENDS, "Eager") ), config_context(BackendJoinConsiderAllBackends=consider_all_backends): register_function_for_pre_op_switch( class_name=None, backend="Cloud", method="concat" ) result = pd.concat([cloud_df, pico_df]) # concat causes in-place switching if consider_all_backends: assert pico_df.get_backend() == "Eager" assert cloud_df.get_backend() == "Eager" assert result.get_backend() == "Eager" else: assert pico_df.get_backend() == "Cloud" assert cloud_df.get_backend() == "Cloud" assert result.get_backend() == "Cloud" def test_move_to_clears_pin(): # Pin status is reset to false after a set_backend call with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): df = pd.DataFrame(list(range(10))) # in-place df.pin_backend(inplace=True) assert df.is_backend_pinned() df.move_to("Small_Data_Local", inplace=True) assert not df.is_backend_pinned() # not in-place intermediate = df.pin_backend().move_to("Big_Data_Cloud") assert not intermediate.is_backend_pinned() assert intermediate.pin_backend().is_backend_pinned() @pytest.mark.parametrize( "pin_backends, expected_backend", [ param( [("Small_Data_Local", False), ("Big_Data_Cloud", False)], "Small_Data_Local", id="no_pin", ), # no backend pinned param( [("Small_Data_Local", True), ("Big_Data_Cloud", False)], "Small_Data_Local", id="one_pin", ), # one backend is pinned, so move there param( [ ("Big_Data_Cloud", False), ("Small_Data_Local", True), ("Small_Data_Local", True), ], "Small_Data_Local", id="two_pin", ), # two identical pinned backends param( [("Small_Data_Local", True), ("Big_Data_Cloud", True)], None, id="conflict_pin", ), # conflicting pins raises ValueError ], ) def test_concat_with_pin(pin_backends, expected_backend): with backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): dfs = [ pd.DataFrame([1] * 
10).move_to(backend)._set_backend_pinned(should_pin) for backend, should_pin in pin_backends ] if expected_backend is None: with pytest.raises( ValueError, match="Cannot combine arguments that are pinned to conflicting backends", ): pd.concat(dfs) else: result = pd.concat(dfs) assert result.is_backend_pinned() == any( df.is_backend_pinned() for df in dfs ) assert result.get_backend() == expected_backend df_equals( result, pandas.concat([pandas.DataFrame([1] * 10)] * len(pin_backends)) ) @pytest.mark.parametrize( "groupby_operation", [ param( lambda df: df.groupby("col0"), id="DataFrameGroupBy", ), param( lambda df: df.groupby("col0")["col1"], id="SeriesGroupBy", ), ], ) def test_pin_groupby_in_place(groupby_operation): """Test that groupby objects can be pinned with inplace=True.""" modin_df = pd.DataFrame( { "col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1)), "col1": list(range(1, BIG_DATA_CLOUD_MIN_NUM_ROWS)), } ) groupby_object = groupby_operation(modin_df) assert not groupby_object.is_backend_pinned() groupby_object.pin_backend(inplace=True) assert groupby_object.is_backend_pinned() groupby_object.unpin_backend(inplace=True) assert not groupby_object.is_backend_pinned() @pytest.mark.parametrize( "groupby_operation", [ param( lambda df: df.groupby("col0"), id="DataFrameGroupBy", ), param( lambda df: df.groupby("col0")["col1"], id="SeriesGroupBy", ), ], ) def test_pin_groupby_not_in_place(groupby_operation): """Test that pin_backend works with inplace=False for groupby objects.""" original_groupby = groupby_operation(pd.DataFrame(columns=["col0", "col1"])) assert not original_groupby.is_backend_pinned() new_groupby = original_groupby.pin_backend(inplace=False) assert not original_groupby.is_backend_pinned() assert new_groupby.is_backend_pinned() @pytest.mark.parametrize( "groupby_operation", [ param( lambda df: df.groupby("col0"), id="DataFrameGroupBy", ), param( lambda df: df.groupby("col0")["col1"], id="SeriesGroupBy", ), ], ) def 
test_unpin_groupby_not_in_place(groupby_operation): """Test that unpin_backend works with inplace=False for groupby objects.""" original_groupby = groupby_operation(pd.DataFrame(columns=["col0", "col1"])) original_groupby.pin_backend(inplace=True) assert original_groupby.is_backend_pinned() new_groupby = original_groupby.unpin_backend(inplace=False) assert original_groupby.is_backend_pinned() assert not new_groupby.is_backend_pinned() @pytest.mark.parametrize( "data_type,data_factory,groupby_factory", [ param( "DataFrame", lambda: pd.DataFrame( { "col0": list(range(BIG_DATA_CLOUD_MIN_NUM_ROWS - 1)), "col1": list(range(1, BIG_DATA_CLOUD_MIN_NUM_ROWS)), } ), lambda obj: obj.groupby("col0"), id="DataFrame", ), param( "Series", lambda: pd.Series(list(range(1, BIG_DATA_CLOUD_MIN_NUM_ROWS)), name="data"), lambda obj: obj.groupby([0] * (BIG_DATA_CLOUD_MIN_NUM_ROWS - 1)), id="Series", ), ], ) def test_groupby_pinning_reflects_parent_object_pin_status( data_type, data_factory, groupby_factory ): """Test that groupby pinning inherits from parent object (DataFrame/Series) pin status but can be modified independently.""" modin_obj = data_factory() old_groupby_obj = groupby_factory(modin_obj) # Initially not pinned assert not old_groupby_obj.is_backend_pinned() assert not modin_obj.is_backend_pinned() # Pin the parent object - new groupby objects should inherit this modin_obj.pin_backend(inplace=True) # Create a new groupby object after pinning parent object new_groupby_obj = groupby_factory(modin_obj) # New groupby should inherit the pinned status assert new_groupby_obj.is_backend_pinned() assert modin_obj.is_backend_pinned() # But we can still modify groupby pinning independently new_groupby_obj.unpin_backend(inplace=True) # Parent object should remain pinned, groupby should be unpinned assert not new_groupby_obj.is_backend_pinned() assert modin_obj.is_backend_pinned() assert not old_groupby_obj.is_backend_pinned() old_groupby_obj.pin_backend(inplace=True) assert 
old_groupby_obj.is_backend_pinned() def test_second_init_only_calls_from_pandas_once_github_issue_7559(): with config_context(Backend="Big_Data_Cloud"): # Create a dataframe once first so that we can initialize the dummy # query compiler for the Big_Data_Cloud backend. pd.DataFrame([1]) with mock.patch.object( factories.Big_Data_CloudOnNativeFactory.io_cls.query_compiler_cls, "from_pandas", wraps=factories.Big_Data_CloudOnNativeFactory.io_cls.query_compiler_cls.from_pandas, ) as mock_from_pandas: pd.DataFrame([1]) mock_from_pandas.assert_called_once() def test_native_config(): qc = NativeQueryCompiler(pandas.DataFrame([0, 1, 2])) # Native Query Compiler gets a special configuration assert qc._TRANSFER_THRESHOLD == 0 assert qc._transfer_threshold() == NativePandasTransferThreshold.get() assert qc._MAX_SIZE_THIS_ENGINE_CAN_HANDLE == 1 assert qc._engine_max_size() == NativePandasMaxRows.get() oldmax = qc._engine_max_size() oldthresh = qc._transfer_threshold() with config_context(NativePandasMaxRows=123, NativePandasTransferThreshold=321): qc2 = NativeQueryCompiler(pandas.DataFrame([0, 1, 2])) assert qc2._transfer_threshold() == 321 assert qc2._engine_max_size() == 123 assert qc._engine_max_size() == 123 assert qc._transfer_threshold() == 321 # sub class configuration is unchanged class AQC(NativeQueryCompiler): pass subqc = AQC(pandas.DataFrame([0, 1, 2])) assert subqc._TRANSFER_THRESHOLD == 0 assert subqc._MAX_SIZE_THIS_ENGINE_CAN_HANDLE == 1 assert qc._engine_max_size() == oldmax assert qc._transfer_threshold() == oldthresh def test_cast_metrics(pico_df, cluster_df): try: count = 0 def test_handler(metric: str, value) -> None: nonlocal count if metric.startswith("modin.hybrid.merge"): count += 1 add_metric_handler(test_handler) df3 = pd.concat([pico_df, cluster_df], axis=1) assert df3.get_backend() == "Cluster" # result should be on cluster assert count == 7 finally: clear_metric_handler(test_handler) def test_switch_metrics(pico_df, cluster_df): with 
backend_test_context( test_backend="Big_Data_Cloud", choices=("Big_Data_Cloud", "Small_Data_Local"), ): try: count = 0 def test_handler(metric: str, value) -> None: nonlocal count if metric.startswith("modin.hybrid.auto"): count += 1 add_metric_handler(test_handler) register_function_for_pre_op_switch( class_name="DataFrame", backend="Big_Data_Cloud", method="describe", ) df = pd.DataFrame([1] * 10) assert df.get_backend() == "Big_Data_Cloud" df.describe() assert count == 8 finally: clear_metric_handler(test_handler) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_copy_on_write.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # Tests interactions between a modin frame and a parent or child native pandas frame when one # object's metadata or data is modified. # Only valid on the native pandas backend. 
import functools import pandas import pytest import modin.pandas as pd from modin.config import Backend from modin.config import context as config_context @pytest.fixture(scope="module", autouse=True) def mutation_cow_test(): if Backend.get() != "Pandas": pytest.skip( reason="tests are only meaningful with pandas backend", allow_module_level=True, ) @pytest.fixture(scope="function") def copy_on_write(request): # Indirect fixture for toggling copy-on-write when tests are run with config_context( Backend="Pandas", NativePandasDeepCopy=False ), pandas.option_context("mode.copy_on_write", request.param): yield request.param def get_mutation_fixtures(data, **kwargs): # Return a fixture that sets the copy_on_write fixture, then passes a modin and native DF together for mutation testing. # One parameter combination creates a modin DF from a native DF. # The other creates a native DF by calling to_pandas on a modin DF. def wrapper(f): # Need to create separate functions so parametrized runs don't affect each other. def native_first(): native_input = pandas.DataFrame(data, **kwargs) return native_input, pd.DataFrame(native_input) def modin_first(): modin_input = pd.DataFrame(data, **kwargs) return modin_input, modin_input.modin.to_pandas() @pytest.mark.parametrize("df_factory", [native_first, modin_first]) @pytest.mark.parametrize( "copy_on_write", [pytest.param(True, id="CoW"), pytest.param(False, id="no_CoW")], indirect=True, ) @functools.wraps(f) def test_runner(*args, **kwargs): return f(*args, **kwargs) return test_runner return wrapper @pytest.mark.parametrize( "axis", [pytest.param(0, id="index"), pytest.param(1, id="columns")] ) @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_set_axis_name(axis, copy_on_write, df_factory): df1, df2 = df_factory() df1.axes[axis].name = "x" assert df1.axes[axis].name == "x" # Changes do not propagate when copy-on-write is enabled. 
if copy_on_write: assert df2.axes[axis].name is None else: assert df2.axes[axis].name == "x" df2.axes[axis].name = "y" assert df1.axes[axis].name == ("x" if copy_on_write else "y") assert df2.axes[axis].name == "y" @pytest.mark.parametrize( "axis", [pytest.param(0, id="index"), pytest.param(1, id="columns")] ) @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}, index=["A", "B"]) def test_rename_axis(axis, copy_on_write, df_factory): df1, df2 = df_factory() # Renames don't propagate, regardless of CoW. df1.rename({"A": "aprime"}, axis=axis, inplace=True) assert df1.axes[axis].tolist() == ["aprime", "B"] assert df2.axes[axis].tolist() == ["A", "B"] df2.rename({"B": "bprime"}, axis=axis, inplace=True) assert df1.axes[axis].tolist() == ["aprime", "B"] assert df2.axes[axis].tolist() == ["A", "bprime"] @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_locset(copy_on_write, df_factory): df1, df2 = df_factory() df1.loc[0, "A"] = -1 assert df1.loc[0, "A"] == -1 assert df2.loc[0, "A"] == (0 if copy_on_write else -1) df2.loc[1, "B"] = 999 assert df1.loc[1, "B"] == (3 if copy_on_write else 999) assert df2.loc[1, "B"] == 999 @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_add_column(copy_on_write, df_factory): df1, df2 = df_factory() df1["C"] = [4, 5] assert df1["C"].tolist() == [4, 5] # Even with CoW disabled, the new column is not added to df2. assert df2.columns.tolist() == ["A", "B"] df2["D"] = [6, 7] assert df2["D"].tolist() == [6, 7] assert df1.columns.tolist() == ["A", "B", "C"] @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_add_row(copy_on_write, df_factory): df1, df2 = df_factory() df1.loc[9] = [4, 5] assert df1.loc[9].tolist() == [4, 5] # Even with CoW disabled, the new row is not added to df2. 
assert df2.index.tolist() == [0, 1] df2.loc[10] = [6, 7] assert df2.loc[10].tolist() == [6, 7] assert df1.index.tolist() == [0, 1, 9] @pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_chained_assignment(copy_on_write, df_factory): df1, df2 = df_factory() is_assign_noop = copy_on_write and isinstance(df1, pandas.DataFrame) df1["A"][0] = -1 assert df1["A"][0] == (0 if is_assign_noop else -1) assert df2["A"][0] == ( 0 if copy_on_write or isinstance(df2, pandas.DataFrame) else -1 ) is_assign_noop = copy_on_write and isinstance(df2, pandas.DataFrame) df2["B"][1] = 999 assert df1["B"][1] == ( 3 if copy_on_write or isinstance(df1, pandas.DataFrame) else 999 ) assert df2["B"][1] == (3 if is_assign_noop else 999) @get_mutation_fixtures({"A": [0, 1], "B": [2, 3]}) def test_column_reassign(copy_on_write, df_factory): df1, df2 = df_factory() df1["A"] = df1["A"] - 1 assert df1["A"].tolist() == [-1, 0] assert df2["A"].tolist() == [0, 1] df2["B"] = df2["B"] + 1 assert df1["B"].tolist() == [2, 3] assert df2["B"].tolist() == [3, 4] @pytest.mark.parametrize("always_deep", [True, False]) def test_explicit_copy(always_deep): # Test that making an explicit copy with deep=True actually makes a deep copy. with config_context(NativePandasDeepCopy=always_deep): df = pd.DataFrame([[0]]) # We don't really care about behavior with shallow copy, since modin semantics don't line up # perfectly with native pandas. df_copy = df.copy(deep=True) df.loc[0, 0] = -1 assert df.loc[0, 0] == -1 assert df_copy.loc[0, 0] == 0 ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_default.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. 
# See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import matplotlib import numpy as np import pandas import pytest from numpy.testing import assert_array_equal import modin.pandas as pd from modin.config import NPartitions from modin.pandas.io import to_pandas from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, eval_general_interop, ) from modin.tests.pandas.utils import ( default_to_pandas_ignore_string, df_equals, test_data, test_data_diff_dtype, test_data_keys, test_data_large_categorical_dataframe, test_data_values, ) from modin.tests.test_utils import ( df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = [ pytest.mark.filterwarnings(default_to_pandas_ignore_string), # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT pytest.mark.filterwarnings( "ignore:.*bool is now deprecated and will be removed:FutureWarning" ), pytest.mark.filterwarnings( "ignore:first is deprecated and will be removed:FutureWarning" ), pytest.mark.filterwarnings( "ignore:last is deprecated and will be removed:FutureWarning" ), ] @pytest.mark.parametrize( "op, make_args", [ ("align", lambda df: {"other": df}), ("corrwith", lambda df: {"other": df}), ("ewm", lambda df: {"com": 0.5}), ("from_dict", lambda df: {"data": None}), ("from_records", lambda df: {"data": to_pandas(df)}), ("hist", lambda df: {"column": "int_col"}), ("interpolate", None), ("mask", lambda df: {"cond": df != 0}), ("pct_change", None), ("to_xarray", None), ("flags", None), ("set_flags", lambda df: {"allows_duplicate_labels": False}), ], ) def test_ops_defaulting_to_pandas(op, make_args, df_mode_pair): modin_df1, _ = create_test_df_in_defined_mode( test_data_diff_dtype, post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), native=df_mode_pair[0], ) modin_df2, _ = create_test_df_in_defined_mode( test_data_diff_dtype, post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), native=df_mode_pair[1], ) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df1) ): operation = getattr(modin_df1, op) if make_args is not None: operation(**make_args(modin_df2)) else: try: operation() # `except` for non callable attributes except TypeError: pass @pytest.mark.parametrize( "data", test_data_values + [test_data_large_categorical_dataframe], ids=test_data_keys + ["categorical_ints"], ) def test_to_numpy(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) assert_array_equal(modin_df.values, pandas_df.values) def test_array_ufunc(): modin_df, pandas_df = create_test_df_in_defined_mode([[1, 2], [3, 4]], native=True) df_equals(np.sqrt(modin_df), np.sqrt(pandas_df)) modin_ser, 
pandas_ser = create_test_series_in_defined_mode( [1, 2, 3, 4, 9], native=True ) df_equals(np.sqrt(modin_ser), np.sqrt(pandas_ser)) def test_asfreq(df_mode_pair): index = pd.date_range("1/1/2000", periods=4, freq="min") series, _ = create_test_series_in_defined_mode( [0.0, None, 2.0, 3.0], index=index, native=df_mode_pair[0] ) df, _ = create_test_df_in_defined_mode({"s": series}, native=df_mode_pair[1]) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(df) ): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S") def test_assign(df_mode_pair): data = test_data_values[0] def assign_one_column(df1, df2): df1.assign(new_column=pd.Series(df2.iloc[:, 0])) eval_general_interop(data, None, assign_one_column, df_mode_pair) def assign_multiple_columns(df1, df2): df1.assign( new_column=pd.Series(df2.iloc[:, 0]), new_column2=pd.Series(df2.iloc[:, 1]) ) eval_general_interop(data, None, assign_multiple_columns, df_mode_pair) def test_combine_first(df_mode_pair): data1 = {"A": [None, 0], "B": [None, 4]} modin_df1, pandas_df1 = create_test_df_in_defined_mode( data1, native=df_mode_pair[0] ) data2 = {"A": [1, 1], "B": [3, 3]} modin_df2, pandas_df2 = create_test_df_in_defined_mode( data2, native=df_mode_pair[1] ) df_equals( modin_df1.combine_first(modin_df2), pandas_df1.combine_first(pandas_df2), # https://github.com/modin-project/modin/issues/5959 check_dtypes=False, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dot(data, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) col_len = len(modin_df.columns) # Test series input modin_series, pandas_series = create_test_series_in_defined_mode( np.arange(col_len), index=pandas_df.columns, native=df_mode_pair[1], ) modin_result = modin_df.dot(modin_series) pandas_result = pandas_df.dot(pandas_series) df_equals(modin_result, pandas_result) def dot_func(df1, df2): return 
df1.dot(df2.T) # modin_result = modin_df.dot(modin_df.T) # pandas_result = pandas_df.dot(pandas_df.T) # df_equals(modin_result, pandas_result) # Test dataframe input eval_general_interop(data, None, dot_func, df_mode_pair) # Test when input series index doesn't line up with columns with pytest.raises(ValueError): modin_series_without_index, _ = create_test_series_in_defined_mode( np.arange(col_len), native=df_mode_pair[1] ) modin_df.dot(modin_series_without_index) # Test case when left dataframe has size (n x 1) # and right dataframe has size (1 x n) eval_general_interop(pandas_series, None, dot_func, df_mode_pair) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_matmul(data, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) col_len = len(modin_df.columns) # Test list input arr = np.arange(col_len) modin_result = modin_df @ arr pandas_result = pandas_df @ arr df_equals(modin_result, pandas_result) # Test bad dimensions with pytest.raises(ValueError): modin_df @ np.arange(col_len + 10) # Test series input modin_series, pandas_series = create_test_series_in_defined_mode( np.arange(col_len), index=pandas_df.columns, native=df_mode_pair[1], ) modin_result = modin_df @ modin_series pandas_result = pandas_df @ pandas_series df_equals(modin_result, pandas_result) # Test dataframe input def matmul_func(df1, df2): return df1 @ df2.T # Test dataframe input eval_general_interop(data, None, matmul_func, df_mode_pair) # Test when input series index doesn't line up with columns with pytest.raises(ValueError): modin_series_without_index, _ = create_test_series_in_defined_mode( np.arange(col_len), native=df_mode_pair[1] ) modin_df @ modin_series_without_index @pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) @pytest.mark.parametrize( "index", [ pytest.param(lambda _, df: df.columns[0], id="single_index_col"), pytest.param( lambda _, df: [*df.columns[0:2], *df.columns[-7:-4]], 
id="multiple_index_cols", ), pytest.param(None, id="default_index"), ], ) @pytest.mark.parametrize( "columns", [ pytest.param(lambda _, df: df.columns[len(df.columns) // 2], id="single_col"), pytest.param( lambda _, df: [ *df.columns[(len(df.columns) // 2) : (len(df.columns) // 2 + 4)], df.columns[-7], ], id="multiple_cols", ), pytest.param(None, id="default_columns"), ], ) @pytest.mark.parametrize( "values", [ pytest.param(lambda _, df: df.columns[-1], id="single_value_col"), pytest.param(lambda _, df: df.columns[-4:-1], id="multiple_value_cols"), ], ) @pytest.mark.parametrize( "aggfunc", [ pytest.param(lambda df, _: np.mean(df), id="callable_tree_reduce_func"), pytest.param("mean", id="tree_reduce_func"), pytest.param("nunique", id="full_axis_func"), ], ) def test_pivot_table_data(data, index, columns, values, aggfunc, request, df_mode_pair): if ( "callable_tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "callable_tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id or "tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id or "full_axis_func-single_value_col-multiple_cols-multiple_index_cols" in request.node.callspec.id or "full_axis_func-multiple_value_cols-multiple_cols-multiple_index_cols" in request.node.callspec.id ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7011") expected_exception = None if "default_columns-default_index" in request.node.callspec.id: expected_exception = ValueError("No group keys passed!") elif ( "callable_tree_reduce_func" in request.node.callspec.id and "int_data" in request.node.callspec.id ): expected_exception = TypeError("'numpy.float64' object is not callable") eval_general_interop( data, None, operation=lambda df, _, *args, **kwargs: df.pivot_table( *args, 
**kwargs ).sort_index(axis=int(index is not None)), df_mode_pair=df_mode_pair, index=index, columns=columns, values=values, aggfunc=aggfunc, expected_exception=expected_exception, ) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_default_to_pandas_without_warnings.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # While other modin backends raise a warning when defaulting to pandas, it does not make sense to # do so when we're running on the native pandas backend already. These tests ensure such warnings # are not raised with the pandas backend. import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import Backend from modin.tests.pandas.utils import df_equals pytestmark = [ pytest.mark.skipif( Backend.get() != "Pandas", reason="warnings only suppressed on native pandas backend", allow_module_level=True, ), # Error if a default to pandas warning is detected. 
pytest.mark.filterwarnings("error:is not supported by NativeOnNative:UserWarning"), ] def test_crosstab_no_warning(): # Example from pandas docs # https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], dtype=object, ) b = np.array( ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], dtype=object, ) c = np.array( [ "dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny", ], dtype=object, ) df_equals( pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]), pandas.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]), ) def test_json_normalize_no_warning(): # Example from pandas docs # https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html data = [ {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, {"name": {"given": "Mark", "family": "Regner"}}, {"id": 2, "name": "Faye Raker"}, ] df_equals(pd.json_normalize(data), pandas.json_normalize(data)) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_general.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import pandas import pytest import modin.pandas as pd from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, ) from modin.tests.pandas.utils import default_to_pandas_ignore_string, df_equals # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_cut(df_mode_pair): modin_x, pandas_x = create_test_series_in_defined_mode( [1, 3], native=df_mode_pair[0] ) modin_bins, pandas_bins = create_test_series_in_defined_mode( [0, 2], native=df_mode_pair[1] ) def operation(*, lib, x, bins): return lib.cut(x, bins) df_equals( operation(lib=pd, x=modin_x, bins=modin_bins), operation(lib=pandas, x=pandas_x, bins=pandas_bins), ) def test_qcut(df_mode_pair): modin_x, pandas_x = create_test_series_in_defined_mode( [1, 2, 3, 4], native=df_mode_pair[0] ) modin_quantiles, pandas_quantiles = create_test_series_in_defined_mode( [0, 0.5, 1], native=df_mode_pair[1] ) def operation(*, lib, x, quantiles): return lib.qcut(x, quantiles) df_equals( operation(lib=pd, x=modin_x, quantiles=modin_quantiles), operation(lib=pandas, x=pandas_x, quantiles=pandas_quantiles), ) def test_merge_ordered(df_mode_pair): modin_left, pandas_left = create_test_df_in_defined_mode( { "key": ["a", "c", "e", "a", "c", "e"], "lvalue": [1, 2, 3, 1, 2, 3], "group": ["a", "a", "a", "b", "b", "b"], }, native=df_mode_pair[0], ) modin_right, pandas_right = create_test_df_in_defined_mode( {"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}, native=df_mode_pair[1], ) def operation(*, lib, left, right): return lib.merge_ordered(left, right, fill_method="ffill", left_by="group") df_equals( operation(lib=pd, left=modin_left, right=modin_right), operation(lib=pandas, left=pandas_left, right=pandas_right), ) def test_merge_asof(df_mode_pair): modin_left, 
pandas_left = create_test_df_in_defined_mode( {"a": [1, 5, 10], "left_val": ["a", "b", "c"]}, native=df_mode_pair[0] ) modin_right, pandas_right = create_test_df_in_defined_mode( {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}, native=df_mode_pair[1], ) def operation(*, lib, left, right): return lib.merge_asof(left, right, on="a") df_equals( operation(lib=pd, left=modin_left, right=modin_right), operation(lib=pandas, left=pandas_left, right=pandas_right), ) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_indexing.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. from itertools import product import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, eval_general_interop, ) from modin.tests.pandas.utils import ( RAND_HIGH, RAND_LOW, assert_dtypes_equal, default_to_pandas_ignore_string, df_equals, eval_general, test_data, test_data_keys, test_data_values, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. 
matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. # TODO(https://github.com/modin-project/modin/issues/3655): catch all instances # of defaulting to pandas. pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=None): if loc is not None: col = pd_df.columns[loc] value_getter = value if callable(value) else (lambda *args, **kwargs: value) eval_general( md_df, pd_df, lambda df: df.__setitem__(col, value_getter(df)), __inplace__=True, expected_exception=expected_exception, ) for pair in list(product([True, False], repeat=2)): eval_general_interop( pd_df, None, lambda df1, df2: df1.__setitem__(col, value_getter(df2)), pair, __inplace__=True, expected_exception=expected_exception, ) def eval_loc(md_df, pd_df, value, key): if isinstance(value, tuple): assert len(value) == 2 # case when value for pandas different md_value, pd_value = value else: md_value, pd_value = value, value eval_general( md_df, pd_df, lambda df: df.loc.__setitem__( key, pd_value if isinstance(df, pandas.DataFrame) else md_value ), __inplace__=True, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "key_func", [ # test for the case from https://github.com/modin-project/modin/issues/4308 lambda df: "non_existing_column", lambda df: df.columns[0], lambda df: df.index, lambda df: [df.index, df.columns[0]], lambda df: ( pandas.Series(list(range(len(df.index)))) if isinstance(df, pandas.DataFrame) else pd.Series(list(range(len(df)))) ), ], ids=[ "non_existing_column", "first_column_name", "original_index", "list_of_index_and_first_column_name", "series_of_integers", ], ) @pytest.mark.parametrize( "drop_kwargs", [{"drop": True}, {"drop": False}, {}], ids=["drop_True", "drop_False", "no_drop_param"], ) def 
test_set_index(data, key_func, drop_kwargs, request, df_mode_pair): if ( "list_of_index_and_first_column_name" in request.node.name and "drop_False" in request.node.name ): pytest.xfail( reason="KeyError: https://github.com/modin-project/modin/issues/5636" ) expected_exception = None if "non_existing_column" in request.node.callspec.id: expected_exception = KeyError( "None of ['non_existing_column'] are in the columns" ) eval_general_interop( data, None, lambda df1, df2: df1.set_index(key_func(df2), **drop_kwargs), expected_exception=expected_exception, df_mode_pair=df_mode_pair, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_loc(data, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) indices = [i % 3 == 0 for i in range(len(modin_df.index))] columns = [i % 5 == 0 for i in range(len(modin_df.columns))] # Key is a Modin or pandas series of booleans series1, _ = create_test_series_in_defined_mode(indices, native=df_mode_pair[0]) series2, _ = create_test_series_in_defined_mode( columns, index=modin_df.columns, native=df_mode_pair[0] ) df_equals( modin_df.loc[series1, series2], pandas_df.loc[ pandas.Series(indices), pandas.Series(columns, index=modin_df.columns) ], ) @pytest.mark.parametrize("left, right", [(2, 1), (6, 1), (lambda df: 70, 1), (90, 70)]) def test_loc_insert_row(left, right, df_mode_pair): # This test case comes from # https://github.com/modin-project/modin/issues/3764 data = [[1, 2, 3], [4, 5, 6]] def _test_loc_rows(df1, df2): df1.loc[left] = df2.loc[right] return df1 expected_exception = None if right == 70: pytest.xfail(reason="https://github.com/modin-project/modin/issues/7024") eval_general_interop( data, None, _test_loc_rows, expected_exception=expected_exception, df_mode_pair=df_mode_pair, ) @pytest.fixture def loc_iter_dfs_interop(df_mode_pair): columns = ["col1", "col2", "col3"] index = ["row1", "row2", "row3"] md_df1, pd_df1 = create_test_df_in_defined_mode( 
{col: ([idx] * len(index)) for idx, col in enumerate(columns)}, columns=columns, index=index, native=df_mode_pair[0], ) md_df2, pd_df2 = create_test_df_in_defined_mode( {col: ([idx] * len(index)) for idx, col in enumerate(columns)}, columns=columns, index=index, native=df_mode_pair[1], ) return md_df1, pd_df1, md_df2, pd_df2 @pytest.mark.parametrize("reverse_order", [False, True]) @pytest.mark.parametrize("axis", [0, 1]) def test_loc_iter_assignment(loc_iter_dfs_interop, reverse_order, axis): if reverse_order and axis: pytest.xfail( "Due to internal sorting of lookup values assignment order is lost, see GH-#2552" ) md_df1, pd_df1, md_df2, pd_df2 = loc_iter_dfs_interop select = [slice(None), slice(None)] select[axis] = sorted(pd_df1.axes[axis][:-1], reverse=reverse_order) select = tuple(select) pd_df1.loc[select] = pd_df1.loc[select] + pd_df2.loc[select] md_df1.loc[select] = md_df1.loc[select] + md_df2.loc[select] df_equals(md_df1, pd_df1) def test_loc_series(df_mode_pair): md_df1, pd_df1 = create_test_df_in_defined_mode( {"a": [1, 2], "b": [3, 4]}, native=df_mode_pair[0] ) md_df2, pd_df2 = create_test_df_in_defined_mode( {"a": [1, 2], "b": [3, 4]}, native=df_mode_pair[1] ) pd_df1.loc[pd_df2["a"] > 1, "b"] = np.log(pd_df1["b"]) md_df1.loc[md_df2["a"] > 1, "b"] = np.log(md_df1["b"]) df_equals(pd_df1, md_df1) def test_reindex_like(df_mode_pair): o_data = [ [24.3, 75.7, "high"], [31, 87.8, "high"], [22, 71.6, "medium"], [35, 95, "medium"], ] o_columns = ["temp_celsius", "temp_fahrenheit", "windspeed"] o_index = pd.date_range(start="2014-02-12", end="2014-02-15", freq="D") new_data = [[28, "low"], [30, "low"], [35.1, "medium"]] new_columns = ["temp_celsius", "windspeed"] new_index = pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]) modin_df1, pandas_df1 = create_test_df_in_defined_mode( o_data, columns=o_columns, index=o_index, native=df_mode_pair[0], ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( new_data, columns=new_columns, index=new_index, 
native=df_mode_pair[1], ) modin_result = modin_df2.reindex_like(modin_df1) pandas_result = pandas_df2.reindex_like(pandas_df1) df_equals(modin_result, pandas_result) def test_reindex_multiindex(df_mode_pair): data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6) index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"]) pandas_midx = pandas.MultiIndex.from_product( [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"] ) modin_df1, pandas_df1 = create_test_df_in_defined_mode( data=data1, index=index, columns=index, native=df_mode_pair[0] ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( data=data2, index=pandas_midx, native=df_mode_pair[1] ) modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"] md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index]) pd_midx = pandas.MultiIndex.from_product( [pandas_df2.index.levels[0], pandas_df1.index] ) # reindex without axis, index, or columns modin_result = modin_df1.reindex(md_midx, fill_value=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0) df_equals(modin_result, pandas_result) # reindex with only axis modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0) df_equals(modin_result, pandas_result) # reindex with axis and level modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0) pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0) df_equals(modin_result, pandas_result) def test_getitem_empty_mask(df_mode_pair): # modin-project/modin#517 modin_frames = [] pandas_frames = [] data1 = np.random.randint(0, 100, size=(100, 4)) mdf1, pdf1 = create_test_df_in_defined_mode( data1, columns=list("ABCD"), native=df_mode_pair[0] ) modin_frames.append(mdf1) pandas_frames.append(pdf1) data2 = np.random.randint(0, 100, size=(100, 4)) mdf2, pdf2 = create_test_df_in_defined_mode( data2, columns=list("ABCD"), native=df_mode_pair[1] ) 
modin_frames.append(mdf2) pandas_frames.append(pdf2) data3 = np.random.randint(0, 100, size=(100, 4)) mdf3, pdf3 = create_test_df_in_defined_mode( data3, columns=list("ABCD"), native=df_mode_pair[0] ) modin_frames.append(mdf3) pandas_frames.append(pdf3) modin_data = pd.concat(modin_frames) pandas_data = pandas.concat(pandas_frames) df_equals( modin_data[[False for _ in modin_data.index]], pandas_data[[False for _ in modin_data.index]], ) def test___setitem__mask(df_mode_pair): # DataFrame mask: data = test_data["int_data"] modin_df1, pandas_df1 = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) modin_df2, pandas_df2 = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) mean = int((RAND_HIGH + RAND_LOW) / 2) pandas_df1[pandas_df2 > mean] = -50 modin_df1[modin_df2 > mean] = -50 df_equals(modin_df1, pandas_df1) @pytest.mark.parametrize( "data", [ {}, {"id": [], "max_speed": [], "health": []}, {"id": [1], "max_speed": [2], "health": [3]}, {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]}, ], ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"], ) @pytest.mark.parametrize( "value", [[11, 22], [11, 22, 33]], ids=["2_length_val", "3_length_val"], ) @pytest.mark.parametrize("convert_to_series", [False, True]) @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0]) def applyier(df): if convert_to_series: converted_value = ( pandas.Series(value) if isinstance(df, pandas.DataFrame) else create_test_series_in_defined_mode(value, native=df_mode_pair[1])[ 1 ] ) else: converted_value = value df[new_col_id] = converted_value return df expected_exception = None if not convert_to_series: values_length = len(value) index_length = len(pandas_df.index) expected_exception = ValueError( f"Length of values ({values_length}) does not 
match length of index ({index_length})" ) eval_general( modin_df, pandas_df, applyier, expected_exception=expected_exception, check_for_execution_propagation=False, no_check_for_execution_propagation_reason=( "https://github.com/modin-project/modin/issues/7428" ), __inplace__=True, ) # Because of https://github.com/modin-project/modin/issues/7600, # df_equals does not check dtypes equality for empty frames. assert_dtypes_equal(modin_df, pandas_df) def test_setitem_on_empty_df_4407(df_mode_pair): data = {} index = pd.date_range(end="1/1/2018", periods=0, freq="D") column = pd.date_range(end="1/1/2018", periods=1, freq="h")[0] modin_df, pandas_df = create_test_df_in_defined_mode( data, columns=index, native=df_mode_pair[0] ) modin_ser, pandas_ser = create_test_series_in_defined_mode( [1], native=df_mode_pair[1] ) modin_df[column] = modin_ser pandas_df[column] = pandas_ser df_equals(modin_df, pandas_df) assert modin_df.columns.freq == pandas_df.columns.freq def test_setitem_2d_insertion(df_mode_pair): def build_value_picker(modin_value, pandas_value): """Build a function that returns either Modin or pandas DataFrame depending on the passed frame.""" return lambda source_df, *args, **kwargs: ( modin_value if isinstance(source_df, (pd.DataFrame, pd.Series)) else pandas_value ) modin_df, pandas_df = create_test_df_in_defined_mode( test_data["int_data"], native=df_mode_pair[0] ) # Easy case - key and value.columns are equal modin_value, pandas_value = create_test_df_in_defined_mode( { "new_value1": np.arange(len(modin_df)), "new_value2": np.arange(len(modin_df)), }, native=df_mode_pair[1], ) eval_setitem( modin_df, pandas_df, build_value_picker(modin_value, pandas_value), col=["new_value1", "new_value2"], ) # Key and value.columns have equal values but in different order new_columns = ["new_value3", "new_value4"] modin_value.columns, pandas_value.columns = new_columns, new_columns eval_setitem( modin_df, pandas_df, build_value_picker(modin_value, pandas_value), 
col=["new_value4", "new_value3"], ) # Key and value.columns have different values new_columns = ["new_value5", "new_value6"] modin_value.columns, pandas_value.columns = new_columns, new_columns eval_setitem( modin_df, pandas_df, build_value_picker(modin_value, pandas_value), col=["__new_value5", "__new_value6"], ) # Key and value.columns have different lengths, testing that both raise the same exception eval_setitem( modin_df, pandas_df, build_value_picker(modin_value.iloc[:, [0]], pandas_value.iloc[:, [0]]), col=["new_value7", "new_value8"], expected_exception=ValueError("Columns must be same length as key"), ) @pytest.mark.parametrize("does_value_have_different_columns", [True, False]) def test_setitem_2d_update(does_value_have_different_columns, df_mode_pair): def test(dfs, iloc): """Update columns on the given numeric indices.""" df1, df2 = dfs cols1 = df1.columns[iloc].tolist() cols2 = df2.columns[iloc].tolist() df1[cols1] = df2[cols2] return df1 modin_df, pandas_df = create_test_df_in_defined_mode( test_data["int_data"], native=df_mode_pair[0] ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( test_data["int_data"], native=df_mode_pair[1] ) modin_df2 *= 10 pandas_df2 *= 10 if does_value_have_different_columns: new_columns = [f"{col}_new" for col in modin_df.columns] modin_df2.columns = new_columns pandas_df2.columns = new_columns modin_dfs = (modin_df, modin_df2) pandas_dfs = (pandas_df, pandas_df2) eval_general(modin_dfs, pandas_dfs, test, iloc=[0, 1, 2]) eval_general(modin_dfs, pandas_dfs, test, iloc=[0, -1]) eval_general( modin_dfs, pandas_dfs, test, iloc=slice(1, None) ) # (start=1, stop=None) eval_general( modin_dfs, pandas_dfs, test, iloc=slice(None, -2) ) # (start=None, stop=-2) eval_general( modin_dfs, pandas_dfs, test, iloc=[0, 1, 5, 6, 9, 10, -2, -1], ) eval_general( modin_dfs, pandas_dfs, test, iloc=[5, 4, 0, 10, 1, -1], ) eval_general( modin_dfs, pandas_dfs, test, iloc=slice(None, None, 2) ) # (start=None, stop=None, step=2) def 
test___setitem__single_item_in_series(df_mode_pair): # Test assigning a single item in a Series for issue # https://github.com/modin-project/modin/issues/3860 modin_series1, pandas_series1 = create_test_series_in_defined_mode( 99, native=df_mode_pair[0] ) modin_series2, pandas_series2 = create_test_series_in_defined_mode( 100, native=df_mode_pair[1] ) modin_series1[:1] = modin_series2 pandas_series1[:1] = pandas_series2 df_equals(modin_series1, pandas_series1) @pytest.mark.parametrize( "value", [ 1, np.int32(1), 1.0, "str val", pandas.Timestamp("1/4/2018"), np.datetime64(0, "ms"), True, ], ) def test_loc_boolean_assignment_scalar_dtypes(value, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode( { "a": [1, 2, 3], "b": [3.0, 5.0, 6.0], "c": ["a", "b", "c"], "d": [1.0, "c", 2.0], "e": pandas.to_datetime(["1/1/2018", "1/2/2018", "1/3/2018"]), "f": [True, False, True], }, native=df_mode_pair[1], ) modin_idx, pandas_idx = create_test_series_in_defined_mode( [False, True, True], native=df_mode_pair[1] ) modin_df.loc[modin_idx] = value pandas_df.loc[pandas_idx] = value df_equals(modin_df, pandas_df) # This is a very subtle bug that comes from: # https://github.com/modin-project/modin/issues/4945 def test_lazy_eval_index(df_mode_pair): data = {"col0": [0, 1]} def func(df1, df2): df_copy = df1[df2["col0"] < 6].copy() # The problem here is that the index is not copied over so it needs # to get recomputed at some point. Our implementation of __setitem__ # requires us to build a mask and insert the value from the right # handside into the new DataFrame. However, it's possible that we # won't have any new partitions, so we will end up computing an empty # index. 
df_copy["col0"] = df_copy["col0"].apply(lambda x: x + 1) return df_copy eval_general_interop(data, None, func, df_mode_pair=df_mode_pair) def test_index_of_empty_frame(df_mode_pair): # Test on an empty frame created by user # Test on an empty frame produced by Modin's logic data = test_data_values[0] md_df1, pd_df1 = create_test_df_in_defined_mode( data, index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"), native=df_mode_pair[0], ) md_df2, pd_df2 = create_test_df_in_defined_mode( data, index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"), native=df_mode_pair[1], ) md_res = md_df1.query(f"{md_df2.columns[0]} > {RAND_HIGH}") pd_res = pd_df1.query(f"{pd_df2.columns[0]} > {RAND_HIGH}") assert md_res.empty and pd_res.empty df_equals(md_res.index, pd_res.index) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_iter.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import warnings import matplotlib import pytest import modin.pandas as pd from modin.config import NPartitions from modin.pandas.utils import SET_DATAFRAME_ATTRIBUTE_WARNING from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, ) from modin.tests.pandas.utils import df_equals, eval_general NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") def test___setattr__mutating_column(df_mode_pair): # Use case from issue #4577 modin_df, pandas_df = create_test_df_in_defined_mode( [[1]], columns=["col0"], native=df_mode_pair[0] ) # Replacing a column with a list should mutate the column in place. pandas_df.col0 = [3] modin_df.col0 = [3] modin_ser, pandas_ser = create_test_series_in_defined_mode( [3], native=df_mode_pair[1] ) df_equals(modin_df, pandas_df) # Check that the col0 attribute reflects the value update. df_equals(modin_df.col0, pandas_df.col0) pandas_df.col0 = pandas_ser modin_df.col0 = modin_ser # Check that the col0 attribute reflects this update df_equals(modin_df, pandas_df) pandas_df.loc[0, "col0"] = 4 modin_df.loc[0, "col0"] = 4 # Check that the col0 attribute reflects update via loc df_equals(modin_df, pandas_df) assert modin_df.col0.equals(modin_df["col0"]) # Check that attempting to add a new col via attributes raises warning # and adds the provided list as a new attribute and not a column. with pytest.warns( UserWarning, match=SET_DATAFRAME_ATTRIBUTE_WARNING, ): modin_df.col1 = [4] with warnings.catch_warnings(): warnings.filterwarnings( action="error", message=SET_DATAFRAME_ATTRIBUTE_WARNING, ) modin_df.col1 = [5] modin_df.new_attr = 6 modin_df.col0 = 7 assert "new_attr" in dir( modin_df ), "Modin attribute was not correctly added to the df." assert ( "new_attr" not in modin_df ), "New attribute was not correctly added to columns." assert modin_df.new_attr == 6, "Modin attribute value was set incorrectly." 
assert isinstance( modin_df.col0, pd.Series ), "Scalar was not broadcasted properly to an existing column." def test_isin_with_modin_objects(df_mode_pair): modin_df1, pandas_df1 = create_test_df_in_defined_mode( {"a": [1, 2], "b": [3, 4]}, native=df_mode_pair[0] ) modin_series, pandas_series = create_test_series_in_defined_mode( [1, 4, 5, 6], native=df_mode_pair[1] ) eval_general( (modin_df1, modin_series), (pandas_df1, pandas_series), lambda srs: srs[0].isin(srs[1]), ) modin_df2 = modin_series.to_frame("a") pandas_df2 = pandas_series.to_frame("a") eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda srs: srs[0].isin(srs[1]), ) # Check case when indices are not matching modin_df1, pandas_df1 = create_test_df_in_defined_mode( {"a": [1, 2], "b": [3, 4]}, index=[10, 11], native=df_mode_pair[0], ) eval_general( (modin_df1, modin_series), (pandas_df1, pandas_series), lambda srs: srs[0].isin(srs[1]), ) eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda srs: srs[0].isin(srs[1]), ) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_join_sort.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import matplotlib import numpy as np import pandas import pytest from pytest import param import modin.pandas as pd from modin.config import NPartitions from modin.pandas.io import to_pandas from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, eval_general_interop, ) from modin.tests.pandas.utils import ( default_to_pandas_ignore_string, df_equals, eval_general, random_state, test_data_keys, test_data_values, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) # Initialize env for storage format detection in @pytest.mark.* pd.DataFrame() def df_equals_and_sort(df1, df2): """Sort dataframe's rows and run ``df_equals()`` for them.""" df1 = df1.sort_values(by=df1.columns.tolist(), ignore_index=True) df2 = df2.sort_values(by=df2.columns.tolist(), ignore_index=True) df_equals(df1, df2) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_combine(data, df_mode_pair): modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( data, native=df_mode_pair[0] ) modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( data, native=df_mode_pair[1] ) modin_df_1.combine( modin_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 ) pandas_df_1.combine( pandas_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 ) @pytest.mark.parametrize( "test_data, test_data2", [ ( np.random.randint(0, 100, size=(64, 64)), np.random.randint(0, 100, size=(128, 64)), ), ( np.random.randint(0, 100, size=(128, 64)), np.random.randint(0, 100, size=(64, 64)), ), ( np.random.randint(0, 100, size=(64, 64)), np.random.randint(0, 100, size=(64, 128)), ), ( np.random.randint(0, 100, size=(64, 
128)), np.random.randint(0, 100, size=(64, 64)), ), ], ) def test_join(test_data, test_data2, df_mode_pair): modin_df, pandas_df = create_test_df_in_defined_mode( test_data, columns=["col{}".format(i) for i in range(test_data.shape[1])], index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), native=df_mode_pair[0], ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( test_data2, columns=["col{}".format(i) for i in range(test_data2.shape[1])], index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), native=df_mode_pair[1], ) hows = ["inner", "left", "right", "outer"] ons = ["col33", "col34"] sorts = [False, True] assert len(ons) == len(sorts), "the loop below is designed for this condition" for i in range(len(hows)): for j in range(len(ons)): modin_result = modin_df.join( modin_df2, how=hows[i], on=ons[j], sort=sorts[j], lsuffix="_caller", rsuffix="_other", ) pandas_result = pandas_df.join( pandas_df2, how=hows[i], on=ons[j], sort=sorts[j], lsuffix="_caller", rsuffix="_other", ) if sorts[j]: # sorting in `join` is implemented through range partitioning technique # therefore the order of the rows after it does not match the pandas, # so additional sorting is needed in order to get the same result as for pandas df_equals_and_sort(modin_result, pandas_result) else: df_equals(modin_result, pandas_result) frame_data = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6], } modin_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) frame_data2 = {"col5": [0], "col6": [1]} modin_df2 = pd.DataFrame(frame_data2) pandas_df2 = pandas.DataFrame(frame_data2) join_types = ["left", "right", "outer", "inner"] for how in join_types: modin_join = modin_df.join(modin_df2, how=how) pandas_join = pandas_df.join(pandas_df2, how=how) df_equals(modin_join, pandas_join) frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} modin_df3 = pd.DataFrame(frame_data3) pandas_df3 = pandas.DataFrame(frame_data3) 
join_types = ["left", "outer", "inner"] for how in join_types: modin_join = modin_df.join([modin_df2, modin_df3], how=how) pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) df_equals(modin_join, pandas_join) def test_join_cross_6786(df_mode_pair): data = [[7, 8, 9], [10, 11, 12]] modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( data, columns=["x", "y", "z"], native=df_mode_pair[0] ) modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( data, columns=["x", "y", "z"], native=df_mode_pair[1] ) modin_join = modin_df_1.join( modin_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" ) pandas_join = pandas_df_1.join( pandas_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" ) df_equals(modin_join, pandas_join) @pytest.mark.parametrize( "test_data, test_data2", [ ( np.random.randint(0, 100, size=(64, 64)), np.random.randint(0, 100, size=(128, 64)), ), ( np.random.randint(0, 100, size=(128, 64)), np.random.randint(0, 100, size=(64, 64)), ), ( np.random.randint(0, 100, size=(64, 64)), np.random.randint(0, 100, size=(64, 128)), ), ( np.random.randint(0, 100, size=(64, 128)), np.random.randint(0, 100, size=(64, 64)), ), ], ) @pytest.mark.parametrize( "merge_with_on, merge_with_left_on_right_on", [ param( lambda df1, df2, *, lib, how, sort, on=None: df1.merge( df2, how=how, on=on, sort=sort ), lambda df1, df2, *, lib, how, sort: df1.merge( df2, how=how, left_on="key", right_on="key", sort=sort ), id="merge_with_dataframe_method", ), param( lambda df1, df2, *, lib, how, sort, on=None: lib.merge( df1, df2, how=how, on=on, sort=sort, ), lambda df1, df2, *, lib, how, sort: lib.merge( df1, df2, how=how, left_on="key", right_on="key", sort=sort ), id="merge_with_general_function", ), ], ) def test_merge( test_data, test_data2, df_mode_pair, merge_with_on, merge_with_left_on_right_on, ): modin_df, pandas_df = create_test_df_in_defined_mode( test_data, columns=["col{}".format(i) for i in range(test_data.shape[1])], 
index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), native=df_mode_pair[0], ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( test_data2, columns=["col{}".format(i) for i in range(test_data2.shape[1])], index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), native=df_mode_pair[1], ) hows = ["left", "inner", "right"] ons = ["col33", ["col33", "col34"]] sorts = [False, True] assert len(ons) == len(sorts), "the loop below is designed for this condition" for i in range(len(hows)): for j in range(len(ons)): modin_result = merge_with_on( modin_df, modin_df2, how=hows[i], on=ons[j], sort=sorts[j], lib=pd ) pandas_result = merge_with_on( pandas_df, pandas_df2, how=hows[i], on=ons[j], sort=sorts[j], lib=pandas ) # FIXME: https://github.com/modin-project/modin/issues/2246 df_equals_and_sort(modin_result, pandas_result) modin_result = merge_with_left_on_right_on( modin_df, modin_df2, how=hows[i], sort=sorts[j], lib=pd ) pandas_result = merge_with_left_on_right_on( pandas_df, pandas_df2, how=hows[i], sort=sorts[j], lib=pandas ) # FIXME: https://github.com/modin-project/modin/issues/2246 df_equals_and_sort(modin_result, pandas_result) @pytest.mark.parametrize("how", ["left", "inner", "right"]) def test_merge_empty( how, df_mode_pair, ): data = np.random.randint(0, 100, size=(64, 64)) eval_general_interop( data, None, lambda df1, df2: df1.merge(df2.iloc[:0], how=how), df_mode_pair, ) def test_merge_with_mi_columns(df_mode_pair): modin_df1, pandas_df1 = create_test_df_in_defined_mode( { ("col0", "a"): [1, 2, 3, 4], ("col0", "b"): [2, 3, 4, 5], ("col1", "a"): [3, 4, 5, 6], }, native=df_mode_pair[0], ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( { ("col0", "a"): [1, 2, 3, 4], ("col0", "c"): [2, 3, 4, 5], ("col1", "a"): [3, 4, 5, 6], }, native=df_mode_pair[1], ) eval_general( (modin_df1, modin_df2), (pandas_df1, pandas_df2), lambda dfs: dfs[0].merge(dfs[1], on=[("col0", "a")]), ) def test_where(df_mode_pair): columns = 
list("abcdefghij") frame_data = random_state.randn(100, 10) modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( frame_data, columns=columns, native=df_mode_pair[0] ) modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( frame_data, columns=columns, native=df_mode_pair[1] ) pandas_cond_df = pandas_df_2 % 5 < 2 modin_cond_df = modin_df_2 % 5 < 2 pandas_result = pandas_df_1.where(pandas_cond_df, -pandas_df_2) modin_result = modin_df_1.where(modin_cond_df, -modin_df_2) assert all((to_pandas(modin_result) == pandas_result).all()) # test case when other is Series other_data = random_state.randn(len(pandas_df_1)) modin_other, pandas_other = create_test_series_in_defined_mode( other_data, native=df_mode_pair[0] ) pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other, axis=0) modin_result = modin_df_1.where(modin_cond_df, modin_other, axis=0) df_equals(modin_result, pandas_result) # Test that we choose the right values to replace when `other` == `True` # everywhere. other_data = np.full(shape=pandas_df_1.shape, fill_value=True) modin_other, pandas_other = create_test_df_in_defined_mode( other_data, columns=columns, native=df_mode_pair[0] ) pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other) modin_result = modin_df_1.where(modin_cond_df, modin_other) df_equals(modin_result, pandas_result) other = pandas_df_1.loc[3] pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=1) modin_result = modin_df_1.where(modin_cond_df, other, axis=1) assert all((to_pandas(modin_result) == pandas_result).all()) other = pandas_df_1["e"] pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=0) modin_result = modin_df_1.where(modin_cond_df, other, axis=0) assert all((to_pandas(modin_result) == pandas_result).all()) pandas_result = pandas_df_1.where(pandas_df_2 < 2, True) modin_result = modin_df_1.where(modin_df_2 < 2, True) assert all((to_pandas(modin_result) == pandas_result).all()) @pytest.mark.parametrize("align_axis", ["index", "columns"]) 
@pytest.mark.parametrize("keep_shape", [False, True]) @pytest.mark.parametrize("keep_equal", [False, True]) def test_compare(align_axis, keep_shape, keep_equal, df_mode_pair): kwargs = { "align_axis": align_axis, "keep_shape": keep_shape, "keep_equal": keep_equal, } frame_data1 = random_state.randn(100, 10) frame_data2 = random_state.randn(100, 10) modin_df, pandas_df = create_test_df_in_defined_mode( frame_data1, columns=list("abcdefghij"), native=df_mode_pair[0] ) modin_df2, pandas_df2 = create_test_df_in_defined_mode( frame_data2, columns=list("abcdefghij"), native=df_mode_pair[0] ) modin_result = modin_df.compare(modin_df2, **kwargs) pandas_result = pandas_df.compare(pandas_df2, **kwargs) assert to_pandas(modin_result).equals(pandas_result) modin_result = modin_df2.compare(modin_df, **kwargs) pandas_result = pandas_df2.compare(pandas_df, **kwargs) assert to_pandas(modin_result).equals(pandas_result) series_data1 = ["a", "b", "c", "d", "e"] series_data2 = ["a", "a", "c", "b", "e"] modin_series1, pandas_series1 = create_test_series_in_defined_mode( series_data1, native=df_mode_pair[0] ) modin_series2, pandas_series2 = create_test_series_in_defined_mode( series_data2, native=df_mode_pair[1] ) modin_result = modin_series1.compare(modin_series2, **kwargs) pandas_result = pandas_series1.compare(pandas_series2, **kwargs) assert to_pandas(modin_result).equals(pandas_result) modin_result = modin_series2.compare(modin_series1, **kwargs) pandas_result = pandas_series2.compare(pandas_series1, **kwargs) assert to_pandas(modin_result).equals(pandas_result) ================================================ FILE: modin/tests/pandas/native_df_interoperability/test_map_metadata.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import matplotlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions from modin.tests.pandas.native_df_interoperability.utils import ( create_test_df_in_defined_mode, create_test_series_in_defined_mode, ) from modin.tests.pandas.utils import ( RAND_HIGH, RAND_LOW, axis_keys, axis_values, default_to_pandas_ignore_string, df_equals, eval_general, name_contains, numeric_dfs, random_state, test_data, test_data_keys, test_data_values, ) NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. 
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)


def eval_insert(modin_df, pandas_df, **kwargs):
    """Run ``DataFrame.insert`` in place on both frames through ``eval_general``.

    ``kwargs`` are forwarded to ``insert``; a ``col`` keyword is accepted as an
    alias for ``column``. ``loc`` defaults to ``0`` and ``column`` to
    ``"New column"`` when not supplied.
    """
    # Accept `col` as a shorthand, but never override an explicit `column`.
    if "col" in kwargs and "column" not in kwargs:
        kwargs["column"] = kwargs.pop("col")
    _kwargs = {"loc": 0, "column": "New column"}
    _kwargs.update(kwargs)

    eval_general(
        modin_df,
        pandas_df,
        operation=lambda df, **kwargs: df.insert(**kwargs),
        __inplace__=True,
        **_kwargs,
    )


def test_empty_df(df_mode_pair):
    """Assign a Series as a new column of a frame created from ``None``."""
    modin_df, pd_df = create_test_df_in_defined_mode(None, native=df_mode_pair[0])
    md_series, pd_series = create_test_series_in_defined_mode(
        [1, 2, 3, 4, 5], native=df_mode_pair[1]
    )
    modin_df["a"] = md_series
    pd_df["a"] = pd_series
    df_equals(modin_df, pd_df)


def test_astype(df_mode_pair):
    """Check that ``astype`` with a duplicate-indexed dtype Series raises ``ValueError``."""
    td = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]]
    modin_df, pandas_df = create_test_df_in_defined_mode(
        td.values,
        index=td.index,
        columns=td.columns,
        native=df_mode_pair[0],
    )

    def astype_func(df):
        # The dtype mapping is built with `native=df_mode_pair[1]`; the Modin
        # variant is handed to Modin frames and the pandas one to pandas frames.
        md_ser, pd_ser = create_test_series_in_defined_mode(
            [str, str], index=["col1", "col1"], native=df_mode_pair[1]
        )
        if isinstance(df, pd.DataFrame):
            return df.astype(md_ser)
        else:
            return df.astype(pd_ser)

    # The dtypes series must have a unique index.
    eval_general(
        modin_df,
        pandas_df,
        astype_func,
        expected_exception=ValueError(
            "cannot reindex on an axis with duplicate labels"
        ),
    )


###########################################################################


def test_convert_dtypes_5653(df_mode_pair):
    """Check ``convert_dtypes`` on a concatenation of a string and an all-``None`` column."""
    modin_part1, _ = create_test_df_in_defined_mode(
        {"col1": ["a", "b", "c", "d"]}, native=df_mode_pair[0]
    )
    modin_part2, _ = create_test_df_in_defined_mode(
        {"col1": [None, None, None, None]}, native=df_mode_pair[1]
    )
    modin_df = pd.concat([modin_part1, modin_part2])
    # The partition-shape precondition is only checked for the "Pandas" storage format.
    if modin_df._query_compiler.storage_format == "Pandas":
        assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1)
    modin_df = modin_df.convert_dtypes()
    assert len(modin_df.dtypes) == 1
    assert modin_df.dtypes.iloc[0] == "string"


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
@pytest.mark.exclude_in_sanity
def test_clip(request, data, axis, bound_type, df_mode_pair):
    """Compare ``DataFrame.clip`` with per-label bounds given as arrays or Series.

    The body only runs when the test id matches ``numeric_dfs``.
    """
    modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0])

    if name_contains(request.node.name, numeric_dfs):
        # One bound per label along `axis`: index length for axis 0,
        # number of columns otherwise.
        ind_len = (
            len(modin_df.index)
            if not pandas.DataFrame()._get_axis_number(axis)
            else len(modin_df.columns)
        )

        lower = random_state.randint(RAND_LOW, RAND_HIGH, ind_len)
        upper = random_state.randint(RAND_LOW, RAND_HIGH, ind_len)

        if bound_type == "series":
            # The two bounds are created in different modes of the pair.
            modin_lower, pandas_lower = create_test_series_in_defined_mode(
                lower, native=df_mode_pair[1]
            )
            modin_upper, pandas_upper = create_test_series_in_defined_mode(
                upper, native=df_mode_pair[0]
            )
        else:
            modin_lower = pandas_lower = lower
            modin_upper = pandas_upper = upper

        # test lower and upper list bound on each column
        modin_result = modin_df.clip(modin_lower, modin_upper, axis=axis)
        pandas_result = pandas_df.clip(pandas_lower, pandas_upper, axis=axis)
        df_equals(modin_result, pandas_result)

        # test only upper list bound on each column
        modin_result = modin_df.clip(np.nan, modin_upper, axis=axis)
        pandas_result = pandas_df.clip(np.nan, pandas_upper, axis=axis)
        df_equals(modin_result, pandas_result)

        with pytest.raises(ValueError):
            modin_df.clip(lower=[1, 2, 3], axis=None)


@pytest.mark.parametrize(
    "data, other_data",
    [
        ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}),
        ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}),
        (
            {"A": ["a", "b", "c"], "B": ["x", "y", "z"]},
            {"B": ["d", "e", "f", "g", "h", "i"]},
        ),
        ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}),
    ],
)
@pytest.mark.parametrize("errors", ["raise", "ignore"])
def test_update(data, other_data, errors, df_mode_pair):
    """Compare in-place ``DataFrame.update`` where ``other`` comes from the second mode."""
    modin_df, pandas_df = create_test_df_in_defined_mode(data, native=df_mode_pair[0])
    other_modin_df, other_pandas_df = create_test_df_in_defined_mode(
        other_data, native=df_mode_pair[1]
    )
    expected_exception = None
    if errors == "raise":
        expected_exception = ValueError("Data overlaps.")
    eval_general(
        modin_df,
        pandas_df,
        # Pair each frame with the `other` of its own library.
        lambda df: (
            df.update(other_modin_df, errors=errors)
            if isinstance(df, pd.DataFrame)
            else df.update(other_pandas_df, errors=errors)
        ),
        __inplace__=True,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize(
    "get_index",
    [
        pytest.param(lambda idx: None, id="None_idx"),
        pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"),
        pytest.param(lambda idx: idx, id="Equal_idx"),
        pytest.param(lambda idx: idx[::-1], id="Reversed_idx"),
    ],
)
@pytest.mark.parametrize(
    "get_columns",
    [
        pytest.param(lambda idx: None, id="None_idx"),
        pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"),
        pytest.param(lambda idx: idx, id="Equal_idx"),
        pytest.param(lambda idx: idx[::-1], id="Reversed_idx"),
    ],
)
@pytest.mark.parametrize("dtype", [None, "str"])
@pytest.mark.exclude_in_sanity
def test_constructor_from_modin_series(get_index, get_columns, dtype, df_mode_pair):
    """Build a DataFrame from a dict of Series taken from an existing frame.

    ``get_index`` / ``get_columns`` derive the ``index`` and ``columns``
    constructor arguments from the source frame's index and the new column
    names. Only ``df_mode_pair[0]`` is used by this test.
    """
    modin_df, pandas_df = create_test_df_in_defined_mode(
        test_data_values[0], native=df_mode_pair[0]
    )

    modin_data = {f"new_col{i}": modin_df.iloc[:, i] for i in range(modin_df.shape[1])}
    pandas_data = {
        f"new_col{i}": pandas_df.iloc[:, i] for i in range(pandas_df.shape[1])
    }

    index = get_index(modin_df.index)
    columns = get_columns(list(modin_data.keys()))

    new_modin = pd.DataFrame(modin_data, index=index, columns=columns, dtype=dtype)
    new_pandas = pandas.DataFrame(
        pandas_data, index=index, columns=columns, dtype=dtype
    )
    df_equals(new_modin, new_pandas)


================================================
FILE: modin/tests/pandas/native_df_interoperability/test_pickle.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import numpy as np
import pytest

import modin.pandas as pd
from modin.config import PersistentPickle
from modin.tests.pandas.native_df_interoperability.utils import (
    create_test_df_in_defined_mode,
)
from modin.tests.pandas.utils import df_equals


@pytest.fixture
def modin_df():
    """Return a two-column Modin frame with 1000 rows."""
    return pd.DataFrame({"col1": np.arange(1000), "col2": np.arange(2000, 3000)})


@pytest.fixture
def modin_column(modin_df):
    """Return the ``col1`` column of the ``modin_df`` fixture."""
    return modin_df["col1"]


@pytest.fixture(params=[True, False])
def persistent(request):
    """Set ``PersistentPickle`` to the parametrized value and restore it afterwards."""
    old = PersistentPickle.get()
    PersistentPickle.put(request.param)
    yield request.param
    PersistentPickle.put(old)


def test__reduce__(df_mode_pair):
    """Use one frame inside a lambda applied to a column of another frame.

    The looked-up frame is created with ``native=df_mode_pair[0]`` and the
    frame whose column is mapped with ``native=df_mode_pair[1]``.
    """
    # `DataFrame.__reduce__` will be called implicitly when lambda expressions are
    # pre-processed for the distributed engine.
    dataframe_data = ["Major League Baseball", "National Basketball Association"]
    abbr_md, abbr_pd = create_test_df_in_defined_mode(
        dataframe_data, index=["MLB", "NBA"], native=df_mode_pair[0]
    )

    dataframe_data = {
        "name": ["Mariners", "Lakers"] * 500,
        "league_abbreviation": ["MLB", "NBA"] * 500,
    }
    teams_md, teams_pd = create_test_df_in_defined_mode(
        dataframe_data, native=df_mode_pair[1]
    )

    # The lambda closes over `abbr_md` / `abbr_pd`.
    result_md = (
        teams_md.set_index("name")
        .league_abbreviation.apply(lambda abbr: abbr_md[0].loc[abbr])
        .rename("league")
    )

    result_pd = (
        teams_pd.set_index("name")
        .league_abbreviation.apply(lambda abbr: abbr_pd[0].loc[abbr])
        .rename("league")
    )
    df_equals(result_md, result_pd)


================================================
FILE: modin/tests/pandas/native_df_interoperability/test_window.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import matplotlib
import numpy as np
import pandas

import modin.pandas as pd
from modin.config import NPartitions
from modin.tests.pandas.native_df_interoperability.utils import (
    create_test_df_in_defined_mode,
)
from modin.tests.pandas.utils import df_equals

NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")


def test_fillna_4660(df_mode_pair):
    """Fill a ``pd.NA`` column of one frame with a column taken from the other frame."""
    modin_df_1, pandas_df_1 = create_test_df_in_defined_mode(
        {"a": ["a"], "b": ["b"], "c": [pd.NA]},
        index=["row1"],
        native=df_mode_pair[0],
    )
    modin_df_2, pandas_df_2 = create_test_df_in_defined_mode(
        {"a": ["a"], "b": ["b"], "c": [pd.NA]},
        index=["row1"],
        native=df_mode_pair[1],
    )
    modin_result = modin_df_1["c"].fillna(modin_df_2["b"])
    pandas_result = pandas_df_1["c"].fillna(pandas_df_2["b"])
    df_equals(modin_result, pandas_result)


def test_fillna_dict_series(df_mode_pair):
    """Compare ``fillna`` with a dict value and with a Series value.

    The dict cases use frames built directly with ``pd.DataFrame`` /
    ``pandas.DataFrame``; only the Series case mixes the two modes of
    ``df_mode_pair``.
    """
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    modin_df_1, pandas_df_1 = create_test_df_in_defined_mode(
        frame_data, native=df_mode_pair[0]
    )
    modin_df_2, pandas_df_2 = create_test_df_in_defined_mode(
        frame_data, native=df_mode_pair[1]
    )

    df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5}))

    # The dict contains a key ("d") that is not a column of the frame.
    df_equals(
        modin_df.fillna({"a": 0, "b": 5, "d": 7}),
        df.fillna({"a": 0, "b": 5, "d": 7}),
    )

    # Series treated same as dict
    df_equals(
        modin_df_1.fillna(modin_df_2.max()), pandas_df_1.fillna(pandas_df_2.max())
    )


def test_fillna_dataframe(df_mode_pair):
    """Compare ``fillna`` with a DataFrame value whose labels only partly overlap."""
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    modin_df_1, pandas_df_1 = create_test_df_in_defined_mode(
        frame_data, index=list("VWXYZ"), native=df_mode_pair[0]
    )
    modin_df_2, pandas_df_2 = create_test_df_in_defined_mode(
        {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5},
        index=list("VWXuZ"),
        native=df_mode_pair[1],
    )

    # only those columns and indices which are shared get filled
    df_equals(modin_df_1.fillna(modin_df_2), pandas_df_1.fillna(pandas_df_2))


================================================
FILE: modin/tests/pandas/native_df_interoperability/utils.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
from contextlib import contextmanager, nullcontext

from modin import set_execution
from modin.config import Engine, StorageFormat
from modin.config import context as config_context
from modin.config.envvars import Backend
from modin.tests.pandas.utils import (
    NoModinException,
    create_test_dfs,
    create_test_series,
    df_equals,
)
from modin.tests.test_utils import current_execution_is_native
from modin.utils import try_cast_to_pandas


@contextmanager
def switch_to_native_execution():
    """
    Run the enclosed code with the "Native" engine and storage format.

    The engine and storage format that were active on entry are restored on
    exit, including when the enclosed code raises.
    """
    engine = Engine.get()
    storage_format = StorageFormat.get()
    try:
        set_execution("Native", "Native")
        yield
    finally:
        set_execution(engine=engine, storage_format=storage_format)


@contextmanager
def _defined_mode(native):
    """
    Enter the configuration in which a test object of the requested mode is built.

    Parameters
    ----------
    native : bool
        True switches to native execution and the "Pandas" backend; False
        keeps the current execution and the backend from ``Backend.get()``.

    Raises
    ------
    AssertionError
        If the current execution is already native.
    ValueError
        If `native` is not a bool.
    """
    assert not current_execution_is_native(), "already in native dataframe mode."
    if not isinstance(native, bool):
        raise ValueError("`native` should be True or False.")
    # Use the default backend unless native
    hybrid_backend = "Pandas" if native else Backend.get()
    with switch_to_native_execution() if native else nullcontext():
        with config_context(AutoSwitchBackend=False, Backend=hybrid_backend):
            yield


def create_test_df_in_defined_mode(
    *args, post_fn=None, backend=None, native=None, **kwargs
):
    """
    Build a (modin, pandas) DataFrame pair via ``create_test_dfs`` in the given mode.

    Parameters
    ----------
    *args, **kwargs
        Forwarded to ``create_test_dfs``.
    post_fn, backend
        Forwarded to ``create_test_dfs`` under the same names.
    native : bool
        Whether the objects are created under native execution; must be a bool.

    Returns
    -------
    tuple
        ``(modin_df, pandas_df)`` as returned by ``create_test_dfs``.
    """
    with _defined_mode(native):
        modin_df, pandas_df = create_test_dfs(
            *args, post_fn=post_fn, backend=backend, **kwargs
        )
    return modin_df, pandas_df


def create_test_series_in_defined_mode(
    vals, sort=False, backend=None, native=None, **kwargs
):
    """
    Build a (modin, pandas) Series pair via ``create_test_series`` in the given mode.

    Parameters
    ----------
    vals, sort, backend, **kwargs
        Forwarded to ``create_test_series``.
    native : bool
        Whether the objects are created under native execution; must be a bool.

    Returns
    -------
    tuple
        ``(modin_ser, pandas_ser)`` as returned by ``create_test_series``.
    """
    with _defined_mode(native):
        modin_ser, pandas_ser = create_test_series(
            vals, sort=sort, backend=backend, **kwargs
        )
    return modin_ser, pandas_ser


def eval_general_interop(
    data,
    backend,
    operation,
    df_mode_pair,
    comparator=df_equals,
    __inplace__=False,
    expected_exception=None,
    check_kwargs_callable=True,
    md_extra_kwargs=None,
    comparator_kwargs=None,
    **kwargs,
):
    """
    Run a two-frame operation on Modin and pandas objects and compare the outcome.

    Two (modin, pandas) frame pairs are built from `data`, one per entry of
    `df_mode_pair`. `operation` is called as ``operation(df1, df2, **kwargs)``
    on the pandas pair and on the Modin pair, and the results are handed to
    `comparator`. When pandas raises, Modin must raise a compatible exception.

    Parameters
    ----------
    data, backend
        Passed to ``create_test_df_in_defined_mode`` for both frame pairs.
    operation : callable
        The operation under test.
    df_mode_pair : tuple of bool
        ``native`` flag for the first and for the second frame pair.
    comparator : callable, default: df_equals
        Called with the Modin result, the pandas result and `comparator_kwargs`.
    __inplace__ : bool, default: False
        If True the first frames themselves are compared instead of the
        values returned by `operation`.
    expected_exception : Exception or False, optional
        Exception instance both libraries must raise (type and ``args`` are
        checked). ``False`` disables the message check. With ``None``, an
        exception raised by both libraries is re-raised.
    check_kwargs_callable : bool, default: True
        If True, callable values in `kwargs` are first evaluated on both
        frame pairs and replaced by their results.
    md_extra_kwargs : dict, optional
        Extra keyword arguments passed to the Modin call only.
    comparator_kwargs : dict, optional
        Keyword arguments for `comparator`.
    **kwargs
        Keyword arguments for `operation`.
    """
    df1_native, df2_native = df_mode_pair
    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
        data, backend=backend, native=df1_native
    )
    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
        data, backend=backend, native=df2_native
    )
    md_kwargs, pd_kwargs = {}, {}

    def execute_callable(fn, inplace=False, md_kwargs=None, pd_kwargs=None):
        """Call `fn` on both frame pairs; return None if both raised acceptably."""
        # ``None`` defaults instead of shared mutable ``{}`` defaults.
        md_kwargs = {} if md_kwargs is None else md_kwargs
        pd_kwargs = {} if pd_kwargs is None else pd_kwargs
        try:
            pd_result = fn(pandas_df1, pandas_df2, **pd_kwargs)
        except Exception as pd_e:
            try:
                if inplace:
                    _ = fn(modin_df1, modin_df2, **md_kwargs)
                    try_cast_to_pandas(modin_df1)  # force materialization
                else:
                    try_cast_to_pandas(
                        fn(modin_df1, modin_df2, **md_kwargs)
                    )  # force materialization
            except Exception as md_e:
                assert isinstance(
                    md_e, type(pd_e)
                ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format(
                    type(md_e), type(pd_e)
                )
                if expected_exception:
                    if Engine.get() == "Ray":
                        from ray.exceptions import RayTaskError

                        # unwrap ray exceptions from remote worker
                        if isinstance(md_e, RayTaskError):
                            md_e = md_e.args[0]
                    assert (
                        type(md_e) is type(expected_exception)
                        and md_e.args == expected_exception.args
                    ), f"not acceptable Modin's exception: [{repr(md_e)}] expected {expected_exception}"
                    assert (
                        pd_e.args == expected_exception.args
                    ), f"not acceptable Pandas' exception: [{repr(pd_e)}]"
                elif expected_exception is False:
                    # The only way to disable exception message checking.
                    pass
                else:
                    # It's not enough that Modin and pandas have the same types of exceptions;
                    # we need to explicitly specify the instance of an exception
                    # (using `expected_exception`) in tests so that we can check exception messages.
                    # This allows us to eliminate situations where exceptions are thrown
                    # that we don't expect, which could hide different bugs.
                    raise pd_e
            else:
                raise NoModinException(
                    f"Modin doesn't throw an exception, while pandas does: [{repr(pd_e)}]"
                )
        else:
            md_result = fn(modin_df1, modin_df2, **md_kwargs)
            return (md_result, pd_result) if not inplace else (modin_df1, pandas_df1)

    for key, value in kwargs.items():
        if check_kwargs_callable and callable(value):
            values = execute_callable(value)
            # that means, that callable raised an exception
            if values is None:
                return
            else:
                md_value, pd_value = values
        else:
            md_value, pd_value = value, value

        md_kwargs[key] = md_value
        pd_kwargs[key] = pd_value

    if md_extra_kwargs:
        assert isinstance(md_extra_kwargs, dict)
        md_kwargs.update(md_extra_kwargs)

    values = execute_callable(
        operation, md_kwargs=md_kwargs, pd_kwargs=pd_kwargs, inplace=__inplace__
    )
    if values is not None:
        comparator(*values, **(comparator_kwargs or {}))


# ================================================
# FILE: modin/tests/pandas/test_api.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
# See the License for the specific language
# governing permissions and limitations under the License.

import inspect

import numpy as np
import pandas
import pytest

import modin.pandas as pd

_MODIN_EXTRA_ATTRIBUTES = (
    # modin - namespace for accessing additional Modin functions that are not available in Pandas
    "modin",
    # get_backend - get storage and engine backend for the current DataFrame
    "get_backend",
    # set_backend - set storage and engine backend for the current DataFrame
    "set_backend",
    # move_to - set storage and engine backend for the current DataFrame
    "move_to",
    # is_backend_pinned, pin_backend, unpin_backend - change automatic switching behavior
    "is_backend_pinned",
    "pin_backend",
    "unpin_backend",
)


def _public_names(obj):
    """Return the names from ``dir(obj)`` that do not start with an underscore."""
    return [name for name in dir(obj) if name[0] != "_"]


def _assert_api_names_match(pandas_obj, modin_obj, ignore_missing=(), ignore_extra=()):
    """
    Assert that `pandas_obj` and `modin_obj` expose the same public names.

    Parameters
    ----------
    pandas_obj, modin_obj : object
        The pandas object and its Modin counterpart.
    ignore_missing : iterable of str, default: ()
        pandas names that Modin is allowed to lack.
    ignore_extra : iterable of str, default: ()
        Modin names that pandas is allowed to lack.

    Returns
    -------
    list of str
        The public names of `modin_obj`.
    """
    pandas_names = set(_public_names(pandas_obj))
    modin_names = _public_names(modin_obj)

    missing_from_modin = pandas_names - set(modin_names) - set(ignore_missing)
    # Report the offending names themselves, not just how many there are.
    assert not missing_from_modin, "Differences found in API: {}".format(
        sorted(missing_from_modin)
    )
    extra_in_modin = set(modin_names) - pandas_names - set(ignore_extra)
    assert not extra_in_modin, "Differences found in API: {}".format(
        sorted(extra_in_modin)
    )
    return modin_names


def test_top_level_api_equality():
    """The ``modin.pandas`` namespace mirrors ``pandas``: same names, same signatures."""
    ignore_pandas = [
        "annotations",
        "np",
        "tests",
        "pandas",
        "core",
        "compat",
        "util",
        "offsets",
        "datetime",
        "api",
        "tseries",
        "to_msgpack",  # This one is experimental, and doesn't look finished
        "Panel",  # This is deprecated and throws a warning every time.
    ]
    ignore_modin = [
        "indexing",
        "iterator",
        "series",
        "accessor",
        "base",
        "utils",
        "dataframe",
        "groupby",
        "general",
        "datetime",
        "warnings",
        "os",
        "series_utils",
        "window",
    ]
    _assert_api_names_match(
        pandas, pd, ignore_missing=ignore_pandas, ignore_extra=ignore_modin
    )

    allowed_different = ["Interval", "datetime", "StringDtype"]
    # Modin callables may additionally accept ``query_compiler``.
    assert_parameters_eq(
        (pandas, pd),
        sorted(set(_public_names(pandas)) - set(ignore_pandas)),
        allowed_different,
        ignored_extra_params=("query_compiler",),
    )


def test_dataframe_api_equality():
    """``DataFrame`` has the same public names and signatures in both libraries."""
    modin_dir = _assert_api_names_match(
        pandas.DataFrame,
        pd.DataFrame,
        ignore_missing=["timetuple"],
        ignore_extra=_MODIN_EXTRA_ATTRIBUTES,
    )
    assert_parameters_eq(
        (pandas.DataFrame, pd.DataFrame),
        modin_dir,
        allowed_different=_MODIN_EXTRA_ATTRIBUTES,
    )


def test_series_str_api_equality():
    """``Series.str`` has the same public names and signatures in both libraries."""
    modin_dir = _assert_api_names_match(pandas.Series.str, pd.Series.str)
    assert_parameters_eq((pandas.Series.str, pd.Series.str), modin_dir, [])


def test_series_dt_api_equality():
    """``Series.dt`` has the same public names and signatures in both libraries."""
    # should be deleted, but for some reason the check fails
    # https://github.com/pandas-dev/pandas/pull/33595
    ignore = ["week", "weekofyear"]
    modin_dir = _assert_api_names_match(
        pandas.Series.dt, pd.Series.dt, ignore_missing=ignore
    )
    assert_parameters_eq((pandas.Series.dt, pd.Series.dt), modin_dir, [])


def test_series_cat_api_equality():
    """``Series.cat`` has the same public names in both libraries."""
    modin_dir = _assert_api_names_match(pandas.Series.cat, pd.Series.cat)
    # all methods of `pandas.Series.cat` don't have any information about parameters,
    # just method(*args, **kwargs)
    assert_parameters_eq((pandas.core.arrays.Categorical, pd.Series.cat), modin_dir, [])


@pytest.mark.parametrize("obj", ["DataFrame", "Series"])
def test_sparse_accessor_api_equality(obj):
    """The ``sparse`` accessor has the same public names in both libraries."""
    _assert_api_names_match(getattr(pandas, obj).sparse, getattr(pd, obj).sparse)


@pytest.mark.parametrize("obj", ["SeriesGroupBy", "DataFrameGroupBy"])
def test_groupby_api_equality(obj):
    """The groupby classes have the same public names and signatures."""
    pandas_cls = getattr(pandas.core.groupby, obj)
    modin_cls = getattr(pd.groupby, obj)
    # These attributes are not mentioned in the pandas documentation,
    # but we might want to implement them someday.
    ignore_missing = ["keys", "level", "grouper"]
    # FIXME: wrong inheritance
    ignore = (
        {"boxplot", "corrwith", "dtypes"} if obj == "SeriesGroupBy" else {"boxplot"}
    ) | set(_MODIN_EXTRA_ATTRIBUTES)
    modin_dir = _assert_api_names_match(
        pandas_cls, modin_cls, ignore_missing=ignore_missing, ignore_extra=ignore
    )
    assert_parameters_eq((pandas_cls, modin_cls), modin_dir, ignore)


def test_series_api_equality():
    """``Series`` has the same public names and signatures in both libraries."""
    modin_dir = _assert_api_names_match(
        pandas.Series,
        pd.Series,
        ignore_missing=["timetuple"],
        ignore_extra=_MODIN_EXTRA_ATTRIBUTES,
    )
    assert_parameters_eq(
        (pandas.Series, pd.Series), modin_dir, allowed_different=_MODIN_EXTRA_ATTRIBUTES
    )


def _signature_parameters(obj, name):
    """
    Return ``{parameter name: inspect.Parameter}`` for the attribute ``obj.<name>``.

    Returns None when ``inspect.signature`` cannot describe the attribute: it
    raises ``TypeError`` for unsupported objects and ``ValueError`` when no
    signature can be provided.
    """
    try:
        return dict(inspect.signature(getattr(obj, name)).parameters)
    except (TypeError, ValueError):
        return None


def assert_parameters_eq(
    objects, attributes, allowed_different, ignored_extra_params=()
):
    """
    Assert that the named attributes take the same parameters in pandas and Modin.

    Parameters
    ----------
    objects : tuple
        ``(pandas_obj, modin_obj)`` whose attributes are compared.
    attributes : iterable of str
        Attribute names to compare. Attributes whose signature cannot be
        inspected on either object are skipped.
    allowed_different : container of str
        Attribute names that are not compared at all.
    ignored_extra_params : container of str, default: ()
        Parameter names Modin may accept in addition to the pandas ones.

    Raises
    ------
    AssertionError
        If a pandas parameter is absent from Modin or has a different
        default, or if Modin accepts a parameter pandas does not.
    """
    pandas_obj, modin_obj = objects

    missing, extra = [], []
    for name in attributes:
        if name in allowed_different:
            continue
        pandas_sig = _signature_parameters(pandas_obj, name)
        if pandas_sig is None:
            continue
        modin_sig = _signature_parameters(modin_obj, name)
        if modin_sig is None:
            continue
        if pandas_sig == modin_sig:
            continue

        # pandas parameters that Modin lacks or declares with another default;
        # two NaN defaults count as equal although ``nan != nan``.
        missing_params = {
            i: pandas_sig[i]
            for i in pandas_sig
            if i not in modin_sig
            or pandas_sig[i].default != modin_sig[i].default
            and not (
                pandas_sig[i].default is np.nan and modin_sig[i].default is np.nan
            )
        }
        if missing_params:
            missing.append((name, missing_params))

        # parameters accepted by Modin only
        extra_params = {
            i: modin_sig[i]
            for i in modin_sig
            if i not in pandas_sig and i not in ignored_extra_params
        }
        if extra_params:
            extra.append((name, extra_params))

    assert not missing, "Missing params found in API: {}".format(missing)
    assert not extra, "Extra params found in API: {}".format(extra)


# ================================================
# FILE: modin/tests/pandas/test_backend.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.
# The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import platform
import re
from unittest.mock import patch

import pandas
import pytest
import tqdm.auto

import modin.pandas as pd
from modin.config import Backend
from modin.config import context as config_context
from modin.tests.pandas.utils import (
    create_test_dfs,
    default_to_pandas_ignore_string,
    df_equals,
)

WINDOWS_RAY_SKIP_MARK = pytest.mark.skipif(
    platform.system() == "Windows",
    reason=(
        "Some windows tests with engine != ray use 2 cores, but that "
        + "doesn't work with ray due to "
        + "https://github.com/modin-project/modin/issues/7387"
    ),
)

# Some modin methods warn about defaulting to pandas at the API layer. That's
# expected and not an error as it would be normally.
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)

# Parametrizations shared by the backend-switching tests below.
_PARAMETRIZE_SETTER_METHOD = pytest.mark.parametrize(
    "setter_method", ["set_backend", "move_to"]
)
_PARAMETRIZE_INPLACE_KWARGS = pytest.mark.parametrize(
    "inplace_kwargs",
    [
        pytest.param({"inplace": True}, id="inplace"),
        pytest.param({"inplace": False}, id="not_inplace"),
        pytest.param({}, id="no_inplace_kwargs"),
    ],
)
_PARAMETRIZE_GROUPBY_BACKENDS = pytest.mark.parametrize(
    "starting_backend, new_backend",
    [
        pytest.param(Backend.get(), "Pandas", id="current_to_pandas"),
        pytest.param("Pandas", Backend.get(), id="pandas_to_current"),
        pytest.param(Backend.get(), "Python_Test", id="current_to_python"),
        pytest.param("Python_Test", Backend.get(), id="python_to_current"),
        pytest.param("Python_Test", "Pandas", id="python_to_pandas"),
        pytest.param("Pandas", "Python_Test", id="pandas_to_python"),
    ],
)


def test_new_dataframe_uses_default_backend():
    # We run this test with `Backend` set to just one value (instead of
    # trying to look for every possible `Backend` value in the same pytest
    # process) because switching to the MPI backend within a test process
    # that's not set up to run MPI (i.e. because the test process has been
    # started `mpiexec` instead of just `pytest`) would cause errors. We assume
    # that CI runs this test file once with every possible `Backend`.
    assert pd.DataFrame([1]).get_backend() == Backend.get()


@_PARAMETRIZE_SETTER_METHOD
@_PARAMETRIZE_INPLACE_KWARGS
@pytest.mark.parametrize(
    "starting_backend, new_backend, expected_result_backend",
    [
        pytest.param(Backend.get(), "pandas", "Pandas", id="current_to_pandas"),
        pytest.param("pandas", Backend.get(), Backend.get(), id="pandas_to_current"),
        pytest.param(
            Backend.get(), "python_test", "Python_Test", id="current_to_python"
        ),
        pytest.param(
            "python_test", Backend.get(), Backend.get(), id="python_to_current"
        ),
        pytest.param("python_test", "pandas", "Pandas", id="python_to_pandas1"),
        pytest.param("PYTHON_test", "PANDAS", "Pandas", id="python_to_pandas2"),
        pytest.param("pandas", "python_test", "Python_Test", id="pandas_to_python"),
        pytest.param("pandas", "pandas", "Pandas", id="pandas_to_pandas"),
        pytest.param(
            "python_test", "python_test", "Python_Test", id="python_to_python"
        ),
        pytest.param(
            "ray",
            "dask",
            "Dask",
            id="ray_to_dask",
            marks=WINDOWS_RAY_SKIP_MARK,
        ),
        pytest.param(
            "dask",
            "ray",
            "Ray",
            id="dask_to_ray",
            marks=WINDOWS_RAY_SKIP_MARK,
        ),
        pytest.param(
            "ray",
            "python_test",
            "Python_Test",
            id="ray_to_python",
            marks=WINDOWS_RAY_SKIP_MARK,
        ),
        pytest.param("dask", "python_test", "Python_Test", id="dask_to_python"),
        pytest.param(
            "python_test",
            "ray",
            "Ray",
            id="python_to_ray",
            marks=WINDOWS_RAY_SKIP_MARK,
        ),
        pytest.param("python_test", "dask", "Dask", id="python_to_dask"),
        pytest.param("ray", "ray", "Ray", id="ray_to_ray", marks=WINDOWS_RAY_SKIP_MARK),
        pytest.param("dask", "dask", "Dask", id="dask_to_dask"),
    ],
)
@pytest.mark.parametrize(
    "data_class",
    [
        pytest.param(pd.DataFrame, id="dataframe"),
        pytest.param(pd.Series, id="series"),
    ],
)
def test_set_valid_backend(
    setter_method,
    inplace_kwargs,
    starting_backend,
    new_backend,
    data_class,
    expected_result_backend,
):
    """Move an object between two valid backends and check data, backend and progress bar."""
    progress_iter_count = 2
    trange_patch = patch.object(
        tqdm.auto, "trange", return_value=range(progress_iter_count)
    )
    with trange_patch as mock_trange, config_context(Backend=starting_backend):
        original_df = data_class([1])
        # convert to pandas for comparison while still on the `starting_backend`.
        original_df_as_pandas = original_df.modin.to_pandas()
        setter = getattr(original_df, setter_method)
        method_result = setter(new_backend, **inplace_kwargs)

        if inplace_kwargs.get("inplace", False):
            assert method_result is None
            result_df = original_df
        else:
            assert method_result is not None
            result_df = method_result

        assert result_df.get_backend() == expected_result_backend
        df_equals(result_df, original_df_as_pandas)
        # The global Backend should remain the same even if we change the
        # backend for a single dataframe.
        normalized_start = Backend.normalize(starting_backend)
        assert Backend.get() == normalized_start

        if normalized_start != Backend.normalize(expected_result_backend):
            # trange constructor is only called once and the iterator is consumed
            # progress_iter_count times, but we can't easily assert on the number of iterations
            mock_trange.assert_called_once()
        else:
            mock_trange.assert_not_called()


def test_same_backend():
    """Setting the backend an object already uses must not show a progress bar."""
    trange_patch = patch.object(tqdm.auto, "trange", return_value=range(2))
    with trange_patch as mock_trange, config_context(Backend="Python_Test"):
        df = pd.DataFrame([1])

        copied = df.set_backend("Python_Test")
        mock_trange.assert_not_called()
        assert copied.get_backend() == "Python_Test"

        inplace_result = df.set_backend("Python_Test", inplace=True)
        mock_trange.assert_not_called()
        assert inplace_result is None
        assert df.get_backend() == "Python_Test"


def test_set_nonexistent_backend():
    """An unknown backend name is rejected with the list of available backends."""
    backend_choice_string = ", ".join(f"'{choice}'" for choice in Backend.choices)
    expected_message = (
        "Unknown backend 'does_not_exist'. "
        + f"Available backends are: {backend_choice_string}"
    )
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        pd.DataFrame([1]).set_backend("does_not_exist")


@pytest.mark.parametrize("backend", [None, 1, [], {}])
def test_wrong_backend_type(backend):
    """A non-string backend value is rejected with a ``TypeError``."""
    expected_message = (
        "Backend value should be a string, but instead it is "
        + f"{repr(backend)} of type {type(backend)}"
    )
    with pytest.raises(TypeError, match=re.escape(expected_message)):
        pd.DataFrame([1]).set_backend(backend)


def test_get_backend_docstrings():
    """DataFrame and Series ``get_backend`` docstrings differ only in the class name."""
    dataframe_doc = pd.DataFrame.get_backend.__doc__
    series_doc = pd.Series.get_backend.__doc__
    assert dataframe_doc != series_doc
    assert dataframe_doc == series_doc.replace("Series", "DataFrame")


@_PARAMETRIZE_SETTER_METHOD
def test_set_backend_docstrings(setter_method):
    """DataFrame and Series setter docstrings differ only in the class name."""
    dataframe_doc = getattr(pd.DataFrame, setter_method).__doc__
    series_doc = getattr(pd.Series, setter_method).__doc__
    assert dataframe_doc != series_doc
    assert dataframe_doc == series_doc.replace("Series", "DataFrame")


def _two_level_index():
    """Build the ("first", "second") MultiIndex used by the groupby tests."""
    return pd.MultiIndex.from_tuples(
        [
            ("foo", 1),
            ("foo", 2),
            ("bar", 1),
            ("bar", 2),
            ("baz", 1),
            ("baz", 2),
        ],
        names=["first", "second"],
    )


def _check_groupby_backend_switch(
    modin_obj,
    pandas_obj,
    do_groupby,
    setter_method,
    inplace_kwargs,
    starting_backend,
    new_backend,
):
    """
    Switch the backend of ``do_groupby(modin_obj)`` and verify the outcome.

    Must be called while ``config_context(Backend=starting_backend)`` is active.
    """
    inplace = inplace_kwargs.get("inplace", False)
    original_groupby = do_groupby(modin_obj)
    setter_result = getattr(original_groupby, setter_method)(
        new_backend, **inplace_kwargs
    )

    if inplace:
        assert setter_result is None
        result_groupby = original_groupby
        # Verify that the backend of the underlying object was also changed
        assert original_groupby._df.get_backend() == new_backend
    else:
        assert setter_result is not original_groupby
        result_groupby = setter_result
        # Verify that the backend of the original underlying object was not changed
        assert original_groupby._df.get_backend() == starting_backend

    # Verify backend was changed
    assert result_groupby.get_backend() == new_backend

    # Verify that groupby still works correctly after backend switch
    # Create a fresh groupby for comparison to avoid mixed backend states
    pandas_groupby_sum = do_groupby(pandas_obj).sum()
    df_equals(result_groupby.sum(), pandas_groupby_sum)
    if not inplace:
        df_equals(original_groupby.sum(), pandas_groupby_sum)


class TestGroupbySetBackend:
    @_PARAMETRIZE_SETTER_METHOD
    @_PARAMETRIZE_INPLACE_KWARGS
    @_PARAMETRIZE_GROUPBY_BACKENDS
    @pytest.mark.parametrize(
        "by_level_factory",
        [
            pytest.param(lambda df: ("C", None), id="by_string_column"),
            pytest.param(lambda df: (["C", "D"], None), id="by_list_of_strings"),
            pytest.param(lambda df: (df["C"], None), id="by_series"),
            pytest.param(lambda df: (["C", df["D"]], None), id="by_list_mixed"),
            pytest.param(lambda df: (pandas.Grouper(key="C"), None), id="by_grouper"),
            pytest.param(lambda df: (None, 0), id="level_scalar"),
            pytest.param(lambda df: (None, [0, 1]), id="level_list"),
            pytest.param(
                lambda df: (["C", df["D"]], None), id="by_mixed_string_series"
            ),
        ],
    )
    def test_dataframe(
        self,
        setter_method,
        inplace_kwargs,
        starting_backend,
        new_backend,
        by_level_factory,
    ):
        """Test set_backend functionality for DataFrame groupby objects with various 'by' and 'level' combinations."""
        with config_context(Backend=starting_backend):

            def do_groupby(df):
                by, level = by_level_factory(df)
                return df.groupby(by=by, level=level)

            original_modin_df, original_pandas_df = create_test_dfs(
                pandas.DataFrame(
                    data={
                        "A": [1, 2, 3, 4, 5, 6],
                        "B": [10, 20, 30, 40, 50, 60],
                        "C": ["x", "y", "x", "y", "x", "y"],
                        "D": ["p", "p", "q", "q", "r", "r"],
                    },
                    index=_two_level_index(),
                )
            )
            _check_groupby_backend_switch(
                original_modin_df,
                original_pandas_df,
                do_groupby,
                setter_method,
                inplace_kwargs,
                starting_backend,
                new_backend,
            )

    @_PARAMETRIZE_SETTER_METHOD
    @_PARAMETRIZE_INPLACE_KWARGS
    @_PARAMETRIZE_GROUPBY_BACKENDS
    @pytest.mark.parametrize(
        "by_level_factory",
        [
            pytest.param(lambda series: (None, 0), id="by_index_level_0"),
            pytest.param(
                lambda series: (None, [0, 1]),
                id="by_index_levels_list",
            ),
            pytest.param(
                lambda series: (pandas.Grouper(level=0), None),
                id="by_grouper_level",
            ),
            pytest.param(lambda series: (None, 0), id="level_scalar"),
            pytest.param(lambda series: (None, [0, 1]), id="level_list"),
            pytest.param(lambda series: (series, None), id="by_self"),
            pytest.param(lambda series: (series % 2, None), id="by_self_modulo_2"),
        ],
    )
    def test_series(
        self,
        setter_method,
        inplace_kwargs,
        starting_backend,
        new_backend,
        by_level_factory,
    ):
        """Test set_backend functionality for Series groupby objects with various 'by' and 'level' combinations."""
        with config_context(Backend=starting_backend):
            # Create test data with MultiIndex to support level-based grouping
            idx = _two_level_index()
            original_pandas_series = pandas.Series([1, 2, 1, 3, 4, 5], index=idx)
            original_modin_series = pd.Series([1, 2, 1, 3, 4, 5], index=idx)

            def do_groupby(series):
                by, level = by_level_factory(series)
                return series.groupby(by=by, level=level)

            _check_groupby_backend_switch(
                original_modin_series,
                original_pandas_series,
                do_groupby,
                setter_method,
                inplace_kwargs,
                starting_backend,
                new_backend,
            )


# Tests for fallback progress printing when tqdm is not available
@pytest.mark.parametrize(
    "switch_operation,expected_output",
    [
        (
            None,
            "Transfer: Python_... → Pandas | ≃ (3, 1) ",
        ),
        (
            "test_operation",
            "Transfer: Python_... → Pandas | test_operation ≃ (3, 1) ",
        ),
    ],
)
@patch("tqdm.auto.trange", side_effect=ImportError("tqdm not available"))
@config_context(Backend="python_test")
def test_fallback_progress_printing(
    mock_trange, capsys, switch_operation, expected_output
):
    """Test that fallback progress printing works when tqdm is not available and ShowBackendSwitchProgress is enabled."""
    frame = pd.DataFrame([1, 2, 3])
    frame.set_backend("pandas", switch_operation=switch_operation)
    captured = capsys.readouterr()
    assert expected_output in captured.err
    assert captured.out == ""  # Nothing should go to stdout


@config_context(Backend="python_test")
def test_bigger_df_progress_message():
    # Insignificant digits in the size get truncated
    frame = pd.DataFrame([[1] * 144] * 121)
    with patch.object(tqdm.auto, "trange", return_value=range(2)) as mock_trange:
        frame.set_backend("pandas")
        mock_trange.assert_called_once()
        # Second element of ``call_args`` holds the keyword arguments.
        desc = mock_trange.call_args[1]["desc"]
        assert desc.startswith("Transfer: Python_... → Pandas | ≃ (1e+02, 1e+02)")


@patch("tqdm.auto.trange", side_effect=ImportError("tqdm not available"))
@config_context(Backend="python_test")
def test_fallback_progress_printing_silent_when_disabled(mock_trange, capsys):
    """Test that fallback progress printing is silent when ShowBackendSwitchProgress is disabled."""
    frame = pd.DataFrame([1, 2, 3])
    with config_context(ShowBackendSwitchProgress=False):
        frame.set_backend("pandas")
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""


@config_context(Backend="python_test")
def test_tqdm_progress_bar_disabled_when_backend_switch_progress_false(capsys):
    """Test that tqdm progress bar doesn't appear when ShowBackendSwitchProgress is disabled."""
    frame = pd.DataFrame([1, 2, 3])
    with config_context(ShowBackendSwitchProgress=False), patch(
        "tqdm.auto.trange"
    ) as mock_trange:
        frame.set_backend("pandas")
        mock_trange.assert_not_called()
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""


# ================================================
# FILE: modin/tests/pandas/test_concat.py
# ================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions, StorageFormat from modin.pandas.io import from_pandas from modin.utils import get_current_execution from .utils import ( create_test_dfs, default_to_pandas_ignore_string, df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs, ) NPartitions.put(4) pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) # Initialize env for storage format detection in @pytest.mark.* pd.DataFrame() def test_df_concat(): df, df2 = generate_dfs() df_equals(pd.concat([df, df2]), pandas.concat([df, df2])) def test_concat(): df, df2 = generate_dfs() modin_df, modin_df2 = from_pandas(df), from_pandas(df2) df_equals(pd.concat([modin_df, modin_df2]), pandas.concat([df, df2])) def test_concat_with_series(): df, df2 = generate_dfs() modin_df, modin_df2 = from_pandas(df), from_pandas(df2) pandas_series = pandas.Series([1, 2, 3, 4], name="new_col") df_equals( pd.concat([modin_df, modin_df2, pandas_series], axis=0), pandas.concat([df, df2, pandas_series], axis=0), ) df_equals( pd.concat([modin_df, modin_df2, pandas_series], axis=1), pandas.concat([df, df2, pandas_series], axis=1), ) def test_concat_on_index(): df, df2 = generate_dfs() modin_df, modin_df2 = from_pandas(df), from_pandas(df2) df_equals( pd.concat([modin_df, modin_df2], axis="index"), pandas.concat([df, df2], axis="index"), ) df_equals( pd.concat([modin_df, modin_df2], axis="rows"), pandas.concat([df, df2], axis="rows"), ) df_equals( pd.concat([modin_df, modin_df2], axis=0), pandas.concat([df, df2], axis=0) ) @pytest.mark.parametrize("no_dup_cols", [True, False]) @pytest.mark.parametrize("different_len", [True, False]) def test_concat_on_column(no_dup_cols, different_len): df, df2 = generate_dfs() if no_dup_cols: df = df.drop(set(df.columns) & set(df2.columns), axis="columns") if different_len: df = pandas.concat([df, df], ignore_index=True) modin_df, modin_df2 = from_pandas(df), 
from_pandas(df2) df_equals( pd.concat([modin_df, modin_df2], axis=1), pandas.concat([df, df2], axis=1) ) df_equals( pd.concat([modin_df, modin_df2], axis="columns"), pandas.concat([df, df2], axis="columns"), ) modin_result = pd.concat( [pd.Series(np.ones(10)), pd.Series(np.ones(10))], axis=1, ignore_index=True ) pandas_result = pandas.concat( [pandas.Series(np.ones(10)), pandas.Series(np.ones(10))], axis=1, ignore_index=True, ) df_equals(modin_result, pandas_result) assert modin_result.dtypes.equals(pandas_result.dtypes) def test_invalid_axis_errors(): df, df2 = generate_dfs() modin_df, modin_df2 = from_pandas(df), from_pandas(df2) with pytest.raises(ValueError): pd.concat([modin_df, modin_df2], axis=2) def test_mixed_concat(): df, df2 = generate_dfs() df3 = df.copy() mixed_dfs = [from_pandas(df), from_pandas(df2), df3] df_equals(pd.concat(mixed_dfs), pandas.concat([df, df2, df3])) def test_mixed_inner_concat(): df, df2 = generate_dfs() df3 = df.copy() mixed_dfs = [from_pandas(df), from_pandas(df2), df3] df_equals( pd.concat(mixed_dfs, join="inner"), pandas.concat([df, df2, df3], join="inner"), # https://github.com/modin-project/modin/issues/5963 check_dtypes=False, ) def test_mixed_none_concat(): df, df2 = generate_none_dfs() df3 = df.copy() mixed_dfs = [from_pandas(df), from_pandas(df2), df3] df_equals(pd.concat(mixed_dfs), pandas.concat([df, df2, df3])) def test_ignore_index_concat(): df, df2 = generate_dfs() df_equals( pd.concat([df, df2], ignore_index=True), pandas.concat([df, df2], ignore_index=True), ) def test_concat_non_subscriptable_keys(): frame_data = np.random.randint(0, 100, size=(2**10, 2**6)) df = pd.DataFrame(frame_data).add_prefix("col") pdf = pandas.DataFrame(frame_data).add_prefix("col") modin_dict = {"c": df.copy(), "b": df.copy()} pandas_dict = {"c": pdf.copy(), "b": pdf.copy()} modin_result = pd.concat(modin_dict.values(), keys=modin_dict.keys()) pandas_result = pandas.concat(pandas_dict.values(), keys=pandas_dict.keys()) 
df_equals(modin_result, pandas_result) def test_concat_series_only(): modin_series = pd.Series(list(range(1000))) pandas_series = pandas.Series(list(range(1000))) df_equals( pd.concat([modin_series, modin_series]), pandas.concat([pandas_series, pandas_series]), ) def test_concat_5776(): modin_data = {key: pd.Series(index=range(3)) for key in ["a", "b"]} pandas_data = {key: pandas.Series(index=range(3)) for key in ["a", "b"]} df_equals( pd.concat(modin_data, axis="columns"), pandas.concat(pandas_data, axis="columns"), ) def test_concat_6840(): groupby_objs = [] for idx, lib in enumerate((pd, pandas)): df1 = lib.DataFrame( [["a", 1], ["b", 2], ["b", 4]], columns=["letter", "number"] ) df1_g = df1.groupby("letter", as_index=False)["number"].agg("sum") df2 = lib.DataFrame( [["a", 3], ["a", 4], ["b", 1]], columns=["letter", "number"] ) df2_g = df2.groupby("letter", as_index=False)["number"].agg("sum") groupby_objs.append([df1_g, df2_g]) df_equals( pd.concat(groupby_objs[0]), pandas.concat(groupby_objs[1]), ) def test_concat_with_empty_frame(): modin_empty_df = pd.DataFrame() pandas_empty_df = pandas.DataFrame() modin_row = pd.Series({0: "a", 1: "b"}) pandas_row = pandas.Series({0: "a", 1: "b"}) df_equals( pd.concat([modin_empty_df, modin_row]), pandas.concat([pandas_empty_df, pandas_row]), ) md_empty1, pd_empty1 = create_test_dfs(index=[1, 2, 3]) md_empty2, pd_empty2 = create_test_dfs(index=[2, 3, 4]) df_equals( pd.concat([md_empty1, md_empty2], axis=0), pandas.concat([pd_empty1, pd_empty2], axis=0), ) df_equals( pd.concat([md_empty1, md_empty2], axis=1), pandas.concat([pd_empty1, pd_empty2], axis=1), ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("names", [False, True]) def test_concat_multiindex(axis, names): pd_df1, pd_df2 = generate_multiindex_dfs(axis=axis) md_df1, md_df2 = map(from_pandas, [pd_df1, pd_df2]) keys = ["first", "second"] if names: names = [str(i) for i in np.arange(pd_df1.axes[axis].nlevels + 1)] else: names = None df_equals( 
pd.concat([md_df1, md_df2], keys=keys, axis=axis, names=names), pandas.concat([pd_df1, pd_df2], keys=keys, axis=axis, names=names), ) @pytest.mark.parametrize("axis", [0, 1]) def test_concat_dictionary(axis): pandas_df, pandas_df2 = generate_dfs() modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2) df_equals( pd.concat({"A": modin_df, "B": modin_df2}, axis=axis), pandas.concat({"A": pandas_df, "B": pandas_df2}, axis=axis), ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) def test_sort_order(sort, join, axis): pandas_df = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"]) pandas_df2 = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"]) modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2) pandas_concat = pandas.concat([pandas_df, pandas_df2], join=join, sort=sort) modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort) df_equals( pandas_concat, modin_concat, # https://github.com/modin-project/modin/issues/5963 check_dtypes=join != "inner", ) assert list(pandas_concat.columns) == list(modin_concat.columns) @pytest.mark.parametrize( "data1, index1, data2, index2", [ (None, None, None, None), (None, None, {"A": [1, 2, 3]}, pandas.Index([1, 2, 3], name="Test")), ({"A": [1, 2, 3]}, pandas.Index([1, 2, 3], name="Test"), None, None), ({"A": [1, 2, 3]}, None, None, None), (None, None, {"A": [1, 2, 3]}, None), (None, pandas.Index([1, 2, 3], name="Test"), None, None), (None, None, None, pandas.Index([1, 2, 3], name="Test")), ], ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_concat_empty(data1, index1, data2, index2, axis, join): pdf1 = pandas.DataFrame(data1, index=index1) pdf2 = pandas.DataFrame(data2, index=index2) pdf = pandas.concat((pdf1, pdf2), axis=axis, join=join) mdf1 = pd.DataFrame(data1, index=index1) mdf2 = pd.DataFrame(data2, index=index2) mdf = 
pd.concat((mdf1, mdf2), axis=axis, join=join) df_equals( pdf, mdf, # https://github.com/modin-project/modin/issues/5963 check_dtypes=join != "inner", ) def test_concat_empty_df_series(): pdf = pandas.concat((pandas.DataFrame({"A": [1, 2, 3]}), pandas.Series())) mdf = pd.concat((pd.DataFrame({"A": [1, 2, 3]}), pd.Series())) df_equals( pdf, mdf, # https://github.com/modin-project/modin/issues/5964 check_dtypes=False, ) pdf = pandas.concat((pandas.DataFrame(), pandas.Series([1, 2, 3]))) mdf = pd.concat((pd.DataFrame(), pd.Series([1, 2, 3]))) df_equals( pdf, mdf, # https://github.com/modin-project/modin/issues/5964 check_dtypes=False, ) @pytest.mark.skipif( StorageFormat.get() != "Base", reason="https://github.com/modin-project/modin/issues/5696", ) @pytest.mark.parametrize("col_type", [None, "str"]) @pytest.mark.parametrize("df1_cols", [0, 90, 100]) @pytest.mark.parametrize("df2_cols", [0, 90, 100]) @pytest.mark.parametrize("df1_rows", [0, 100]) @pytest.mark.parametrize("df2_rows", [0, 100]) @pytest.mark.parametrize("idx_type", [None, "str"]) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) def test_concat_different_num_cols( col_type, df1_cols, df2_cols, df1_rows, df2_rows, idx_type, ignore_index, sort, join, ): def create_frame(frame_type, ncols, nrows): def to_str(val): return f"str_{val}" off = 0 data = {} for n in range(1, ncols + 1): row = range(off + 1, off + nrows + 1) if col_type == "str": row = map(to_str, row) data[f"Col_{n}"] = list(row) off += nrows idx = None if idx_type == "str": idx = pandas.Index(map(to_str, range(1, nrows + 1)), name=f"Index_{nrows}") df = frame_type(data=data, index=idx) return df def concat(frame_type, lib): df1 = create_frame(frame_type, df1_cols, df1_rows) df2 = create_frame(frame_type, df2_cols, df2_rows) return lib.concat([df1, df2], ignore_index=ignore_index, sort=sort, join=join) mdf = concat(pd.DataFrame, pd) pdf = 
concat(pandas.DataFrame, pandas) df_equals( pdf, mdf, # Empty slicing causes this bug: # https://github.com/modin-project/modin/issues/5974 check_dtypes=not ( get_current_execution() == "BaseOnPython" and any(o == 0 for o in (df1_cols, df2_cols, df1_rows, df2_rows)) ), ) ================================================ FILE: modin/tests/pandas/test_expanding.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import NPartitions from modin.tests.test_utils import ( current_execution_is_native, warns_that_defaulting_to_pandas_if, ) from .utils import ( create_test_dfs, create_test_series, df_equals, eval_general, test_data, test_data_keys, test_data_values, ) NPartitions.put(4) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("min_periods", [None, 5]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "method, kwargs", [ ("count", {}), ("sum", {}), ("mean", {}), ("median", {}), ("skew", {}), ("kurt", {}), ("var", {"ddof": 0}), ("std", {"ddof": 0}), ("min", {}), ("max", {}), ("rank", {}), ("sem", {"ddof": 0}), ("quantile", {"q": 0.1}), ], ) def test_dataframe(data, min_periods, axis, method, kwargs): eval_general( *create_test_dfs(data), lambda df: getattr(df.expanding(min_periods=min_periods, axis=axis), method)( **kwargs ) ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("min_periods", [None, 5]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov(data, min_periods, axis, method): with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): eval_general( *create_test_dfs(data), lambda df: getattr( df.expanding(min_periods=min_periods, axis=axis), method )() ) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): eval_general( mdf, pdf, lambda df, other: getattr(df.expanding(), method)(other=other), other=pdf, md_extra_kwargs={"other": mdf}, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("min_periods", [None, 5]) def test_dataframe_agg(data, min_periods): modin_df = 
pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_expanded = pandas_df.expanding( min_periods=min_periods, axis=0, ) modin_expanded = modin_df.expanding( min_periods=min_periods, axis=0, ) # aggregates are only supported on axis 0 df_equals(modin_expanded.aggregate(np.sum), pandas_expanded.aggregate(np.sum)) df_equals( pandas_expanded.aggregate([np.sum, np.mean]), modin_expanded.aggregate([np.sum, np.mean]), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("min_periods", [None, 5]) @pytest.mark.parametrize( "method, kwargs", [ ("count", {}), ("sum", {}), ("mean", {}), ("median", {}), ("skew", {}), ("kurt", {}), ("corr", {}), ("cov", {}), ("var", {"ddof": 0}), ("std", {"ddof": 0}), ("min", {}), ("max", {}), ("rank", {}), ("sem", {"ddof": 0}), ("quantile", {"q": 0.1}), ], ) def test_series(data, min_periods, method, kwargs): eval_general( *create_test_series(data), lambda df: getattr(df.expanding(min_periods=min_periods), method)(**kwargs) ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("min_periods", [None, 5]) def test_series_agg(data, min_periods): modin_series, pandas_series = create_test_series(data) pandas_expanded = pandas_series.expanding(min_periods=min_periods) modin_expanded = modin_series.expanding(min_periods=min_periods) df_equals(modin_expanded.aggregate(np.sum), pandas_expanded.aggregate(np.sum)) df_equals( pandas_expanded.aggregate([np.sum, np.mean]), modin_expanded.aggregate([np.sum, np.mean]), ) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_series_corr_cov_with_self(method): mdf, pdf = create_test_series(test_data["float_nan_data"]) eval_general( mdf, pdf, lambda df, other: getattr(df.expanding(), method)(other=other), other=pdf, md_extra_kwargs={"other": mdf}, ) ================================================ FILE: modin/tests/pandas/test_general.py ================================================ # Licensed to Modin 
Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy as np import pandas import pytest from numpy.testing import assert_array_equal import modin.pandas as pd from modin.pandas.io import to_pandas from modin.pandas.testing import assert_frame_equal from modin.tests.test_utils import ( current_execution_is_native, df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from modin.utils import get_current_execution from .utils import ( bool_arg_keys, bool_arg_values, create_test_dfs, df_equals, eval_general, is_native_shallow_copy, sort_if_range_partitioning, sort_index_for_equal_values, test_data_keys, test_data_values, ) pytestmark = pytest.mark.filterwarnings( "default:`DataFrame.insert` for empty DataFrame is not currently supported.*:UserWarning" ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("append_na", [True, False]) @pytest.mark.parametrize("op", ["isna", "isnull", "notna", "notnull"]) def test_isna_isnull_notna_notnull(data, append_na, op): pandas_df = pandas.DataFrame(data) modin_df = pd.DataFrame(pandas_df) if append_na: pandas_df["NONE_COL"] = None pandas_df["NAN_COL"] = np.nan modin_df["NONE_COL"] = None modin_df["NAN_COL"] = np.nan pandas_result = getattr(pandas, op)(pandas_df) 
modin_result = getattr(pd, op)(modin_df) df_equals(modin_result, pandas_result) modin_result = getattr(pd, op)(pd.Series([1, np.nan, 2])) pandas_result = getattr(pandas, op)(pandas.Series([1, np.nan, 2])) df_equals(modin_result, pandas_result) assert pd.isna(np.nan) == pandas.isna(np.nan) def test_merge(): frame_data = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6], } modin_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} modin_df2 = pd.DataFrame(frame_data2) pandas_df2 = pandas.DataFrame(frame_data2) join_types = ["outer", "inner"] for how in join_types: with warns_that_defaulting_to_pandas_if( how == "outer" and not df_or_series_using_native_execution(modin_df) ): modin_result = pd.merge(modin_df, modin_df2, how=how) pandas_result = pandas.merge(pandas_df, pandas_df2, how=how) df_equals(modin_result, pandas_result) # left_on and right_index with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): modin_result = pd.merge( modin_df, modin_df2, how=how, left_on="col1", right_index=True ) pandas_result = pandas.merge( pandas_df, pandas_df2, how=how, left_on="col1", right_index=True ) df_equals(modin_result, pandas_result) # left_index and right_on with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): modin_result = pd.merge( modin_df, modin_df2, how=how, left_index=True, right_on="col1" ) pandas_result = pandas.merge( pandas_df, pandas_df2, how=how, left_index=True, right_on="col1" ) df_equals(modin_result, pandas_result) # left_on and right_on col1 with warns_that_defaulting_to_pandas_if( how == "outer" and not df_or_series_using_native_execution(modin_df) ): modin_result = pd.merge( modin_df, modin_df2, how=how, left_on="col1", right_on="col1" ) pandas_result = pandas.merge( pandas_df, pandas_df2, how=how, left_on="col1", right_on="col1" ) df_equals(modin_result, 
pandas_result) # left_on and right_on col2 with warns_that_defaulting_to_pandas_if( how == "outer" and not df_or_series_using_native_execution(modin_df) ): modin_result = pd.merge( modin_df, modin_df2, how=how, left_on="col2", right_on="col2" ) pandas_result = pandas.merge( pandas_df, pandas_df2, how=how, left_on="col2", right_on="col2" ) df_equals(modin_result, pandas_result) # left_index and right_index modin_result = pd.merge( modin_df, modin_df2, how=how, left_index=True, right_index=True ) pandas_result = pandas.merge( pandas_df, pandas_df2, how=how, left_index=True, right_index=True ) df_equals(modin_result, pandas_result) s = pd.Series(frame_data.get("col1")) with pytest.raises(ValueError): pd.merge(s, modin_df2) with pytest.raises(TypeError): pd.merge("Non-valid type", modin_df2) def test_merge_ordered(): data_a = { "key": list("aceace"), "lvalue": [1, 2, 3, 1, 2, 3], "group": list("aaabbb"), } data_b = {"key": list("bcd"), "rvalue": [1, 2, 3]} modin_df_a = pd.DataFrame(data_a) modin_df_b = pd.DataFrame(data_b) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.merge_ordered( modin_df_a, modin_df_b, fill_method="ffill", left_by="group" ) assert isinstance(df, pd.DataFrame) with pytest.raises(TypeError): pd.merge_ordered(data_a, data_b, fill_method="ffill", left_by="group") @pytest.mark.parametrize("right_index", [None, [0] * 5], ids=["default", "non_unique"]) def test_merge_asof(right_index): left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) right = pd.DataFrame( {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}, index=right_index ) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.merge_asof(left, right, on="a") assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.merge_asof(left, right, on="a", allow_exact_matches=False) assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas_if(not 
current_execution_is_native()): df = pd.merge_asof(left, right, on="a", direction="forward") assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.merge_asof(left, right, on="a", direction="nearest") assert isinstance(df, pd.DataFrame) left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.merge_asof(left, right, left_index=True, right_index=True) assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.merge_asof( {"left_val": ["a", "b", "c"]}, {"right_val": [1, 2, 3, 6, 7]}, left_index=True, right_index=True, ) def test_merge_asof_on_variations(): """on=,left_on=,right_on=,right_index=,left_index= options match Pandas.""" left = {"a": [1, 5, 10], "left_val": ["a", "b", "c"]} left_index = [6, 8, 12] right = {"a": [1, 2, 3, 6, 7], "right_val": ["d", "e", "f", "g", "h"]} right_index = [6, 7, 8, 9, 15] pandas_left, pandas_right = ( pandas.DataFrame(left, index=left_index), pandas.DataFrame(right, index=right_index), ) modin_left, modin_right = ( pd.DataFrame(left, index=left_index), pd.DataFrame(right, index=right_index), ) for on_arguments in [ {"on": "a"}, {"left_on": "a", "right_on": "a"}, {"left_on": "a", "right_index": True}, {"left_index": True, "right_on": "a"}, {"left_index": True, "right_index": True}, ]: pandas_merged = pandas.merge_asof(pandas_left, pandas_right, **on_arguments) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_merged = pd.merge_asof(modin_left, modin_right, **on_arguments) df_equals(pandas_merged, modin_merged) def test_merge_asof_suffixes(): """Suffix variations are handled the same as Pandas.""" left = {"a": [1, 5, 10]} right = {"a": [2, 3, 6]} pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right)) modin_left, modin_right = 
pd.DataFrame(left), pd.DataFrame(right) for suffixes in [("a", "b"), (False, "c"), ("d", False)]: pandas_merged = pandas.merge_asof( pandas_left, pandas_right, left_index=True, right_index=True, suffixes=suffixes, ) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_merged = pd.merge_asof( modin_left, modin_right, left_index=True, right_index=True, suffixes=suffixes, ) df_equals(pandas_merged, modin_merged) with pytest.raises(ValueError): pandas.merge_asof( pandas_left, pandas_right, left_index=True, right_index=True, suffixes=(False, False), ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof( modin_left, modin_right, left_index=True, right_index=True, suffixes=(False, False), ) def test_merge_asof_bad_arguments(): left = {"a": [1, 5, 10], "b": [5, 7, 9]} right = {"a": [2, 3, 6], "b": [6, 5, 20]} pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right)) modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right) # Can't mix by with left_by/right_by with pytest.raises(ValueError): pandas.merge_asof( pandas_left, pandas_right, on="a", by="b", left_by="can't do with by" ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof( modin_left, modin_right, on="a", by="b", left_by="can't do with by" ) with pytest.raises(ValueError): pandas.merge_asof( pandas_left, pandas_right, by="b", on="a", right_by="can't do with by" ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof( modin_left, modin_right, by="b", on="a", right_by="can't do with by" ) # Can't mix on with left_on/right_on with pytest.raises(ValueError): pandas.merge_asof(pandas_left, pandas_right, on="a", left_on="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, 
modin_right, on="a", left_on="can't do with by") with pytest.raises(ValueError): pandas.merge_asof( pandas_left, pandas_right, on="a", right_on="can't do with by" ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right, on="a", right_on="can't do with by") # Can't mix left_index with left_on or on, similarly for right. with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right, on="a", right_index=True) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof( modin_left, modin_right, left_on="a", right_on="a", right_index=True ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right, on="a", left_index=True) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof( modin_left, modin_right, left_on="a", right_on="a", left_index=True ) # Need both left and right with pytest.raises(Exception): # Pandas bug, didn't validate inputs sufficiently pandas.merge_asof(pandas_left, pandas_right, left_on="a") with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right, left_on="a") with pytest.raises(Exception): # Pandas bug, didn't validate inputs sufficiently pandas.merge_asof(pandas_left, pandas_right, right_on="a") with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right, right_on="a") with pytest.raises(ValueError): pandas.merge_asof(pandas_left, pandas_right) with pytest.raises(ValueError), warns_that_defaulting_to_pandas_if( not current_execution_is_native() ): pd.merge_asof(modin_left, modin_right) def 
test_merge_asof_merge_options(): modin_quotes = pd.DataFrame( { "time": [ pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.030"), pd.Timestamp("2016-05-25 13:30:00.041"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.049"), pd.Timestamp("2016-05-25 13:30:00.072"), pd.Timestamp("2016-05-25 13:30:00.075"), ], "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], } ) modin_trades = pd.DataFrame( { "time": [ pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.038"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.048"), ], "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], "price": [51.95, 51.95, 720.77, 720.92, 98.0], "quantity": [75, 155, 100, 100, 100], } ) pandas_quotes, pandas_trades = to_pandas(modin_quotes), to_pandas(modin_trades) # left_by + right_by with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", left_by="ticker", right_by="ticker2", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", left_by="ticker", right_by="ticker2", ), modin_result, ) # Just by: pandas_trades["ticker"] = pandas_trades["ticker2"] modin_trades["ticker"] = modin_trades["ticker2"] with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", ), modin_result, ) # Tolerance with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", 
tolerance=pd.Timedelta("2ms"), ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", tolerance=pd.Timedelta("2ms"), ), modin_result, ) # Direction with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", direction="forward", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", direction="forward", ), modin_result, ) # Allow exact matches with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", tolerance=pd.Timedelta("10ms"), allow_exact_matches=False, ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", tolerance=pd.Timedelta("10ms"), allow_exact_matches=False, ), modin_result, ) def test_pivot(): test_df = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } ) df = pd.pivot(test_df, index="foo", columns="bar", values="baz") assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.pivot(test_df["bar"], index="foo", columns="bar", values="baz") if not (get_current_execution() == "BaseOnPython" or current_execution_is_native()): # FIXME: Failed for some reason on 'BaseOnPython' and 'NativeOnNative' # https://github.com/modin-project/modin/issues/6240 df_equals( pd.pivot(test_df, columns="bar"), pandas.pivot(test_df._to_pandas(), columns="bar"), ) df_equals( pd.pivot(test_df, index="foo", columns="bar"), pandas.pivot(test_df._to_pandas(), index="foo", columns="bar"), ) def test_pivot_values_is_none(): test_df = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } ) df = pd.pivot(test_df, index="foo", columns="bar") assert 
isinstance(df, pd.DataFrame) def test_pivot_table(): test_df = pd.DataFrame( { "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], "C": [ "small", "large", "large", "small", "small", "large", "small", "small", "large", ], "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], } ) df = pd.pivot_table( test_df, values="D", index=["A", "B"], columns=["C"], aggfunc=np.sum ) assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.pivot_table( test_df["C"], values="D", index=["A", "B"], columns=["C"], aggfunc=np.sum ) def test_unique(): comparator = lambda *args: sort_if_range_partitioning( # noqa: E731 *args, comparator=assert_array_equal ) modin_result = pd.unique([2, 1, 3, 3]) pandas_result = pandas.unique([2, 1, 3, 3]) comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape modin_result = pd.unique(pd.Series([2] + [1] * 5)) pandas_result = pandas.unique(pandas.Series([2] + [1] * 5)) comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape modin_result = pd.unique( pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]) ) pandas_result = pandas.unique( pandas.Series([pandas.Timestamp("20160101"), pandas.Timestamp("20160101")]) ) comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape modin_result = pd.unique( pd.Series( [ pd.Timestamp("20160101", tz="US/Eastern"), pd.Timestamp("20160101", tz="US/Eastern"), ] ) ) pandas_result = pandas.unique( pandas.Series( [ pandas.Timestamp("20160101", tz="US/Eastern"), pandas.Timestamp("20160101", tz="US/Eastern"), ] ) ) comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape modin_result = pd.unique( pd.Index( [ pd.Timestamp("20160101", tz="US/Eastern"), pd.Timestamp("20160101", tz="US/Eastern"), ] ) ) pandas_result = pandas.unique( pandas.Index( [ pandas.Timestamp("20160101", 
@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
def test_value_counts(normalize, bins, dropna):
    """Compare ``value_counts`` between Modin and pandas for several keyword sets."""
    # We sort indices for Modin and pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])

    def sorted_counts(lib, ascending, **kwargs):
        # Run `value_counts` from the given library and normalize the order of ties.
        counted = lib.value_counts(values, ascending=ascending, **kwargs)
        return sort_index_for_equal_values(counted, ascending)

    # normalize
    from_modin = sorted_counts(pd, False, normalize=normalize)
    from_pandas = sorted_counts(pandas, False, normalize=normalize)
    df_equals(from_modin, from_pandas)

    # bins: only the Modin call is wrapped in the defaulting-to-pandas check
    with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
        from_modin = sorted_counts(pd, False, bins=bins)
    from_pandas = sorted_counts(pandas, False, bins=bins)
    df_equals(from_modin, from_pandas)

    # dropna, ascending order
    from_modin = sorted_counts(pd, True, dropna=dropna)
    from_pandas = sorted_counts(pandas, True, dropna=dropna)
    df_equals(from_modin, from_pandas)
@pytest.mark.parametrize(
    "data, errors, downcast",
    [
        (["1.0", "2", -3], "raise", None),
        (["1.0", "2", -3], "raise", "float"),
        (["1.0", "2", -3], "raise", "signed"),
        (["apple", "1.0", "2", -3], "ignore", None),
        (["apple", "1.0", "2", -3], "coerce", None),
    ],
)
def test_to_numeric(data, errors, downcast):
    """Compare ``to_numeric`` on a Series between Modin and pandas."""
    # Build both inputs first (Modin, then pandas), then convert in the same order.
    modin_input, pandas_input = (lib.Series(data) for lib in (pd, pandas))
    conversion = {"errors": errors, "downcast": downcast}

    converted_by_modin = pd.to_numeric(modin_input, **conversion)
    converted_by_pandas = pandas.to_numeric(pandas_input, **conversion)

    df_equals(converted_by_modin, converted_by_pandas)
@pytest.mark.parametrize(
    "bins, labels",
    [
        pytest.param(
            [-int(1e18), -1000, 0, 1000, 2000, int(1e18)],
            [
                "-inf_to_-1000",
                "-1000_to_0",
                "0_to_1000",
                "1000_to_2000",
                "2000_to_inf",
            ],
            id="bin_list_spanning_entire_range_with_custom_labels",
        ),
        pytest.param(
            [-int(1e18), -1000, 0, 1000, 2000, int(1e18)],
            None,
            id="bin_list_spanning_entire_range_with_default_labels",
        ),
        pytest.param(
            [-1000, 0, 1000, 2000], None, id="bin_list_not_spanning_entire_range"
        ),
        pytest.param(
            10,
            [f"custom_label{i}" for i in range(9)],
            id="int_bin_10_with_custom_labels",
        ),
        pytest.param(1, None, id="int_bin_1_with_default_labels"),
        pytest.param(-1, None, id="int_bin_-1_with_default_labels"),
        pytest.param(111, None, id="int_bin_111_with_default_labels"),
    ],
)
@pytest.mark.parametrize("retbins", bool_arg_values, ids=bool_arg_keys)
def test_cut(retbins, bins, labels):
    """Compare ``cut`` on a 1000-element Series between Modin and pandas."""
    # Would use `eval_general` here, but `eval_general` expects the operation
    # to be supported by Modin, and so errors out when we give the defaulting
    # to pandas UserWarning. We could get around this by using
    # @pytest.mark.filterwarnings("ignore"), but then `eval_general` fails because
    # sometimes the return type of pd.cut is an np.ndarray, and `eval_general` does
    # not know how to handle that.
    cut_kwargs = {"retbins": retbins, "bins": bins, "labels": labels}

    def cut_with_modin():
        # The Modin call is always checked for the defaulting-to-pandas warning.
        with warns_that_defaulting_to_pandas_if(not current_execution_is_native()):
            return pd.cut(pd.Series(range(1000)), **cut_kwargs)

    try:
        pd_result = pandas.cut(pandas.Series(range(1000)), **cut_kwargs)
    except Exception as pd_e:
        # pandas failed: Modin has to fail with a compatible exception type.
        with pytest.raises(Exception) as md_e:
            cut_with_modin()
        assert isinstance(
            md_e.value, type(pd_e)
        ), f"Got Modin Exception type {type(md_e.value)}, but pandas Exception type {type(pd_e)} was expected"
        return

    md_result = cut_with_modin()

    if not isinstance(pd_result, tuple):
        df_equals(md_result, pd_result)
        return

    assert isinstance(
        md_result, tuple
    ), "Modin returned single value, but pandas returned tuple of values"
    for pd_res, md_res in zip(pd_result, md_result):
        # Series parts go through df_equals, anything else is compared as arrays.
        compare = (
            df_equals
            if isinstance(pd_res, pandas.Series)
            else np.testing.assert_array_equal
        )
        compare(pd_res, md_res)
def test_to_pandas_read_only_issue():
    """Check that the frame returned by ``_to_pandas`` can be modified in place."""
    nan = np.nan
    rows = [
        [nan, 2, nan, 0],
        [3, 4, nan, 1],
        [nan, nan, nan, nan],
        [nan, 3, nan, 4],
    ]
    modin_frame = pd.DataFrame(rows, columns=list("ABCD"))
    converted = modin_frame._to_pandas()
    # there shouldn't be `ValueError: putmask: output array is read-only`
    converted.fillna(0, inplace=True)
@pytest.mark.parametrize(
    "arg",
    [[1, 2], ["a"], 1, "a"],
    ids=["list_of_ints", "list_of_invalid_strings", "scalar", "invalid_scalar"],
)
def test_to_timedelta(arg, request):
    """Compare ``to_timedelta`` between Modin and pandas, including failing inputs."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/4966
    # Expected failure per parametrization id; ids that are absent must succeed.
    failure_by_case_id = {
        "list_of_invalid_strings": ValueError(
            "Could not convert 'a' to NumPy timedelta"
        ),
        "invalid_scalar": ValueError("unit abbreviation w/o a number"),
    }
    eval_general(
        pd,
        pandas,
        lambda lib: lib.to_timedelta(arg),
        expected_exception=failure_by_case_id.get(request.node.callspec.id),
    )
@pytest.mark.xfail(
    condition=is_native_shallow_copy(),
    reason="native pandas backend does not deep copy inputs by default",
    strict=True,
)
def test_df_immutability():
    """
    Verify that modifications of the source data don't propagate to Modin's DataFrame objects.
    """
    initial_value, overwritten_value = 1, 100

    src_data = pandas.DataFrame({"a": [initial_value]})
    md_df = pd.DataFrame(src_data)

    # Mutate the pandas source after the Modin frame has been built from it.
    src_data.iloc[0, 0] = overwritten_value

    observed = md_df._to_pandas().iloc[0, 0]
    assert observed == initial_value
See the License for the specific language # governing permissions and limitations under the License. import datetime import itertools from unittest import mock import numpy as np import pandas import pandas._libs.lib as lib import pytest import modin.pandas as pd from modin.config import ( IsRayCluster, NPartitions, RangePartitioning, StorageFormat, context, ) from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) from modin.core.storage_formats.pandas.query_compiler_caster import ( _assert_casting_functions_wrap_same_implementation, ) from modin.pandas.io import from_pandas from modin.pandas.utils import is_scalar from modin.tests.test_utils import ( current_execution_is_native, df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, get_current_execution, hashable, try_cast_to_pandas, ) from .utils import ( assert_set_of_rows_identical, check_df_columns_have_nans, create_test_dfs, create_test_series, default_to_pandas_ignore_string, df_equals, dict_equals, eval_general, generate_multiindex, modin_df_almost_equals_pandas, test_data, test_data_values, test_groupby_data, try_modin_df_almost_equals_compare, value_equals, ) NPartitions.put(4) # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, but some test modules, like this one, # have too many such instances. # TODO(https://github.com/modin-project/modin/issues/3655): catch all instances # of defaulting to pandas. pytestmark = [ pytest.mark.filterwarnings(default_to_pandas_ignore_string), # TO MAKE SURE ALL FUTUREWARNINGS ARE CONSIDERED pytest.mark.filterwarnings("error::FutureWarning"), # ... 
except for this expected Ray warning due to https://github.com/ray-project/ray/issues/54868 pytest.mark.filterwarnings( "ignore:.*In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None:FutureWarning" ), # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT pytest.mark.filterwarnings( "ignore:DataFrame.groupby with axis=1 is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:DataFrameGroupBy.dtypes is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:DataFrameGroupBy.diff with axis=1 is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:DataFrameGroupBy.pct_change with axis=1 is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:The 'fill_method' keyword being not None and the 'limit' keyword " + "in (DataFrame|DataFrameGroupBy).pct_change are deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:DataFrameGroupBy.shift with axis=1 is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:(DataFrameGroupBy|SeriesGroupBy).fillna is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:(DataFrame|Series).fillna with 'method' is deprecated:FutureWarning" ), # FIXME: these cases inconsistent between modin and pandas pytest.mark.filterwarnings( "ignore:A grouping was used that is not in the columns of the DataFrame and so was excluded from the result:FutureWarning" ), pytest.mark.filterwarnings( "ignore:The default of observed=False is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future:FutureWarning" ), pytest.mark.filterwarnings( "ignore:.*DataFrame.idxmax with all-NA values, or any-NA and skipna=False, is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:.*DataFrame.idxmin with all-NA values, or any-NA and skipna=False, is deprecated:FutureWarning" ), pytest.mark.filterwarnings( "ignore:.*In 
def get_external_groupers(df, columns, drop_from_original_df=False, add_plus_one=False):
    """
    Build a ``by`` argument that mixes external groupers with column labels.

    Parameters
    ----------
    df : pandas.DataFrame or modin.pandas.DataFrame
        Frame the external groupers are looked up in.
    columns : list[tuple[bool, str]]
        Columns to group on. For a ``(True, name)`` entry a copy of ``df[name]``
        is used as the grouper, for ``(False, name)`` the label itself is kept.
        For example, ``[(True, "a"), (False, "b")]`` produces the groupers
        ``[<copy of df["a"]>, "b"]``.
    drop_from_original_df : bool, default: False
        Whether to drop the columns used as external groupers from `df`.
    add_plus_one : bool, default: False
        Whether to use ``df[name] + 1`` for external groupers (so they won't be
        considered as sibling with `df`).

    Returns
    -------
    new_df : pandas.DataFrame or modin.pandas.DataFrame
        `df` itself, or, if `drop_from_original_df` is set, a frame without the
        columns that were turned into external groupers.
    by : list
        Groupers to pass to `df.groupby(by)`.
    """
    remaining_df = df
    groupers = []
    for as_external, label in columns:
        if not as_external:
            # Internal grouper: referenced by its label only.
            groupers.append(label)
            continue

        # External grouper: always taken from the original frame, never from
        # the frame that is being stripped of columns.
        grouper = df[label].copy()
        groupers.append(grouper + 1 if add_plus_one else grouper)
        if drop_from_original_df:
            remaining_df = remaining_df.drop(columns=[label])
    return remaining_df, groupers
"col3": modin_groupby = modin_df.set_index(by[0]).groupby( by=by[0], as_index=as_index ) pandas_groupby = pandas_df.set_index(by[0]).groupby( by=by[-1], as_index=as_index ) # difference in behaviour between .groupby().ffill() and # .groupby.fillna(method='ffill') on duplicated indices # caused by https://github.com/pandas-dev/pandas/issues/43412 # is hurting the tests, for now sort the frames md_sorted_grpby = ( modin_df.set_index(by[0]) .sort_index() .groupby(by=by[0], as_index=as_index) ) pd_sorted_grpby = ( pandas_df.set_index(by[0]) .sort_index() .groupby(by=by[0], as_index=as_index) ) else: modin_groupby = modin_df.groupby(by=by[0], as_index=as_index) pandas_groupby = pandas_df.groupby(by=by[-1], as_index=as_index) md_sorted_grpby, pd_sorted_grpby = modin_groupby, pandas_groupby modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) eval_general( md_sorted_grpby, pd_sorted_grpby, lambda df: df.ffill(), comparator=lambda *dfs: df_equals(*sort_if_experimental_groupby(*dfs)), ) # FIXME: https://github.com/modin-project/modin/issues/7032 eval_general( modin_groupby, pandas_groupby, lambda df: df.sem(), modin_df_almost_equals_pandas, expected_exception=False, ) eval_general( modin_groupby, pandas_groupby, lambda df: df.sample(random_state=1) ) eval_general( modin_groupby, pandas_groupby, lambda df: df.ewm(com=0.5).std(), expected_exception=pandas.errors.DataError( "Cannot aggregate non-numeric type: object" ), ) eval_shift( modin_groupby, pandas_groupby, comparator=( # We should sort the result before comparison for transform functions # in case of range-partitioning groupby (https://github.com/modin-project/modin/issues/5924). 
# This test though produces so much NaN values in the result, so it's impossible to sort, # using manual comparison of set of rows instead assert_set_of_rows_identical if RangePartitioning.get() else None ), ) eval_mean(modin_groupby, pandas_groupby, numeric_only=True) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax()) eval_ndim(modin_groupby, pandas_groupby) eval_cumsum(modin_groupby, pandas_groupby, numeric_only=True) eval_general( modin_groupby, pandas_groupby, lambda df: df.pct_change(), modin_df_almost_equals_pandas, # FIXME: https://github.com/modin-project/modin/issues/7032 expected_exception=False, ) eval_cummax(modin_groupby, pandas_groupby, numeric_only=True) # TODO Add more apply functions apply_functions = [lambda df: df.sum(), min] for func in apply_functions: eval_apply(modin_groupby, pandas_groupby, func) eval_dtypes(modin_groupby, pandas_groupby) eval_general( modin_groupby, pandas_groupby, lambda df: df.first(), comparator=lambda *dfs: df_equals(*sort_if_experimental_groupby(*dfs)), ) eval_cummin(modin_groupby, pandas_groupby, numeric_only=True) eval_general( md_sorted_grpby, pd_sorted_grpby, lambda df: df.bfill(), comparator=lambda *dfs: df_equals(*sort_if_experimental_groupby(*dfs)), ) # numeric_only=False doesn't work eval_general( modin_groupby, pandas_groupby, lambda df: df.idxmin(numeric_only=True) ) eval_prod(modin_groupby, pandas_groupby, numeric_only=True) if as_index: eval_std(modin_groupby, pandas_groupby, numeric_only=True) eval_var(modin_groupby, pandas_groupby, numeric_only=True) eval_skew(modin_groupby, pandas_groupby, numeric_only=True) agg_functions = [ lambda df: df.sum(), "min", min, "max", max, sum, {"col2": "sum"}, {"col2": sum}, {"col2": "max", "col4": "sum", "col5": "min"}, {"col2": max, "col4": sum, "col5": "min"}, # Intersection of 'by' and agg cols for TreeReduce impl {"col0": "count", "col1": "count", "col2": "count"}, # 
class GetColumn:
    """
    Indicate to the test that it should do gc(df).

    Instances are placeholders inside a parametrized ``by`` list: they store a
    column label and are resolved into the actual column by calling them with
    the frame under test.

    Parameters
    ----------
    name : hashable
        Label of the column to look up.
    """

    def __init__(self, name):
        self.name = name

    def __call__(self, df):
        """Return ``df[self.name]``."""
        return df[self.name]

    def __repr__(self):
        # Show the wrapped label instead of the default `<... object at 0x...>`
        # so failing parametrized cases are readable in the pytest output.
        return f"{type(self).__name__}({self.name!r})"
operators like min cause TypeErrors to be raised on unordered # categorical columns. We need to specify the categorical column as ordered to bypass this. pandas_df["col1"] = pandas_df["col1"].cat.as_ordered() modin_df = from_pandas(pandas_df) n = 1 def maybe_get_columns(df, by): if isinstance(by, list): return [o(df) if isinstance(o, GetColumn) else o for o in by] else: return by modin_groupby = modin_df.groupby( by=maybe_get_columns(modin_df, by), as_index=as_index ) pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by)) pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index) modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) eval_shift(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill()) if as_index: eval_general(modin_groupby, pandas_groupby, lambda df: df.nth(0)) else: # FIXME: df.groupby(as_index=False).nth() does not produce correct index in Modin, # it should maintain values from df.index, not create a new one or re-order it; # it also produces completely wrong result for multi-column `by` :( if not isinstance(pandas_by, list) or len(pandas_by) <= 1: eval_general( modin_groupby, pandas_groupby, lambda df: df.nth(0).sort_values("col1").reset_index(drop=True), ) expected_exception = None if col1_category: expected_exception = TypeError( "category dtype does not support aggregation 'sem'" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.sem(), modin_df_almost_equals_pandas, expected_exception=expected_exception, ) eval_mean(modin_groupby, pandas_groupby, numeric_only=True) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) # FIXME: https://github.com/modin-project/modin/issues/7033 eval_general( modin_groupby, pandas_groupby, lambda df: df.idxmax(), expected_exception=False ) eval_ndim(modin_groupby, pandas_groupby) if not check_df_columns_have_nans(modin_df, by): # cum* functions produce undefined 
results for columns with NaNs so we run them only when "by" columns contain no NaNs expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support cumsum operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.cumsum(), expected_exception=expected_exception, ) expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support cummax operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.cummax(), expected_exception=expected_exception, ) expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support cummin operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.cummin(), expected_exception=expected_exception, ) expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support cumprod operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.cumprod(), expected_exception=expected_exception, ) expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support cumcount operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.cumcount(), expected_exception=expected_exception, ) eval_general( modin_groupby, pandas_groupby, lambda df: df.pct_change( periods=2, fill_method="bfill", limit=1, freq=None, axis=1 ), modin_df_almost_equals_pandas, ) apply_functions = [ lambda df: df.sum(numeric_only=True), lambda df: pandas.Series([1, 2, 3, 4], name="result"), min, ] for func in apply_functions: eval_apply(modin_groupby, pandas_groupby, func) eval_dtypes(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.first()) eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill()) # FIXME: https://github.com/modin-project/modin/issues/7033 eval_general( modin_groupby, pandas_groupby, lambda df: df.idxmin(), expected_exception=False ) expected_exception 
= None if col1_category: expected_exception = TypeError("category type does not support prod operations") eval_general( modin_groupby, pandas_groupby, lambda grp: grp.prod(), expected_exception=expected_exception, ) if as_index: eval_std(modin_groupby, pandas_groupby, numeric_only=True) eval_var(modin_groupby, pandas_groupby, numeric_only=True) eval_skew(modin_groupby, pandas_groupby, numeric_only=True) agg_functions = [ lambda df: df.sum(), "min", "max", min, sum, # Intersection of 'by' and agg cols for TreeReduce impl {"col1": "count", "col2": "count"}, # Intersection of 'by' and agg cols for FullAxis impl {"col1": "nunique", "col2": "nunique"}, ] for func in agg_functions: # Pandas raises an exception when 'by' contains categorical key and `as_index=False` # because of this bug: https://github.com/pandas-dev/pandas/issues/36698 # Modin correctly processes the result is_pandas_bug_case = not as_index and col1_category and isinstance(func, dict) expected_exception = None if col1_category: # FIXME: https://github.com/modin-project/modin/issues/7033 expected_exception = False if not is_pandas_bug_case: eval_general( modin_groupby, pandas_groupby, lambda grp: grp.agg(func), expected_exception=expected_exception, ) eval_general(modin_groupby, pandas_groupby, lambda df: df.last()) eval_general(modin_groupby, pandas_groupby, lambda df: df.rank()) eval_max(modin_groupby, pandas_groupby) eval_len(modin_groupby, pandas_groupby) expected_exception = None if col1_category: expected_exception = TypeError("category type does not support sum operations") eval_general( modin_groupby, pandas_groupby, lambda df: df.sum(), expected_exception=expected_exception, ) eval_ngroup(modin_groupby, pandas_groupby) # Pandas raising exception when 'by' contains categorical key and `as_index=False` # because of a bug: https://github.com/pandas-dev/pandas/issues/36698 # Modin correctly processes the result if not (col1_category and not as_index): eval_general( modin_groupby, pandas_groupby, 
lambda df: df.nunique(), ) expected_exception = None if col1_category: expected_exception = TypeError( "category dtype does not support aggregation 'median'" ) # TypeError: category type does not support median operations eval_general( modin_groupby, pandas_groupby, lambda df: df.median(), modin_df_almost_equals_pandas, expected_exception=expected_exception, ) eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n)) eval_general( modin_groupby, pandas_groupby, lambda df: df.cov(), modin_df_almost_equals_pandas, ) if not check_df_columns_have_nans(modin_df, by): # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093. transform_functions = [lambda df: df + 4, lambda df: -df - 10] for idx, func in enumerate(transform_functions): expected_exception = None if col1_category: if idx == 0: expected_exception = TypeError( "unsupported operand type(s) for +: 'Categorical' and 'int'" ) elif idx == 1: expected_exception = TypeError( "bad operand type for unary -: 'Categorical'" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.transform(func), expected_exception=expected_exception, ) pipe_functions = [lambda dfgb: dfgb.sum()] for func in pipe_functions: expected_exception = None if col1_category: expected_exception = TypeError( "category type does not support sum operations" ) eval_general( modin_groupby, pandas_groupby, lambda df: df.pipe(func), expected_exception=expected_exception, ) eval_general( modin_groupby, pandas_groupby, lambda df: df.corr(), modin_df_almost_equals_pandas, ) eval_fillna(modin_groupby, pandas_groupby) eval_count(modin_groupby, pandas_groupby) if get_current_execution() != "BaseOnPython" and not current_execution_is_native(): eval_general( modin_groupby, pandas_groupby, lambda df: df.size(), ) eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n)) eval_quantile(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.take([0])) if 
isinstance(by, list) and not any( isinstance(o, (pd.Series, pandas.Series)) for o in by ): # Not yet supported for non-original-column-from-dataframe Series in by: eval___getattr__(modin_groupby, pandas_groupby, "col3") # FIXME: https://github.com/modin-project/modin/issues/7033 eval___getitem__( modin_groupby, pandas_groupby, "col3", expected_exception=False ) eval_groups(modin_groupby, pandas_groupby) # Intersection of the selection and 'by' columns is not yet supported non_by_cols = ( # Potential selection starts only from the second column, because the first may # be categorical in this test, which is not yet supported [col for col in pandas_df.columns[1:] if col not in modin_groupby._internal_by] if isinstance(by, list) else ["col3", "col4"] ) # FIXME: https://github.com/modin-project/modin/issues/7033 eval___getitem__( modin_groupby, pandas_groupby, non_by_cols, expected_exception=False ) # When GroupBy.__getitem__ meets an intersection of the selection and 'by' columns # it throws a warning with the suggested workaround. The following code tests # that this workaround works as expected. if len(modin_groupby._internal_by) != 0: if not isinstance(by, list): by = [by] by_from_workaround = [ ( modin_df[getattr(col, "name", col)].copy() if (hashable(col) and col in modin_groupby._internal_by) or isinstance(col, GetColumn) else col ) for col in by ] # GroupBy result with 'as_index=False' depends on the 'by' origin, since we forcibly changed # the origin of 'by' for modin by doing a copy, set 'as_index=True' to compare results. 
def test_single_group_row_groupby():
    """
    Compare Modin and pandas groupby results when all rows share one key.

    Every row is grouped under the same external label ``"1"``; the checks
    below run the aggregation, cumulative, transform and accessor helpers
    against that groupby pair in a fixed order.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, 36, 7],
            "col3": [3, 8, 12, 10],
            "col4": [17, 3, 16, 15],
            "col5": [-4, 5, -6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)

    # One identical key per row.
    single_key = ["1"] * 4
    # Row count requested from head()/tail(); larger than the 4 rows available.
    n_rows = 6

    md_grp = modin_df.groupby(by=single_key)
    pd_grp = pandas_df.groupby(by=single_key)

    def check(operation, *extra):
        # Forwards to eval_general, passing any extra positional
        # arguments through unchanged.
        eval_general(md_grp, pd_grp, operation, *extra)

    modin_groupby_equals_pandas(md_grp, pd_grp)
    eval_ngroups(md_grp, pd_grp)
    eval_shift(md_grp, pd_grp)
    eval_skew(md_grp, pd_grp)
    check(lambda grp: grp.ffill())
    check(lambda grp: grp.sem(), modin_df_almost_equals_pandas)
    eval_mean(md_grp, pd_grp)
    eval_any(md_grp, pd_grp)
    eval_min(md_grp, pd_grp)
    check(lambda grp: grp.idxmax())
    eval_ndim(md_grp, pd_grp)
    eval_cumsum(md_grp, pd_grp)
    check(lambda grp: grp.pct_change(), modin_df_almost_equals_pandas)
    eval_cummax(md_grp, pd_grp)

    for func in (lambda df: df.sum(), lambda df: -df):
        eval_apply(md_grp, pd_grp, func)

    eval_dtypes(md_grp, pd_grp)
    check(lambda grp: grp.first())
    eval_cummin(md_grp, pd_grp)
    check(lambda grp: grp.bfill())
    check(lambda grp: grp.idxmin())
    eval_prod(md_grp, pd_grp)
    eval_std(md_grp, pd_grp)

    for func in (
        lambda df: df.sum(),
        "min",
        "max",
        max,
        sum,
        {"col2": "sum"},
        {"col2": "max", "col4": "sum", "col5": "min"},
    ):
        eval_agg(md_grp, pd_grp, func)

    check(lambda grp: grp.last())
    eval_rank(md_grp, pd_grp)
    eval_max(md_grp, pd_grp)
    eval_var(md_grp, pd_grp)
    eval_len(md_grp, pd_grp)
    eval_sum(md_grp, pd_grp)
    eval_ngroup(md_grp, pd_grp)
    eval_nunique(md_grp, pd_grp)
    eval_value_counts(md_grp, pd_grp)
    eval_median(md_grp, pd_grp)
    check(lambda grp: grp.head(n_rows))
    eval_cumprod(md_grp, pd_grp)
    check(lambda grp: grp.cov(), modin_df_almost_equals_pandas)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        eval_transform(md_grp, pd_grp, func)

    eval_pipe(md_grp, pd_grp, lambda dfgb: dfgb.sum())

    check(lambda grp: grp.corr(), modin_df_almost_equals_pandas)
    eval_fillna(md_grp, pd_grp)
    eval_count(md_grp, pd_grp)
    eval_size(md_grp, pd_grp)
    check(lambda grp: grp.tail(n_rows))
    eval_quantile(md_grp, pd_grp)
    check(lambda grp: grp.take([0]))
    eval___getattr__(md_grp, pd_grp, "col2")
    eval_groups(md_grp, pd_grp)
eval_ngroups(modin_groupby, pandas_groupby) eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill()) eval_general( modin_groupby, pandas_groupby, lambda df: df.sem(), modin_df_almost_equals_pandas, ) eval_mean(modin_groupby, pandas_groupby) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax()) eval_ndim(modin_groupby, pandas_groupby) eval_cumsum(modin_groupby, pandas_groupby) eval_general( modin_groupby, pandas_groupby, lambda df: df.diff(periods=2), ) eval_general( modin_groupby, pandas_groupby, lambda df: df.diff(periods=-1), ) eval_general( modin_groupby, pandas_groupby, lambda df: df.diff(axis=1), ) eval_general( modin_groupby, pandas_groupby, lambda df: df.pct_change(), modin_df_almost_equals_pandas, ) eval_cummax(modin_groupby, pandas_groupby) apply_functions = [lambda df: df.sum(), lambda df: -df] for func in apply_functions: eval_apply(modin_groupby, pandas_groupby, func) eval_dtypes(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.first()) eval_cummin(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill()) eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin()) # eval_prod(modin_groupby, pandas_groupby) causes overflows eval_std(modin_groupby, pandas_groupby) agg_functions = [ lambda df: df.sum(), "min", "max", min, sum, {"A": "sum"}, {"A": lambda df: df.sum()}, {"A": "max", "B": "sum", "C": "min"}, ] for func in agg_functions: eval_agg(modin_groupby, pandas_groupby, func) eval_general(modin_groupby, pandas_groupby, lambda df: df.last()) eval_rank(modin_groupby, pandas_groupby) eval_max(modin_groupby, pandas_groupby) eval_var(modin_groupby, pandas_groupby) eval_len(modin_groupby, pandas_groupby) eval_sum(modin_groupby, pandas_groupby) eval_ngroup(modin_groupby, pandas_groupby) 
def test_simple_col_groupby():
    """
    Compare Modin and pandas results for a groupby along ``axis=1``.

    The five columns are grouped by the external labels ``[1, 2, 3, 2, 1]``.
    Several helper checks stay disabled for this case; the reason for each
    is noted at the place where it would otherwise run.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 3, 2, 3],
            "col2": [4, 1, 6, 7],
            "col3": [3, 8, 2, 10],
            "col4": [1, 13, 6, 15],
            "col5": [-4, 5, 6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)

    # One label per column.
    column_keys = [1, 2, 3, 2, 1]

    md_grp = modin_df.groupby(axis=1, by=column_keys)
    pd_grp = pandas_df.groupby(axis=1, by=column_keys)

    def check(operation, *extra):
        # Forwards to eval_general, passing any extra positional
        # arguments through unchanged.
        eval_general(md_grp, pd_grp, operation, *extra)

    modin_groupby_equals_pandas(md_grp, pd_grp)
    eval_ngroups(md_grp, pd_grp)
    eval_shift(md_grp, pd_grp)
    eval_skew(md_grp, pd_grp)
    check(lambda grp: grp.ffill())
    check(lambda grp: grp.sem(), modin_df_almost_equals_pandas)
    eval_mean(md_grp, pd_grp)
    eval_any(md_grp, pd_grp)
    eval_min(md_grp, pd_grp)
    eval_ndim(md_grp, pd_grp)
    check(lambda grp: grp.idxmax())
    check(lambda grp: grp.idxmin())
    eval_quantile(md_grp, pd_grp)

    # Cumulative checks are disabled here, see
    # https://github.com/pandas-dev/pandas/issues/21127
    # eval_cumsum(md_grp, pd_grp)
    # eval_cummax(md_grp, pd_grp)
    # eval_cummin(md_grp, pd_grp)
    # eval_cumprod(md_grp, pd_grp)

    check(lambda grp: grp.pct_change(), modin_df_almost_equals_pandas)

    for func in (lambda df: -df, lambda df: df.sum(axis=1)):
        eval_apply(md_grp, pd_grp, func)

    check(lambda grp: grp.first())
    check(lambda grp: grp.bfill())
    eval_prod(md_grp, pd_grp)
    eval_std(md_grp, pd_grp)
    check(lambda grp: grp.last())
    eval_max(md_grp, pd_grp)
    eval_var(md_grp, pd_grp)
    eval_len(md_grp, pd_grp)
    eval_sum(md_grp, pd_grp)

    # Pandas fails on this case with ValueError
    # eval_ngroup(md_grp, pd_grp)
    # eval_nunique(md_grp, pd_grp)
    # NotImplementedError: DataFrameGroupBy.value_counts only handles axis=0
    # eval_value_counts(md_grp, pd_grp)

    eval_median(md_grp, pd_grp)
    check(lambda grp: grp.cov(), modin_df_almost_equals_pandas)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        eval_transform(md_grp, pd_grp, func)

    eval_pipe(md_grp, pd_grp, lambda dfgb: dfgb.sum())

    check(lambda grp: grp.corr(), modin_df_almost_equals_pandas)
    eval_fillna(md_grp, pd_grp)
    eval_count(md_grp, pd_grp)
    eval_size(md_grp, pd_grp)
    check(lambda grp: grp.take([0]))

    # Disabled, see https://github.com/pandas-dev/pandas/issues/54858
    # eval_groups(md_grp, pd_grp)
pandas_groupby, lambda df: df.nsmallest()) eval_general(modin_groupby, pandas_groupby, lambda df: df.unique()) eval_general(modin_groupby, pandas_groupby, lambda df: df.dtype) eval_mean(modin_groupby, pandas_groupby) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax()) eval_ndim(modin_groupby, pandas_groupby) eval_cumsum(modin_groupby, pandas_groupby) eval_general( modin_groupby, pandas_groupby, lambda df: df.pct_change(), modin_df_almost_equals_pandas, ) eval_general( modin_groupby, pandas_groupby, lambda df: df.diff(periods=2), ) eval_general( modin_groupby, pandas_groupby, lambda df: df.diff(periods=-1), ) eval_cummax(modin_groupby, pandas_groupby) apply_functions = [lambda df: df.sum(), min] for func in apply_functions: eval_apply(modin_groupby, pandas_groupby, func) eval_general(modin_groupby, pandas_groupby, lambda df: df.first()) eval_cummin(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill()) eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin()) eval_prod(modin_groupby, pandas_groupby) if as_index: eval_std(modin_groupby, pandas_groupby) eval_var(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) agg_functions = [ lambda df: df.sum(), "min", "max", max, sum, np.mean, ["min", "max"], [np.mean, np.std, np.var, np.max, np.min], ] for func in agg_functions: eval_agg(modin_groupby, pandas_groupby, func) eval_general(modin_groupby, pandas_groupby, lambda df: df.last()) eval_rank(modin_groupby, pandas_groupby) eval_max(modin_groupby, pandas_groupby) eval_len(modin_groupby, pandas_groupby) eval_sum(modin_groupby, pandas_groupby) eval_size(modin_groupby, pandas_groupby) eval_ngroup(modin_groupby, pandas_groupby) eval_nunique(modin_groupby, pandas_groupby) eval_value_counts(modin_groupby, pandas_groupby) eval_median(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, 
def sort_if_experimental_groupby(*dfs):
    """
    Normalize row order of results before they are compared.

    Intended for results of ``groupby.transform``-like operations, because the
    experimental implementation changes the order of rows for them:
    https://github.com/modin-project/modin/issues/5924

    Parameters
    ----------
    *dfs : objects to compare
        Results produced by Modin and/or pandas.

    Returns
    -------
    tuple or list
        The inputs untouched (as the original tuple) when
        ``RangePartitioning.get()`` is falsy; otherwise a list holding the
        inputs after ``try_cast_to_pandas``, each one sorted.
    """
    if not RangePartitioning.get():
        return dfs

    def _sorted(obj):
        if obj.ndim == 1:
            # Series case
            return obj.sort_index()
        # Leave out labels that are also index level names in order to avoid:
        # ValueError: 'col' is both an index level and a column label, which is ambiguous.
        value_columns = obj.columns.difference(obj.index.names, sort=False).tolist()
        return obj.sort_values(value_columns)

    return [_sorted(obj) for obj in try_cast_to_pandas(dfs)]
def eval_cumprod(
    modin_groupby, pandas_groupby, axis=lib.no_default, numeric_only=False
):
    """
    Assert that ``cumprod`` agrees between the Modin and pandas groupby objects.

    The comparison runs twice: first passing only ``numeric_only``, then
    passing ``axis`` as well. Results go through
    ``sort_if_experimental_groupby`` before being handed to ``df_equals``.

    Parameters
    ----------
    modin_groupby, pandas_groupby : groupby objects
        The pair under comparison.
    axis : optional
        Forwarded to ``cumprod`` in the second comparison only.
    numeric_only : bool, default: False
        Forwarded to ``cumprod`` in both comparisons.
    """
    keyword_sets = (
        {"numeric_only": numeric_only},
        {"axis": axis, "numeric_only": numeric_only},
    )
    for kwargs in keyword_sets:
        modin_result = modin_groupby.cumprod(**kwargs)
        pandas_result = pandas_groupby.cumprod(**kwargs)
        df_equals(*sort_if_experimental_groupby(modin_result, pandas_result))
def eval_groups(modin_groupby, pandas_groupby):
    """
    Assert that both groupby objects expose the same groups.

    Checks, in order:

    1. ``.groups`` has the same number of entries on both sides;
    2. every key in Modin's ``.groups`` is present in pandas' ``.groups`` and
       the stored values compare equal via ``.equals``;
    3. unless ``RangePartitioning.get()`` is truthy, ``get_group(name)``
       matches for every pandas group name.

    Parameters
    ----------
    modin_groupby, pandas_groupby : groupby objects
        The pair under comparison.

    Raises
    ------
    AssertionError
        If the number of groups differs or a group's value is not equal.
    """
    modin_groups = modin_groupby.groups
    pandas_groups = pandas_groupby.groups

    # The loop below walks Modin's keys only, so on its own it cannot notice a
    # group that exists in pandas but is missing from Modin. Equal sizes plus
    # "every Modin key resolves in pandas" implies identical key sets.
    assert len(modin_groups) == len(pandas_groups)

    for key, modin_value in modin_groups.items():
        assert modin_value.equals(pandas_groups[key])

    if RangePartitioning.get():
        # `.get_group()` doesn't work correctly with experimental groupby:
        # https://github.com/modin-project/modin/issues/6093
        return

    for name in pandas_groups:
        df_equals(modin_groupby.get_group(name), pandas_groupby.get_group(name))
def test_groupby_on_index_values_with_loop():
    """
    Iterate Modin and pandas groupby objects and compare them group by group.

    Two groupings are exercised on the same frame: by the row index values
    and, with ``axis=1``, by the column labels. The frame is built with
    ``columns=list("aba")``, i.e. with a repeated column label.
    """
    length = 2**6
    data = {
        "a": np.random.randint(0, 100, size=length),
        "b": np.random.randint(0, 100, size=length),
        "c": np.random.randint(0, 100, size=length),
    }
    idx = ["g1" if i % 3 != 0 else "g2" for i in range(length)]
    modin_df = pd.DataFrame(data, index=idx, columns=list("aba"))
    pandas_df = pandas.DataFrame(data, index=idx, columns=list("aba"))

    def assert_same_iteration(modin_groupby_obj, pandas_groupby_obj):
        # Materialize the (key, group) pairs yielded by iteration.
        modin_dict = {k: v for k, v in modin_groupby_obj}
        pandas_dict = {k: v for k, v in pandas_groupby_obj}

        # Without this check an empty or partial Modin iteration would pass
        # vacuously, since only Modin's keys are walked below.
        assert modin_dict.keys() == pandas_dict.keys()

        for k in modin_dict:
            df_equals(modin_dict[k], pandas_dict[k])

    assert_same_iteration(
        modin_df.groupby(modin_df.index), pandas_df.groupby(pandas_df.index)
    )
    assert_same_iteration(
        modin_df.groupby(modin_df.columns, axis=1),
        pandas_df.groupby(pandas_df.columns, axis=1),
    )
@pytest.mark.parametrize(
    "groupby_kwargs",
    [
        pytest.param({"level": 1, "axis": 1}, id="level_idx_axis=1"),
        pytest.param({"level": 1}, id="level_idx"),
        pytest.param({"level": [1, "four"]}, id="level_idx+name"),
        pytest.param({"by": "four"}, id="level_name"),
        pytest.param({"by": ["one", "two"]}, id="level_name_multi_by"),
        pytest.param({"by": ["item0", "one", "two"]}, id="col_name+level_name"),
    ],
)
def test_groupby_multiindex(groupby_kwargs):
    """
    Compare groupby on a frame that carries a three-level ``MultiIndex``.

    The frame gets ``item{i}`` row labels and column tuples
    ``(i // 4, i // 2, i)`` named ``four``/``two``/``one``. Unless the
    parametrization asks for ``axis=1``, both frames are transposed before
    grouping.
    """
    values = np.random.randint(0, 100, size=(2**6, 2**6))
    md_frame = pd.DataFrame(values)
    pd_frame = pandas.DataFrame(values)

    row_labels = pandas.Index([f"item{i}" for i in range(len(pd_frame))])
    column_labels = pandas.MultiIndex.from_tuples(
        [(i // 4, i // 2, i) for i in md_frame.columns],
        names=["four", "two", "one"],
    )
    for frame in (md_frame, pd_frame):
        frame.columns = column_labels
        frame.index = row_labels

    if groupby_kwargs.get("axis", 0) == 0:
        md_frame, pd_frame = md_frame.T, pd_frame.T

    md_grp = md_frame.groupby(**groupby_kwargs)
    pd_grp = pd_frame.groupby(**groupby_kwargs)

    modin_groupby_equals_pandas(md_grp, pd_grp)
    for aggregate in ("sum", "size"):
        df_equals(getattr(md_grp, aggregate)(), getattr(pd_grp, aggregate)())
    # Grouping on level works incorrect in case of aggregation:
    # https://github.com/modin-project/modin/issues/2912
    # df_equals(md_grp.quantile(), pd_grp.quantile())
    df_equals(md_grp.first(), pd_grp.first())
["one", "two"]}, id="level_name_multi_by"), pytest.param( {"by": ["item0", "one", "two"]}, id="col_name+level_name", ), pytest.param( {"by": ["item0"]}, id="col_name", ), pytest.param( {"by": ["item0", "item1"]}, id="col_name_multi_by", ), ], ) def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): modin_df = pd.DataFrame(test_data["float_nan_data"]) pandas_df = pandas.DataFrame(test_data["float_nan_data"]) new_index = pandas.Index([f"item{i}" for i in range(len(pandas_df))]) new_columns = pandas.MultiIndex.from_tuples( [(i // 4, i // 2, i) for i in range(len(modin_df.columns))], names=["four", "two", "one"], ) modin_df.columns = new_columns modin_df.index = new_index pandas_df.columns = new_columns pandas_df.index = new_index if groupby_kwargs.get("axis", 0) == 0: modin_df = modin_df.T pandas_df = pandas_df.T md_grp, pd_grp = ( modin_df.groupby(**groupby_kwargs, dropna=dropna), pandas_df.groupby(**groupby_kwargs, dropna=dropna), ) modin_groupby_equals_pandas(md_grp, pd_grp) by_kwarg = groupby_kwargs.get("by", []) # Disabled because of broken `dropna=False` for TreeReduce implemented aggs: # https://github.com/modin-project/modin/issues/3817 if not ( not dropna and len(by_kwarg) > 1 and any(col in modin_df.columns for col in by_kwarg) ): df_equals(md_grp.sum(), pd_grp.sum()) df_equals(md_grp.size(), pd_grp.size()) # Grouping on level works incorrect in case of aggregation: # https://github.com/modin-project/modin/issues/2912 # "BaseOnPython" tests are disabled because of the bug: # https://github.com/modin-project/modin/issues/3827 if ( get_current_execution() != "BaseOnPython" and not current_execution_is_native() and any(col in modin_df.columns for col in by_kwarg) ): df_equals(md_grp.quantile(), pd_grp.quantile()) # Default-to-pandas tests are disabled for multi-column 'by' because of the bug: # https://github.com/modin-project/modin/issues/3827 if not (not dropna and len(by_kwarg) > 1): df_equals(md_grp.first(), pd_grp.first()) 
df_equals(md_grp._default_to_pandas(lambda df: df.sum()), pd_grp.sum()) @pytest.mark.parametrize("groupby_axis", [lib.no_default, 1]) @pytest.mark.parametrize("shift_axis", [lib.no_default, 1]) @pytest.mark.parametrize("groupby_sort", [True, False]) def test_shift_freq(groupby_axis, shift_axis, groupby_sort): pandas_df = pandas.DataFrame( { "col1": [1, 0, 2, 3], "col2": [4, 5, np.nan, 7], "col3": [np.nan, np.nan, 12, 10], "col4": [17, 13, 16, 15], } ) modin_df = from_pandas(pandas_df) new_index = pandas.date_range("1/12/2020", periods=4, freq="s") if groupby_axis == 0 and shift_axis == 0: pandas_df.index = modin_df.index = new_index by = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]] else: pandas_df.index = modin_df.index = new_index pandas_df.columns = modin_df.columns = new_index by = [[0, 1, 0, 2]] for _by in by: pandas_groupby = pandas_df.groupby(by=_by, axis=groupby_axis, sort=groupby_sort) modin_groupby = modin_df.groupby(by=_by, axis=groupby_axis, sort=groupby_sort) eval_general( modin_groupby, pandas_groupby, lambda groupby: groupby.shift(axis=shift_axis, freq="s"), ) @pytest.mark.parametrize( "by_and_agg_dict", [ { "by": [ list(test_data["int_data"].keys())[0], list(test_data["int_data"].keys())[1], ], "agg_dict": { "max": (list(test_data["int_data"].keys())[2], np.max), "min": (list(test_data["int_data"].keys())[2], np.min), }, }, { "by": ["col1"], "agg_dict": { "max": (list(test_data["int_data"].keys())[0], np.max), "min": (list(test_data["int_data"].keys())[-1], np.min), }, }, { "by": [ list(test_data["int_data"].keys())[0], list(test_data["int_data"].keys())[-1], ], "agg_dict": { "max": (list(test_data["int_data"].keys())[1], max), "min": (list(test_data["int_data"].keys())[-2], min), }, }, pytest.param( { "by": [ list(test_data["int_data"].keys())[0], list(test_data["int_data"].keys())[-1], ], "agg_dict": { "max": (list(test_data["int_data"].keys())[1], max), "min": (list(test_data["int_data"].keys())[-1], min), }, }, marks=pytest.mark.skip("See 
Modin issue #3602"), ), ], ) @pytest.mark.parametrize("as_index", [True, False]) def test_agg_func_None_rename(by_and_agg_dict, as_index): modin_df, pandas_df = create_test_dfs(test_data["int_data"]) modin_result = modin_df.groupby(by_and_agg_dict["by"], as_index=as_index).agg( **by_and_agg_dict["agg_dict"] ) pandas_result = pandas_df.groupby(by_and_agg_dict["by"], as_index=as_index).agg( **by_and_agg_dict["agg_dict"] ) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "as_index", [ True, pytest.param( False, marks=pytest.mark.skipif( get_current_execution() == "BaseOnPython" or RangePartitioning.get() or current_execution_is_native(), reason="See Pandas issue #39103", ), ), ], ) @pytest.mark.parametrize("by_length", [1, 3]) @pytest.mark.parametrize( "agg_fns", [["sum", "min", "max"], ["mean", "quantile"]], ids=["reduce", "aggregation"], ) @pytest.mark.parametrize( "intersection_with_by_cols", [pytest.param(True, marks=pytest.mark.skip("See Modin issue #3602")), False], ) def test_dict_agg_rename_mi_columns( as_index, by_length, agg_fns, intersection_with_by_cols ): md_df, pd_df = create_test_dfs(test_data["int_data"]) mi_columns = generate_multiindex(len(md_df.columns), nlevels=4) md_df.columns, pd_df.columns = mi_columns, mi_columns by = list(md_df.columns[:by_length]) agg_cols = ( list(md_df.columns[by_length - 1 : by_length + 2]) if intersection_with_by_cols else list(md_df.columns[by_length : by_length + 3]) ) agg_dict = { f"custom-{i}" + str(agg_fns[i % len(agg_fns)]): (col, agg_fns[i % len(agg_fns)]) for i, col in enumerate(agg_cols) } md_res = md_df.groupby(by, as_index=as_index).agg(**agg_dict) pd_res = pd_df.groupby(by, as_index=as_index).agg(**agg_dict) df_equals(md_res, pd_res) def test_agg_4604(): data = {"col1": [1, 2], "col2": [3, 4]} modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) # add another partition modin_df["col3"] = modin_df["col1"] pandas_df["col3"] = pandas_df["col1"] # problem only with custom aggregation 
function def col3(x): return np.max(x) by = ["col1"] agg_func = {"col2": ["sum", "min"], "col3": col3} modin_groupby, pandas_groupby = modin_df.groupby(by), pandas_df.groupby(by) eval_agg(modin_groupby, pandas_groupby, agg_func) @pytest.mark.parametrize( "operation", [ "quantile", "mean", "sum", "median", "cumprod", ], ) def test_agg_exceptions(operation): N = 256 fill_data = [ ( "nan_column", [ np.datetime64("2010"), None, np.datetime64("2007"), np.datetime64("2010"), np.datetime64("2006"), np.datetime64("2012"), None, np.datetime64("2011"), ] * (N // 8), ), ( "date_column", [ np.datetime64("2010"), np.datetime64("2011"), np.datetime64("2011-06-15T00:00"), np.datetime64("2009-01-01"), ] * (N // 4), ), ] data1 = { "column_to_by": ["foo", "bar", "baz", "bar"] * (N // 4), # Earlier, the type of this column was `object`. In such a situation, # when performing aggregation on different column partitions, different # exceptions were thrown. The exception that engines return to the main # process was non-deterministic, either `TypeError` or `NotImplementedError`. "nan_column": [np.nan] * N, } data2 = { f"{key}{i}": value for key, value in fill_data for i in range(N // len(fill_data)) } data = {**data1, **data2} def comparator(df1, df2): from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy if GroupBy.is_transformation_kernel(operation): df1, df2 = sort_if_experimental_groupby(df1, df2) df_equals(df1, df2) expected_exception = None if operation == "sum": expected_exception = TypeError( "datetime64 type does not support sum operations" ) elif operation == "cumprod": expected_exception = TypeError( "datetime64 type does not support cumprod operations" ) eval_aggregation( *create_test_dfs(data), operation=operation, comparator=comparator, expected_exception=expected_exception, ) @pytest.mark.skip( "Pandas raises a ValueError on empty dictionary aggregation since 1.2.0" + "It's unclear is that was made on purpose or it is a bug. 
That question" + "was asked in https://github.com/pandas-dev/pandas/issues/39609." + "So until the answer this test is disabled." ) @pytest.mark.parametrize( "kwargs", [ { "Max": ("cnt", np.max), "Sum": ("cnt", np.sum), "Num": ("c", pd.Series.nunique), "Num1": ("c", pandas.Series.nunique), }, { "func": { "Max": ("cnt", np.max), "Sum": ("cnt", np.sum), "Num": ("c", pd.Series.nunique), "Num1": ("c", pandas.Series.nunique), } }, ], ) def test_to_pandas_convertion(kwargs): data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} by = ["a", "b"] eval_aggregation(*create_test_dfs(data), by=by, **kwargs) @pytest.mark.parametrize( # When True, do df[name], otherwise just use name "columns", [ [(False, "a"), (False, "b"), (False, "c")], [(False, "a"), (False, "b")], [(True, "b"), (True, "a"), (True, "c")], [(True, "a"), (True, "b")], [(True, "c"), (False, "a"), (False, "b")], [(False, "a"), (True, "c")], ], ) @pytest.mark.parametrize("drop_from_original_df", [True, False]) @pytest.mark.parametrize("as_index", [True, False]) def test_mixed_columns(columns, drop_from_original_df, as_index): data = { "a": [1, 1, 2, 2] * 64, "b": [11, 11, 22, 22] * 64, "c": [111, 111, 222, 222] * 64, "data": [1, 2, 3, 4] * 64, } md_df, pd_df = create_test_dfs(data) md_df, md_by = get_external_groupers(md_df, columns, drop_from_original_df) pd_df, pd_by = get_external_groupers(pd_df, columns, drop_from_original_df) md_grp = md_df.groupby(md_by, as_index=as_index) pd_grp = pd_df.groupby(pd_by, as_index=as_index) df_equals(md_grp.size(), pd_grp.size()) df_equals(md_grp.sum(), pd_grp.sum()) df_equals( md_grp.apply(lambda df: df.sum(), include_groups=False), pd_grp.apply(lambda df: df.sum(), include_groups=False), ) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_external_grouper_duplicated_names(as_index): data = { "a": [1, 1, 2, 2] * 64, "b": [11, 11, 22, 22] * 64, "c": [111, 111, 222, 222] * 64, "data": [1, 2, 3, 4] * 64, } md_df, pd_df = create_test_dfs(data) md_unnamed_series1, 
pd_unnamed_series1 = create_test_series([1, 1, 2, 2] * 64) md_unnamed_series2, pd_unnamed_series2 = create_test_series([10, 10, 20, 20] * 64) md_grp = md_df.groupby([md_unnamed_series1, md_unnamed_series2], as_index=as_index) pd_grp = pd_df.groupby([pd_unnamed_series1, pd_unnamed_series2], as_index=as_index) df_equals(md_grp.sum(), pd_grp.sum()) md_same_named_series1, pd_same_named_series1 = create_test_series( [1, 1, 2, 2] * 64, name="series_name" ) md_same_named_series2, pd_same_named_series2 = create_test_series( [10, 10, 20, 20] * 64, name="series_name" ) md_grp = md_df.groupby( [md_same_named_series1, md_same_named_series2], as_index=as_index ) pd_grp = pd_df.groupby( [pd_same_named_series1, pd_same_named_series2], as_index=as_index ) df_equals(md_grp.sum(), pd_grp.sum()) @pytest.mark.parametrize( # When True, use (df[name] + 1), otherwise just use name "columns", [ [(True, "a"), (True, "b"), (True, "c")], [(True, "a"), (True, "b")], [(False, "a"), (False, "b"), (True, "c")], [(False, "a"), (True, "c")], [(False, "a"), (True, "c"), (False, [1, 1, 2])], [(False, "a"), (False, "b"), (False, "c")], [(False, "a"), (False, "b"), (False, "c"), (False, [1, 1, 2])], ], ) def test_internal_by_detection(columns): data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]} md_df = pd.DataFrame(data) _, by = get_external_groupers(md_df, columns, add_plus_one=True) md_grp = md_df.groupby(by) ref = frozenset( col for is_lookup, col in columns if not is_lookup and hashable(col) ) exp = frozenset(md_grp._internal_by) assert ref == exp @pytest.mark.parametrize( # When True, use (df[name] + 1), otherwise just use name "columns", [ [(True, "a"), (True, "b"), (True, "c")], [(True, "a"), (True, "b")], [(False, "a"), (False, "b"), (True, "c")], [(False, "a"), (True, "c")], [(False, "a"), (True, "c"), (False, [1, 1, 2])], ], ) @pytest.mark.parametrize("as_index", [True, False]) def test_mixed_columns_not_from_df(columns, as_index): """ Unlike the previous test, in this case the 
Series is not just a column from the original DataFrame, so you can't use a fasttrack. """ data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]} groupby_kw = {"as_index": as_index} md_df, pd_df = create_test_dfs(data) (_, by_md), (_, by_pd) = map( lambda df: get_external_groupers(df, columns, add_plus_one=True), [md_df, pd_df] ) pd_grp = pd_df.groupby(by_pd, **groupby_kw) md_grp = md_df.groupby(by_md, **groupby_kw) modin_groupby_equals_pandas(md_grp, pd_grp) eval_general(md_grp, pd_grp, lambda grp: grp.size()) eval_general( md_grp, pd_grp, lambda grp: grp.apply(lambda df: df.sum(), include_groups=False) ) eval_general(md_grp, pd_grp, lambda grp: grp.first()) @pytest.mark.parametrize( # When True, do df[obj], otherwise just use the obj "columns", [ [(False, "a")], [(False, "a"), (False, "b"), (False, "c")], [(False, "a"), (False, "b")], [(False, "b"), (False, "a")], [(True, "a"), (True, "b"), (True, "c")], [(True, "a"), (True, "b")], [(False, "a"), (False, "b"), (True, "c")], [(False, "a"), (True, "c")], [(False, "a"), (False, pd.Series([5, 6, 7, 8]))], ], ) def test_unknown_groupby(columns): data = {"b": [11, 11, 22, 200], "c": [111, 111, 222, 7000]} modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) with pytest.raises(KeyError): pandas_df.groupby(by=get_external_groupers(pandas_df, columns)[1]) with pytest.raises(KeyError): modin_df.groupby(by=get_external_groupers(modin_df, columns)[1]) @pytest.mark.parametrize( "func_to_apply", [ lambda df: df.sum(), lambda df: df.size(), lambda df: df.quantile(), lambda df: df.dtypes, lambda df: df.apply(lambda df: df.sum()), pytest.param( lambda df: df.apply(lambda df: pandas.Series([1, 2, 3, 4])), marks=pytest.mark.skip("See modin issue #2511"), ), lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: (max, min, sum), list(test_data_values[0].keys())[-2]: (sum, min, max), } ), lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: [ ("new_sum", "sum"), ("new_min", "min"), ], 
list(test_data_values[0].keys())[-2]: np.sum, } ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: [ ("new_sum", "sum"), ("new_mean", "mean"), ], list(test_data_values[0].keys())[-2]: "skew", } ), id="renaming_aggs_at_different_partitions", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: [ ("new_sum", "sum"), ("new_mean", "mean"), ], list(test_data_values[0].keys())[2]: "skew", } ), id="renaming_aggs_at_same_partition", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: "mean", list(test_data_values[0].keys())[-2]: "skew", } ), id="custom_aggs_at_different_partitions", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: "mean", list(test_data_values[0].keys())[2]: "skew", } ), id="custom_aggs_at_same_partition", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: "mean", list(test_data_values[0].keys())[-2]: "sum", } ), id="native_and_custom_aggs_at_different_partitions", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: "mean", list(test_data_values[0].keys())[2]: "sum", } ), id="native_and_custom_aggs_at_same_partition", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: (max, "mean", sum), list(test_data_values[0].keys())[-1]: (sum, "skew", max), } ), id="Agg_and_by_intersection_TreeReduce_implementation", ), pytest.param( lambda grp: grp.agg( { list(test_data_values[0].keys())[1]: (max, "mean", "nunique"), list(test_data_values[0].keys())[-1]: (sum, min, max), } ), id="Agg_and_by_intersection_FullAxis_implementation", ), pytest.param( lambda grp: grp.agg({list(test_data_values[0].keys())[0]: "count"}), id="Agg_and_by_intersection_issue_3376", ), ], ) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("by_length", [1, 2]) @pytest.mark.parametrize( "categorical_by", [pytest.param(True, marks=pytest.mark.skip("See modin issue #2513")), False], ) def 
test_multi_column_groupby_different_partitions( func_to_apply, as_index, by_length, categorical_by, request ): if ( not categorical_by and by_length == 1 and "custom_aggs_at_same_partition" in request.node.name or "renaming_aggs_at_same_partition" in request.node.name ): pytest.xfail( "After upgrade to pandas 2.1 skew results are different: AssertionError: 1.0 >= 0.0001." + " See https://github.com/modin-project/modin/issues/6530 for details." ) data = test_data_values[0] md_df, pd_df = create_test_dfs(data) by = [pd_df.columns[-i if i % 2 else i] for i in range(by_length)] if categorical_by: md_df = md_df.astype({by[0]: "category"}) pd_df = pd_df.astype({by[0]: "category"}) md_grp, pd_grp = ( md_df.groupby(by, as_index=as_index), pd_df.groupby(by, as_index=as_index), ) eval_general( md_grp, pd_grp, func_to_apply, # 'skew' and 'mean' results are not 100% equal to pandas as they use # different formulas and so precision errors come into play. Thus # using a custom comparator that allows slight numeric deviations. 
comparator=try_modin_df_almost_equals_compare, ) # FIXME: https://github.com/modin-project/modin/issues/7034 eval___getitem__(md_grp, pd_grp, md_df.columns[1], expected_exception=False) # FIXME: https://github.com/modin-project/modin/issues/7034 eval___getitem__( md_grp, pd_grp, [md_df.columns[1], md_df.columns[2]], expected_exception=False ) def test_empty_partitions_after_groupby(): def func_to_apply(grp): return grp.agg( { list(test_data_values[0].keys())[1]: "sum", list(test_data_values[0].keys())[-1]: "sum", } ) data = test_data_values[0] md_df, pd_df = create_test_dfs(data) by = pd_df.columns[0] with context(DynamicPartitioning=True): md_grp, pd_grp = ( md_df.groupby(by), pd_df.groupby(by), ) eval_general( md_grp, pd_grp, func_to_apply, ) @pytest.mark.parametrize( "by", [ 0, 1.5, "str", pandas.Timestamp("2020-02-02"), [0, "str"], [pandas.Timestamp("2020-02-02"), 1.5], ], ) @pytest.mark.parametrize("as_index", [True, False]) def test_not_str_by(by, as_index): columns = pandas.Index([0, 1.5, "str", pandas.Timestamp("2020-02-02")]) data = {col: np.arange(5) for col in columns} md_df, pd_df = create_test_dfs(data) md_grp, pd_grp = ( md_df.groupby(by, as_index=as_index), pd_df.groupby(by, as_index=as_index), ) modin_groupby_equals_pandas(md_grp, pd_grp) eval_general(md_grp, pd_grp, lambda grp: grp.sum()) eval_general(md_grp, pd_grp, lambda grp: grp.size()) eval_general(md_grp, pd_grp, lambda grp: grp.agg(lambda df: df.mean())) eval_general(md_grp, pd_grp, lambda grp: grp.dtypes) eval_general(md_grp, pd_grp, lambda grp: grp.first()) @pytest.mark.parametrize("internal_by_length", [0, 1, 2]) @pytest.mark.parametrize("external_by_length", [0, 1, 2]) @pytest.mark.parametrize("has_categorical_by", [True, False]) @pytest.mark.parametrize( "agg_func", [ pytest.param( lambda grp: grp.apply(lambda df: df.dtypes), id="modin_dtypes_impl" ), pytest.param( lambda grp: grp.apply(lambda df: df.sum(numeric_only=True)), id="apply_sum" ), pytest.param(lambda grp: grp.count(), 
id="count"), pytest.param(lambda grp: grp.nunique(), id="nunique"), # Integer key means the index of the column to replace it with. # 0 and -1 are considered to be the indices of the columns to group on. pytest.param({1: "sum", 2: "nunique"}, id="dict_agg_no_intersection_with_by"), pytest.param( {0: "mean", 1: "sum", 2: "nunique"}, id="dict_agg_has_intersection_with_by", ), pytest.param( {1: "sum", 2: "nunique", -1: "nunique"}, id="dict_agg_has_intersection_with_categorical_by", ), ], ) # There are two versions of the `handle_as_index` method: the one accepting pandas.DataFrame from # the execution kernel and backend agnostic. This parameter indicates which one implementation to use. @pytest.mark.parametrize("use_backend_agnostic_method", [True, False]) def test_handle_as_index( internal_by_length, external_by_length, has_categorical_by, agg_func, use_backend_agnostic_method, request, ): """ Test ``modin.core.dataframe.algebra.default2pandas.groupby.GroupBy.handle_as_index``. The role of the ``handle_as_index`` method is to build a groupby result considering ``as_index=False`` from the result that was computed with ``as_index=True``. So the testing flow is the following: 1. Compute GroupBy result with the ``as_index=True`` parameter via Modin. 2. Build ``as_index=False`` result from the ``as_index=True`` using ``handle_as_index`` method. 3. Compute GroupBy result with the ``as_index=False`` parameter via pandas as the reference result. 4. Compare the result from the second step with the reference. 
""" by_length = internal_by_length + external_by_length if by_length == 0: pytest.skip("No keys to group on were passed, skipping the test.") if ( has_categorical_by and by_length > 1 and ( isinstance(agg_func, dict) or ("nunique" in request.node.callspec.id.split("-")) ) ): pytest.skip( "The linked bug makes pandas raise an exception when 'by' is categorical: " + "https://github.com/pandas-dev/pandas/issues/36698" ) df = pandas.DataFrame(test_groupby_data) external_by_cols = GroupBy.validate_by(df.add_prefix("external_")) if has_categorical_by: df = df.astype({df.columns[-1]: "category"}) if isinstance(agg_func, dict): agg_func = {df.columns[key]: value for key, value in agg_func.items()} selection = list(agg_func.keys()) agg_dict = agg_func agg_func = lambda grp: grp.agg(agg_dict) # noqa: E731 (lambda assignment) else: selection = None # Selecting 'by' columns from both sides of the frame so they located in different partitions internal_by = df.columns[ range(-internal_by_length // 2, internal_by_length // 2) ].tolist() external_by = external_by_cols[:external_by_length] pd_by = internal_by + external_by md_by = internal_by + [pd.Series(ser) for ser in external_by] grp_result = pd.DataFrame(df).groupby(md_by, as_index=True) grp_reference = df.groupby(pd_by, as_index=False) agg_result = agg_func(grp_result) agg_reference = agg_func(grp_reference) if use_backend_agnostic_method: reset_index, drop, lvls_to_drop, cols_to_drop = GroupBy.handle_as_index( result_cols=agg_result.columns, result_index_names=agg_result.index.names, internal_by_cols=internal_by, by_cols_dtypes=df[internal_by].dtypes.values, by_length=len(md_by), selection=selection, drop=len(internal_by) != 0, ) if len(lvls_to_drop) > 0: agg_result.index = agg_result.index.droplevel(lvls_to_drop) if len(cols_to_drop) > 0: agg_result = agg_result.drop(columns=cols_to_drop) if reset_index: agg_result = agg_result.reset_index(drop=drop) else: GroupBy.handle_as_index_for_dataframe( result=agg_result, 
internal_by_cols=internal_by, by_cols_dtypes=df[internal_by].dtypes.values, by_length=len(md_by), selection=selection, drop=len(internal_by) != 0, inplace=True, ) df_equals(agg_result, agg_reference) def test_validate_by(): """Test ``modin.core.dataframe.algebra.default2pandas.groupby.GroupBy.validate_by``.""" def compare(obj1, obj2): assert type(obj1) is type( obj2 ), f"Both objects must be instances of the same type: {type(obj1)} != {type(obj2)}." if isinstance(obj1, list): for val1, val2 in itertools.zip_longest(obj1, obj2): df_equals(val1, val2) else: df_equals(obj1, obj2) # This emulates situation when the Series's query compiler being passed as a 'by': # 1. The Series at the QC level is represented as a single-column frame with the `MODIN_UNNAMED_SERIES_LABEL` columns. # 2. The valid representation of such QC is an unnamed Series. reduced_frame = pandas.DataFrame({MODIN_UNNAMED_SERIES_LABEL: [1, 2, 3]}) series_result = GroupBy.validate_by(reduced_frame) series_reference = [pandas.Series([1, 2, 3], name=None)] compare(series_reference, series_result) # This emulates situation when several 'by' columns of the group frame are passed as a single QueryCompiler: # 1. If grouping on several columns the 'by' at the QC level is the following: ``df[by]._query_compiler``. # 2. The valid representation of such QC is a list of Series. 
splited_df = [pandas.Series([1, 2, 3], name=f"col{i}") for i in range(3)] splited_df_result = GroupBy.validate_by( pandas.concat(splited_df, axis=1, copy=True) ) compare(splited_df, splited_df_result) # This emulates situation of mixed by (two column names and an external Series): by = ["col1", "col2", pandas.DataFrame({MODIN_UNNAMED_SERIES_LABEL: [1, 2, 3]})] result_by = GroupBy.validate_by(by) reference_by = ["col1", "col2", pandas.Series([1, 2, 3], name=None)] compare(reference_by, result_by) @pytest.mark.skipif( get_current_execution() == "BaseOnPython" or current_execution_is_native(), reason="The test only make sense for partitioned executions", ) def test_groupby_with_virtual_partitions(): # from https://github.com/modin-project/modin/issues/4464 modin_df, pandas_df = create_test_dfs(test_data["int_data"]) # Concatenate DataFrames here to make virtual partitions. big_modin_df = pd.concat([modin_df for _ in range(5)]) big_pandas_df = pandas.concat([pandas_df for _ in range(5)]) # Check that the constructed Modin DataFrame has virtual partitions when assert issubclass( type(big_modin_df._query_compiler._modin_frame._partitions[0][0]), PandasDataframeAxisPartition, ) eval_general( big_modin_df, big_pandas_df, lambda df: df.groupby(df.columns[0]).count() ) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("is_categorical_by", [True, False]) def test_groupby_sort(sort, is_categorical_by): # from issue #3571 by = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000) random_state = np.random.RandomState(seed=42) random_state.shuffle(by) data = {"key_col": by, "data_col": np.arange(len(by))} md_df, pd_df = create_test_dfs(data) if is_categorical_by: md_df = md_df.astype({"key_col": "category"}) pd_df = pd_df.astype({"key_col": "category"}) md_grp = md_df.groupby("key_col", sort=sort) pd_grp = pd_df.groupby("key_col", sort=sort) modin_groupby_equals_pandas(md_grp, pd_grp) eval_general(md_grp, pd_grp, lambda grp: grp.sum(numeric_only=True)) 
eval_general(md_grp, pd_grp, lambda grp: grp.size()) eval_general(md_grp, pd_grp, lambda grp: grp.agg(lambda df: df.mean())) eval_general(md_grp, pd_grp, lambda grp: grp.dtypes) eval_general(md_grp, pd_grp, lambda grp: grp.first()) def test_groupby_with_frozenlist(): pandas_df = pandas.DataFrame(data={"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) pandas_df = pandas_df.set_index(["a", "b"]) modin_df = from_pandas(pandas_df) eval_general(modin_df, pandas_df, lambda df: df.groupby(df.index.names).count()) @pytest.mark.parametrize( "by_func", [ lambda df: "timestamp0", lambda df: ["timestamp0", "timestamp1"], lambda df: ["timestamp0", df["timestamp1"]], ], ) def test_mean_with_datetime(by_func): data = { "timestamp0": [pd.to_datetime(1490195805, unit="s")], "timestamp1": [pd.to_datetime(1490195805, unit="s")], "numeric": [0], } modin_df, pandas_df = create_test_dfs(data) eval_general(modin_df, pandas_df, lambda df: df.groupby(by=by_func(df)).mean()) def test_groupby_ohlc(): pandas_df = pandas.DataFrame( np.random.randint(0, 100, (50, 2)), columns=["stock A", "stock B"] ) pandas_df["Date"] = pandas.concat( [pandas.date_range("1/1/2000", periods=10, freq="min").to_series()] * 5 ).reset_index(drop=True) modin_df = pd.DataFrame(pandas_df) eval_general(modin_df, pandas_df, lambda df: df.groupby("Date")["stock A"].ohlc()) pandas_multiindex_result = pandas_df.groupby("Date")[["stock A"]].ohlc() with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): modin_multiindex_result = modin_df.groupby("Date")[["stock A"]].ohlc() df_equals(modin_multiindex_result, pandas_multiindex_result) pandas_multiindex_result = pandas_df.groupby("Date")[["stock A", "stock B"]].ohlc() with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_df) ): modin_multiindex_result = modin_df.groupby("Date")[ ["stock A", "stock B"] ].ohlc() df_equals(modin_multiindex_result, pandas_multiindex_result) @pytest.mark.parametrize( 
"modin_df_recipe", ["non_lazy_frame", "frame_with_deferred_index", "lazy_frame"], ) def test_groupby_on_empty_data(modin_df_recipe): class ModinDfConstructor: def __init__(self, recipe, df_kwargs): self._recipe = recipe self._mock_obj = None self._df_kwargs = df_kwargs def non_lazy_frame(self): return pd.DataFrame(**self._df_kwargs) def frame_with_deferred_index(self): df = pd.DataFrame(**self._df_kwargs) try: # The frame would stop being lazy once index computation is triggered df._query_compiler.set_frame_index_cache(None) except AttributeError: pytest.skip( reason="Selected execution doesn't support deferred indices." ) return df def lazy_frame(self): donor_obj = pd.DataFrame()._query_compiler self._mock_obj = mock.patch( f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_shape", new_callable=mock.PropertyMock, ) patch_obj = self._mock_obj.__enter__() patch_obj.return_value = True df = pd.DataFrame(**self._df_kwargs) # The frame is lazy until `self.__exit__()` is called assert df._query_compiler.lazy_shape return df def __enter__(self): return getattr(self, self._recipe)() def __exit__(self, *args, **kwargs): if self._mock_obj is not None: self._mock_obj.__exit__(*args, **kwargs) def run_test(eval_function, *args, **kwargs): df_kwargs = {"columns": ["a", "b", "c"]} with ModinDfConstructor(modin_df_recipe, df_kwargs) as modin_df: pandas_df = pandas.DataFrame(**df_kwargs) modin_grp = modin_df.groupby(modin_df.columns[0]) pandas_grp = pandas_df.groupby(pandas_df.columns[0]) eval_function(modin_grp, pandas_grp, *args, **kwargs) run_test(eval___getattr__, item="b") run_test(eval___getitem__, item="b") run_test(eval_agg, func=lambda df: df.mean()) run_test(eval_any) run_test(eval_apply, func=lambda df: df.mean()) run_test(eval_count) run_test(eval_cummax, numeric_only=True) run_test(eval_cummin, numeric_only=True) run_test(eval_cumprod, numeric_only=True) run_test(eval_cumsum, numeric_only=True) run_test(eval_dtypes) run_test(eval_fillna) 
run_test(eval_groups) run_test(eval_len) run_test(eval_max) run_test(eval_mean) run_test(eval_median) run_test(eval_min) run_test(eval_ndim) run_test(eval_ngroup) run_test(eval_ngroups) run_test(eval_nunique) run_test(eval_prod) run_test(eval_quantile) run_test(eval_rank) run_test(eval_size) run_test(eval_skew) run_test(eval_sum) run_test(eval_var) if modin_df_recipe != "lazy_frame": # TODO: these functions have their specific implementations in the # front-end that are unable to operate on empty frames and thus # fail on an empty lazy frame. # https://github.com/modin-project/modin/issues/5505 # https://github.com/modin-project/modin/issues/5506 run_test(eval_pipe, func=lambda df: df.mean()) run_test(eval_shift) # TODO: these functions fail in case of empty data in the pandas itself, # we have to modify the `eval_*` functions to be able to check for # exceptions equality: # https://github.com/modin-project/modin/issues/5441 # run_test(eval_transform, func=lambda df: df.mean()) # run_test(eval_std) def test_skew_corner_cases(): """ This test was inspired by https://github.com/modin-project/modin/issues/5545. The test verifies that modin acts exactly as pandas when the input data is bad for the 'skew' and so some components of the 'skew' formula appears to be invalid: ``(count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)`` """ # When 'm2 == m3 == 0' thus causing 0 / 0 division in the second multiplier. # Note: mX = 'sum((col - mean(col)) ^ x)' modin_df, pandas_df = create_test_dfs({"col0": [1, 1, 1], "col1": [10, 10, 10]}) eval_general(modin_df, pandas_df, lambda df: df.groupby("col0").skew()) # When 'count < 3' thus causing dividing by zero in the first multiplier # Note: count = group_size modin_df, pandas_df = create_test_dfs({"col0": [1, 1], "col1": [1, 2]}) eval_general(modin_df, pandas_df, lambda df: df.groupby("col0").skew()) # When 'count < 3' and 'm3 / m2 != 0'. 
The case comes from: # https://github.com/modin-project/modin/issues/5545 modin_df, pandas_df = create_test_dfs({"col0": [1, 1], "col1": [171, 137]}) eval_general(modin_df, pandas_df, lambda df: df.groupby("col0").skew()) @pytest.mark.parametrize( "by", [ pandas.Grouper(key="time_stamp", freq="3D"), [pandas.Grouper(key="time_stamp", freq="1ME"), "count"], ], ) def test_groupby_with_grouper(by): # See https://github.com/modin-project/modin/issues/5091 for more details # Generate larger data so that it can handle partitioning cases data = { "id": [i for i in range(200)], "time_stamp": [ pd.Timestamp("2000-01-02") + datetime.timedelta(days=x) for x in range(200) ], } for i in range(200): data[f"count_{i}"] = [i, i + 1] * 100 modin_df, pandas_df = create_test_dfs(data) eval_general( modin_df, pandas_df, lambda df: df.groupby(by).mean(), # FIXME: https://github.com/modin-project/modin/issues/7033 expected_exception=False, ) def test_groupby_preserves_by_order(): modin_df, pandas_df = create_test_dfs({"col0": [1, 1, 1], "col1": [10, 10, 10]}) modin_res = modin_df.groupby([pd.Series([100, 100, 100]), "col0"]).mean() pandas_res = pandas_df.groupby([pandas.Series([100, 100, 100]), "col0"]).mean() df_equals(modin_res, pandas_res) @pytest.mark.parametrize( "method", # test all aggregations from pandas.core.groupby.base.reduction_kernels except # nth and corrwith, both of which require extra arguments. 
[ "all", "any", "count", "first", "idxmax", "idxmin", "last", "max", "mean", "median", "min", "nunique", "prod", "quantile", "sem", "size", "skew", "std", "sum", "var", ], ) @pytest.mark.skipif( StorageFormat.get() != "Pandas", reason="only relevant to pandas execution", ) def test_groupby_agg_with_empty_column_partition_6175(method): df = pd.concat( [ pd.DataFrame({"col33": [0, 1], "index": [2, 3]}), pd.DataFrame({"col34": [4, 5]}), ], axis=1, ) assert df._query_compiler._modin_frame._partitions.shape == (1, 2) eval_general( df, df._to_pandas(), lambda df: getattr(df.groupby(["col33", "index"]), method)(), ) def test_groupby_pct_change_diff_6194(): df = pd.DataFrame( { "by": ["a", "b", "c", "a", "c"], "value": [1, 2, 4, 5, 1], } ) # These methods should not crash eval_general( df, df._to_pandas(), lambda df: df.groupby(by="by").pct_change(), ) eval_general( df, df._to_pandas(), lambda df: df.groupby(by="by").diff(), ) def test_groupby_datetime_diff_6628(): dates = pd.date_range(start="2023-01-01", periods=10, freq="W") df = pd.DataFrame( { "date": dates, "group": "A", } ) eval_general( df, df._to_pandas(), lambda df: df.groupby("group").diff(), ) def eval_rolling(md_window, pd_window): eval_general(md_window, pd_window, lambda window: window.count()) eval_general(md_window, pd_window, lambda window: window.sum()) eval_general(md_window, pd_window, lambda window: window.mean()) eval_general(md_window, pd_window, lambda window: window.median()) eval_general(md_window, pd_window, lambda window: window.var()) eval_general(md_window, pd_window, lambda window: window.std()) eval_general(md_window, pd_window, lambda window: window.min()) eval_general(md_window, pd_window, lambda window: window.max()) expected_exception = None if pd_window.on == "col4": expected_exception = ValueError( "Length mismatch: Expected axis has 450 elements, new values have 600 elements" ) eval_general( md_window, pd_window, lambda window: window.corr(), expected_exception=expected_exception, ) 
eval_general( md_window, pd_window, lambda window: window.cov(), expected_exception=expected_exception, ) eval_general(md_window, pd_window, lambda window: window.skew()) eval_general(md_window, pd_window, lambda window: window.kurt()) eval_general( md_window, pd_window, lambda window: window.apply(lambda df: (df + 10).sum()) ) eval_general(md_window, pd_window, lambda window: window.agg("sum")) eval_general(md_window, pd_window, lambda window: window.quantile(0.2)) eval_general(md_window, pd_window, lambda window: window.rank()) expected_exception = None if pd_window.on == "col4": expected_exception = TypeError( "Addition/subtraction of integers and integer-arrays with DatetimeArray is no longer supported." + " Instead of adding/subtracting `n`, use `n * obj.freq`" ) if not md_window._as_index: # There's a mismatch in group columns when 'as_index=False' # see: https://github.com/modin-project/modin/issues/6291 by_cols = list(md_window._groupby_obj._internal_by) eval_general( md_window, pd_window, lambda window: window.sem().drop(columns=by_cols, errors="ignore"), expected_exception=expected_exception, ) else: eval_general( md_window, pd_window, lambda window: window.sem(), expected_exception=expected_exception, ) @pytest.mark.parametrize("center", [True, False]) @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) @pytest.mark.parametrize("as_index", [True, False]) def test_rolling_int_window(center, closed, as_index): col_part1 = pd.DataFrame( { "by": np.tile(np.arange(15), 10), "col1": np.arange(150), "col2": np.arange(10, 160), } ) col_part2 = pd.DataFrame({"col3": np.arange(20, 170)}) md_df = pd.concat([col_part1, col_part2], axis=1) pd_df = md_df._to_pandas() if StorageFormat.get() == "Pandas": assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2 md_window = md_df.groupby("by", as_index=as_index).rolling( 3, center=center, closed=closed ) pd_window = pd_df.groupby("by", as_index=as_index).rolling( 3, center=center, 
closed=closed ) eval_rolling(md_window, pd_window) @pytest.mark.parametrize("center", [True, False]) @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("on", [None, "col4"]) def test_rolling_timedelta_window(center, closed, as_index, on): col_part1 = pd.DataFrame( { "by": np.tile(np.arange(15), 10), "col1": np.arange(150), "col2": np.arange(10, 160), } ) col_part2 = pd.DataFrame({"col3": np.arange(20, 170)}) if on is not None: col_part2[on] = pandas.DatetimeIndex( [ datetime.date(2020, 1, 1) + datetime.timedelta(hours=12) * i for i in range(150) ] ) md_df = pd.concat([col_part1, col_part2], axis=1) md_df.index = pandas.DatetimeIndex( [datetime.date(2020, 1, 1) + datetime.timedelta(days=1) * i for i in range(150)] ) pd_df = md_df._to_pandas() if StorageFormat.get() == "Pandas": assert ( md_df._query_compiler._modin_frame._partitions.shape[1] == 2 if on is None else 3 ) md_window = md_df.groupby("by", as_index=as_index).rolling( datetime.timedelta(days=3), center=center, closed=closed, on=on ) pd_window = pd_df.groupby("by", as_index=as_index).rolling( datetime.timedelta(days=3), center=center, closed=closed, on=on ) eval_rolling(md_window, pd_window) @pytest.mark.parametrize( "func", [ pytest.param("sum", id="map_reduce_func"), pytest.param("median", id="full_axis_func"), ], ) def test_groupby_deferred_index(func): # the test is copied from the issue: # https://github.com/modin-project/modin/issues/6368 def perform(lib): df1 = lib.DataFrame({"a": [1, 1, 2, 2]}) df2 = lib.DataFrame({"b": [3, 4, 5, 6], "c": [7, 5, 4, 3]}) df = lib.concat([df1, df2], axis=1) df.index = [10, 11, 12, 13] grp = df.groupby("a") grp.indices return getattr(grp, func)() eval_general(pd, pandas, perform) # there are two different implementations of partitions aligning for cluster and non-cluster mode, # here we want to test both of them, so simply modifying the config for this test 
@pytest.mark.parametrize( "modify_config", [ {RangePartitioning: True, IsRayCluster: True}, {RangePartitioning: True, IsRayCluster: False}, ], indirect=True, ) def test_shape_changing_udf(modify_config): modin_df, pandas_df = create_test_dfs( { "by_col1": ([1] * 50) + ([10] * 50), "col2": np.arange(100), "col3": np.arange(100), } ) def func1(group): # changes the original shape and indexing of the 'group' return pandas.Series( [1, 2, 3, 4], index=["new_col1", "new_col2", "new_col4", "new_col3"] ) eval_general( modin_df.groupby("by_col1"), pandas_df.groupby("by_col1"), lambda df: df.apply(func1), ) def func2(group): # each group have different shape at the end # (we do .to_frame().T as otherwise this scenario doesn't work in pandas) if group.iloc[0, 0] == 1: return ( pandas.Series( [1, 2, 3, 4], index=["new_col1", "new_col2", "new_col4", "new_col3"] ) .to_frame() .T ) return ( pandas.Series([20, 33, 44], index=["new_col2", "new_col3", "new_col4"]) .to_frame() .T ) eval_general( modin_df.groupby("by_col1"), pandas_df.groupby("by_col1"), lambda df: df.apply(func2), ) def func3(group): # one of the groups produce an empty dataframe, in the result we should # have joined columns of both of these dataframes if group.iloc[0, 0] == 1: return pandas.DataFrame([[1, 2, 3]], index=["col1", "col2", "col3"]) return pandas.DataFrame(columns=["col2", "col3", "col4", "col5"]) eval_general( modin_df.groupby("by_col1"), pandas_df.groupby("by_col1"), lambda df: df.apply(func3), ) @pytest.mark.parametrize("modify_config", [{RangePartitioning: True}], indirect=True) def test_reshuffling_groupby_on_strings(modify_config): # reproducer from https://github.com/modin-project/modin/issues/6509 modin_df, pandas_df = create_test_dfs( {"col1": ["a"] * 50 + ["b"] * 50, "col2": range(100)} ) modin_df = modin_df.astype({"col1": "string"}) pandas_df = pandas_df.astype({"col1": "string"}) md_grp = modin_df.groupby("col1") pd_grp = pandas_df.groupby("col1") eval_general(md_grp, pd_grp, lambda grp: 
grp.mean()) eval_general(md_grp, pd_grp, lambda grp: grp.nth(2)) eval_general(md_grp, pd_grp, lambda grp: grp.head(10)) eval_general(md_grp, pd_grp, lambda grp: grp.tail(10)) @pytest.mark.parametrize("modify_config", [{RangePartitioning: True}], indirect=True) def test_groupby_apply_series_result(modify_config): # reproducer from the issue: # https://github.com/modin-project/modin/issues/6632 df = pd.DataFrame( np.random.randint(5, 10, size=5), index=[f"s{i+1}" for i in range(5)] ) df["group"] = [1, 1, 2, 2, 3] eval_general( df, df._to_pandas(), lambda df: df.groupby("group").apply( lambda x: x.name + 2, include_groups=False ), ) def test_groupby_named_aggregation(): modin_ser, pandas_ser = create_test_series([10, 10, 10, 1, 1, 1, 2, 3], name="data") eval_general( modin_ser, pandas_ser, lambda ser: ser.groupby(level=0).agg(result=("max")) ) def test_groupby_several_column_partitions(): # see details in #6948 columns = [ "l_returnflag", "l_linestatus", "l_discount", "l_extendedprice", "l_quantity", ] modin_df, pandas_df = create_test_dfs( np.random.randint(0, 100, size=(1000, len(columns))), columns=columns ) pandas_df["a"] = (pandas_df.l_extendedprice) * (1 - (pandas_df.l_discount)) # to create another column partition modin_df["a"] = (modin_df.l_extendedprice) * (1 - (modin_df.l_discount)) eval_general( modin_df, pandas_df, lambda df: df.groupby(["l_returnflag", "l_linestatus"]) .agg( sum_qty=("l_quantity", "sum"), sum_base_price=("l_extendedprice", "sum"), sum_disc_price=("a", "sum"), # sum_charge=("b", "sum"), avg_qty=("l_quantity", "mean"), avg_price=("l_extendedprice", "mean"), avg_disc=("l_discount", "mean"), count_order=("l_returnflag", "count"), ) .reset_index(), ) def test_groupby_named_agg(): # from pandas docs data = { "A": [1, 1, 2, 2], "B": [1, 2, 3, 4], "C": [0.362838, 0.227877, 1.267767, -0.562860], } modin_df, pandas_df = create_test_dfs(data) eval_general( modin_df, pandas_df, lambda df: df.groupby("A").agg( b_min=pd.NamedAgg(column="B", 
aggfunc="min"), c_sum=pd.NamedAgg(column="C", aggfunc="sum"), ), ) ### TEST GROUPBY WARNINGS ### def test_groupby_axis_1_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) with pytest.warns( FutureWarning, match="DataFrame.groupby with axis=1 is deprecated" ): modin_df.groupby(by="col1", axis=1) with pytest.warns( FutureWarning, match="DataFrame.groupby with axis=1 is deprecated" ): pandas_df.groupby(by="col1", axis=1) def test_groupby_dtypes_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") with pytest.warns(FutureWarning, match="DataFrameGroupBy.dtypes is deprecated"): modin_groupby.dtypes with pytest.warns(FutureWarning, match="DataFrameGroupBy.dtypes is deprecated"): pandas_groupby.dtypes def test_groupby_diff_axis_1_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") with pytest.warns( FutureWarning, match="DataFrameGroupBy.diff with axis=1 is deprecated" ): modin_groupby.diff(axis=1) with pytest.warns( FutureWarning, match="DataFrameGroupBy.diff with axis=1 is deprecated" ): pandas_groupby.diff(axis=1) def test_groupby_pct_change_axis_1_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") with pytest.warns( FutureWarning, match="DataFrameGroupBy.pct_change with axis=1 is deprecated" ): modin_groupby.pct_change(axis=1) with pytest.warns( FutureWarning, match="DataFrameGroupBy.pct_change with axis=1 is deprecated" ): pandas_groupby.pct_change(axis=1) def test_groupby_pct_change_parameters_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, 
pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") match_string = ( "The 'fill_method' keyword being not None and the 'limit' keyword " + "in (DataFrame|DataFrameGroupBy).pct_change are deprecated" ) with pytest.warns( FutureWarning, match=match_string, ): modin_groupby.pct_change(fill_method="bfill", limit=1) with pytest.warns( FutureWarning, match=match_string, ): pandas_groupby.pct_change(fill_method="bfill", limit=1) def test_groupby_shift_axis_1_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") with pytest.warns( FutureWarning, match="DataFrameGroupBy.shift with axis=1 is deprecated", ): pandas_groupby.shift(axis=1, fill_value=777) with pytest.warns( FutureWarning, match="DataFrameGroupBy.shift with axis=1 is deprecated", ): modin_groupby.shift(axis=1, fill_value=777) def test_groupby_fillna_axis_1_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, None, 6, None], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") with pytest.warns( FutureWarning, match="DataFrameGroupBy.fillna is deprecated", ): modin_groupby.fillna(method="ffill") with pytest.warns( FutureWarning, match="DataFrameGroupBy.fillna is deprecated", ): pandas_groupby.fillna(method="ffill") def test_groupby_agg_provided_callable_warning(): data = { "col1": [0, 3, 2, 3], "col2": [4, 1, 6, 7], } modin_df, pandas_df = create_test_dfs(data) modin_groupby = modin_df.groupby(by="col1") pandas_groupby = pandas_df.groupby(by="col1") for func in (sum, max): with pytest.warns( FutureWarning, match="In a future version of pandas, the provided callable will be used directly", ): modin_groupby.agg(func) with pytest.warns( FutureWarning, match="In a future version of pandas, the provided callable will be 
used directly", ): pandas_groupby.agg(func) @pytest.mark.parametrize("modify_config", [{RangePartitioning: True}], indirect=True) @pytest.mark.parametrize("observed", [False]) @pytest.mark.parametrize("as_index", [True]) @pytest.mark.parametrize( "func", [ pytest.param(lambda grp: grp.sum(), id="sum"), pytest.param(lambda grp: grp.size(), id="size"), pytest.param(lambda grp: grp.apply(lambda df: df.sum()), id="apply_sum"), pytest.param( lambda grp: grp.apply( lambda df: ( df.sum() if len(df) > 0 else pandas.Series([10] * len(df.columns), index=df.columns) ) ), id="apply_transform", ), ], ) @pytest.mark.parametrize( "by_cols, cat_cols", [ ("a", ["a"]), ("b", ["b"]), ("e", ["e"]), (["a", "e"], ["a"]), (["a", "e"], ["e"]), (["a", "e"], ["a", "e"]), (["b", "e"], ["b"]), (["b", "e"], ["e"]), (["b", "e"], ["b", "e"]), (["a", "b", "e"], ["a"]), (["a", "b", "e"], ["b"]), (["a", "b", "e"], ["e"]), (["a", "b", "e"], ["a", "e"]), (["a", "b", "e"], ["a", "b", "e"]), ], ) @pytest.mark.parametrize( "exclude_values", [ pytest.param(lambda row: ~row["a"].isin(["a", "e"]), id="exclude_from_a"), pytest.param(lambda row: ~row["b"].isin([4]), id="exclude_from_b"), pytest.param(lambda row: ~row["e"].isin(["x"]), id="exclude_from_e"), pytest.param( lambda row: ~row["a"].isin(["a", "e"]) & ~row["b"].isin([4]), id="exclude_from_a_b", ), pytest.param( lambda row: ~row["b"].isin([4]) & ~row["e"].isin(["x"]), id="exclude_from_b_e", ), pytest.param( lambda row: ~row["a"].isin(["a", "e"]) & ~row["b"].isin([4]) & ~row["e"].isin(["x"]), id="exclude_from_a_b_e", ), ], ) def test_range_groupby_categories( observed, func, by_cols, cat_cols, exclude_values, as_index, modify_config ): data = { "a": ["a", "b", "c", "d", "e", "b", "g", "a"] * 32, "b": [1, 2, 3, 4] * 64, "c": range(256), "d": range(256), "e": ["x", "y"] * 128, } md_df, pd_df = create_test_dfs(data) md_df = md_df.astype({col: "category" for col in cat_cols})[exclude_values] pd_df = pd_df.astype({col: "category" for col in 
cat_cols})[exclude_values] md_res = func(md_df.groupby(by_cols, observed=observed, as_index=as_index)) pd_res = func(pd_df.groupby(by_cols, observed=observed, as_index=as_index)) # HACK, FIXME: there's a bug in range-partitioning impl that apparently can # break the order of rows in the result for multi-column groupbys. Placing the sorting-hack for now # https://github.com/modin-project/modin/issues/6875 df_equals(md_res.sort_index(axis=0), pd_res.sort_index(axis=0)) @pytest.mark.parametrize("cat_cols", [["a"], ["b"], ["a", "b"]]) @pytest.mark.parametrize( "columns", [[(False, "a"), (True, "b")], [(True, "a")], [(True, "a"), (True, "b")]] ) def test_range_groupby_categories_external_grouper(columns, cat_cols): data = { "a": [1, 1, 2, 2] * 64, "b": [11, 11, 22, 22] * 64, "c": [111, 111, 222, 222] * 64, "data": [1, 2, 3, 4] * 64, } md_df, pd_df = create_test_dfs(data) md_df = md_df.astype({col: "category" for col in cat_cols}) pd_df = pd_df.astype({col: "category" for col in cat_cols}) md_df, md_by = get_external_groupers(md_df, columns, drop_from_original_df=True) pd_df, pd_by = get_external_groupers(pd_df, columns, drop_from_original_df=True) eval_general(md_df.groupby(md_by), pd_df.groupby(pd_by), lambda grp: grp.count()) @pytest.mark.parametrize("by", [["a"], ["a", "b"]]) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("include_groups", [True, False]) def test_include_groups(by, as_index, include_groups): data = { "a": [1, 1, 2, 2] * 64, "b": [11, 11, 22, 22] * 64, "c": [111, 111, 222, 222] * 64, "data": [1, 2, 3, 4] * 64, } def func(df): if include_groups: assert len(df.columns.intersection(by)) == len(by) else: assert len(df.columns.intersection(by)) == 0 return df.sum() md_df, pd_df = create_test_dfs(data) eval_general( md_df, pd_df, lambda df: df.groupby(by, as_index=as_index).apply( func, include_groups=include_groups ), ) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("how", ["first", "last"]) def 
test_first_last_skipna(how, skipna): md_df, pd_df = create_test_dfs( { "a": [2, 1, 1, 2, 3, 3] * 20, "b": [np.nan, 3.0, np.nan, 4.0, np.nan, np.nan] * 20, "c": [np.nan, 3.0, np.nan, 4.0, np.nan, np.nan] * 20, } ) pd_res = getattr(pd_df.groupby("a"), how)(skipna=skipna) md_res = getattr(md_df.groupby("a"), how)(skipna=skipna) df_equals(md_res, pd_res) ================================================ FILE: modin/tests/pandas/test_io.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import csv import inspect import os import platform import sys import unittest.mock as mock from collections import defaultdict from io import BytesIO, StringIO from pathlib import Path from typing import Dict import fastparquet import numpy as np import pandas import pandas._libs.lib as lib import pyarrow as pa import pyarrow.dataset import pytest import sqlalchemy as sa from packaging import version from pandas._testing import ensure_clean from pandas.errors import ParserWarning from scipy import sparse from modin.config import ( AsyncReadMode, Engine, IsExperimental, MinRowPartitionSize, ReadSqlEngine, StorageFormat, TestDatasetSize, TestReadFromPostgres, TestReadFromSqlServer, ) from modin.db_conn import ModinDatabaseConnection, UnsupportedDatabaseException from modin.pandas.io import from_arrow, from_dask, from_map, from_ray, to_pandas from modin.tests.test_utils import ( current_execution_is_native, warns_that_defaulting_to_pandas_if, ) from .utils import ( check_file_leaks, create_test_dfs, create_test_series, default_to_pandas_ignore_string, df_equals, dummy_decorator, eval_general, eval_io, eval_io_from_str, generate_dataframe, get_unique_filename, json_long_bytes, json_long_string, json_short_bytes, json_short_string, parse_dates_values_by_id, ) from .utils import test_data as utils_test_data from .utils import ( time_parsing_csv_path, ) if StorageFormat.get() == "Pandas": import modin.pandas as pd else: import modin.experimental.pandas as pd try: import ray EXCEPTIONS = (ray.exceptions.WorkerCrashedError,) except ImportError: EXCEPTIONS = () from modin.config import NPartitions NPartitions.put(4) DATASET_SIZE_DICT = { "Small": 64, "Normal": 2000, "Big": 20000, } # Number of rows in the test file NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Small"]) TEST_DATA = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 10, 11], "col4": [12, 13, 14, 15], "col5": [0, 0, 0, 0], } def assert_files_eq(path1, path2): with 
open(path1, "rb") as file1, open(path2, "rb") as file2: file1_content = file1.read() file2_content = file2.read() if file1_content == file2_content: return True else: return False def setup_clipboard(row_size=NROWS): df = pandas.DataFrame({"col1": np.arange(row_size), "col2": np.arange(row_size)}) df.to_clipboard() def parquet_eval_to_file(tmp_dir, modin_obj, pandas_obj, fn, extension, **fn_kwargs): """ Helper function to test `to_parquet` method. Parameters ---------- tmp_dir : Union[str, Path] Temporary directory. modin_obj : pd.DataFrame A Modin DataFrame or a Series to test `to_parquet` method. pandas_obj: pandas.DataFrame A pandas DataFrame or a Series to test `to_parquet` method. fn : str Name of the method, that should be tested. extension : str Extension of the test file. """ unique_filename_modin = get_unique_filename(extension=extension, data_dir=tmp_dir) unique_filename_pandas = get_unique_filename(extension=extension, data_dir=tmp_dir) engine = fn_kwargs.get("engine", "auto") getattr(modin_obj, fn)(unique_filename_modin, **fn_kwargs) getattr(pandas_obj, fn)(unique_filename_pandas, **fn_kwargs) pandas_df = pandas.read_parquet(unique_filename_pandas, engine=engine) modin_df = pd.read_parquet(unique_filename_modin, engine=engine) df_equals(pandas_df, modin_df) def eval_to_file(tmp_dir, modin_obj, pandas_obj, fn, extension, **fn_kwargs): """ Test `fn` method of `modin_obj` and `pandas_obj`. Parameters ---------- tmp_dir : Union[str, Path] Temporary directory. modin_obj: Modin DataFrame or Series Object to test. pandas_obj: Pandas DataFrame or Series Object to test. fn: str Name of the method, that should be tested. extension: str Extension of the test file. 
""" unique_filename_modin = get_unique_filename(extension=extension, data_dir=tmp_dir) unique_filename_pandas = get_unique_filename(extension=extension, data_dir=tmp_dir) # parameter `max_retries=0` is set for `to_csv` function on Ray engine, # in order to increase the stability of tests, we repeat the call of # the entire function manually last_exception = None for _ in range(3): try: getattr(modin_obj, fn)(unique_filename_modin, **fn_kwargs) except EXCEPTIONS as err: last_exception = err continue break # If we do have an exception that's valid let's raise it if last_exception: raise last_exception getattr(pandas_obj, fn)(unique_filename_pandas, **fn_kwargs) assert assert_files_eq(unique_filename_modin, unique_filename_pandas) def eval_to_csv_file(tmp_dir, modin_obj, pandas_obj, extension, **kwargs): if extension is None: kwargs["mode"] = "t" kwargs["compression"] = "infer" modin_csv = modin_obj.to_csv(**kwargs) pandas_csv = pandas_obj.to_csv(**kwargs) if modin_csv == pandas_csv: return force_read = True modin_file = get_unique_filename(extension="csv", data_dir=tmp_dir) pandas_file = get_unique_filename(extension="csv", data_dir=tmp_dir) with open(modin_file, "w") as file: file.write(modin_csv) with open(pandas_file, "w") as file: file.write(pandas_csv) else: force_read = extension != "csv" or kwargs.get("compression", None) modin_file = get_unique_filename(extension=extension, data_dir=tmp_dir) pandas_file = get_unique_filename(extension=extension, data_dir=tmp_dir) modin_obj.to_csv(modin_file, **kwargs) pandas_obj.to_csv(pandas_file, **kwargs) if force_read or not assert_files_eq(modin_file, pandas_file): # If the files are not identical, make sure they can # be read by pandas and contains identical data. 
read_kwargs = {} if kwargs.get("index", None) is not False: read_kwargs["index_col"] = 0 if (value := kwargs.get("sep", None)) is not None: read_kwargs["sep"] = value if (value := kwargs.get("compression", None)) is not None: read_kwargs["compression"] = value modin_obj = pandas.read_csv(modin_file, **read_kwargs) pandas_obj = pandas.read_csv(pandas_file, **read_kwargs) df_equals(pandas_obj, modin_obj) @pytest.fixture def make_parquet_dir(tmp_path): def _make_parquet_dir( dfs_by_filename: Dict[str, pandas.DataFrame], row_group_size: int ): for filename, df in dfs_by_filename.items(): df.to_parquet( os.path.join(tmp_path, filename), row_group_size=row_group_size ) return tmp_path yield _make_parquet_dir @pytest.mark.usefixtures("TestReadCSVFixture") @pytest.mark.skipif( IsExperimental.get() and StorageFormat.get() == "Pyarrow", reason="Segmentation fault; see PR #2347 ffor details", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestCsv: # delimiter tests @pytest.mark.parametrize("sep", ["_", ",", "."]) @pytest.mark.parametrize("decimal", [".", "_"]) @pytest.mark.parametrize("thousands", [None, ",", "_", " "]) def test_read_csv_seps(self, make_csv_file, sep, decimal, thousands): unique_filename = make_csv_file( delimiter=sep, thousands_separator=thousands, decimal_separator=decimal, ) eval_io( fn_name="read_csv", # read_csv kwargs filepath_or_buffer=unique_filename, sep=sep, decimal=decimal, thousands=thousands, ) @pytest.mark.parametrize("sep", [None, "_"]) @pytest.mark.parametrize("delimiter", [".", "_"]) def test_read_csv_seps_except(self, make_csv_file, sep, delimiter): unique_filename = make_csv_file(delimiter=delimiter) eval_io( fn_name="read_csv", # read_csv kwargs filepath_or_buffer=unique_filename, delimiter=delimiter, sep=sep, expected_exception=ValueError( "Specified a sep and a delimiter; you can only specify one." 
), ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_csv_dtype_backend(self, make_csv_file, dtype_backend): unique_filename = make_csv_file() def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_csv", # read_csv kwargs filepath_or_buffer=unique_filename, dtype_backend=dtype_backend, comparator=comparator, ) # Column and Index Locations and Names tests @pytest.mark.parametrize("header", ["infer", None, 0]) @pytest.mark.parametrize("index_col", [None, "col1"]) @pytest.mark.parametrize( "names", [lib.no_default, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6"]] ) @pytest.mark.parametrize( "usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]] ) @pytest.mark.parametrize("skip_blank_lines", [True, False]) def test_read_csv_col_handling( self, header, index_col, names, usecols, skip_blank_lines, ): if names is lib.no_default: pytest.skip("some parameters combiantions fails: issue #2312") if header in ["infer", None] and names is not lib.no_default: pytest.skip( "Heterogeneous data in a column is not cast to a common type: issue #3346" ) eval_io( fn_name="read_csv", # read_csv kwargs filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"], header=header, index_col=index_col, names=names, usecols=usecols, skip_blank_lines=skip_blank_lines, # FIXME: https://github.com/modin-project/modin/issues/7035 expected_exception=False, ) @pytest.mark.parametrize("usecols", [lambda col_name: col_name in ["a", "b", "e"]]) def test_from_csv_with_callable_usecols(self, usecols): fname = "modin/tests/pandas/data/test_usecols.csv" pandas_df = pandas.read_csv(fname, usecols=usecols) modin_df = pd.read_csv(fname, usecols=usecols) df_equals(modin_df, pandas_df) # General Parsing Configuration @pytest.mark.parametrize("dtype", [None, True]) @pytest.mark.parametrize("engine", [None, "python", "c"]) @pytest.mark.parametrize( "converters", [ None, { "col1": lambda x: 
np.int64(x) * 10, "col2": pandas.to_datetime, "col4": lambda x: x.replace(":", ";"), }, ], ) @pytest.mark.parametrize("skipfooter", [0, 10]) def test_read_csv_parsing_1( self, dtype, engine, converters, skipfooter, ): if dtype: dtype = { col: "object" for col in pandas.read_csv( pytest.csvs_names["test_read_csv_regular"], nrows=1 ).columns } expected_exception = None if engine == "c" and skipfooter != 0: expected_exception = ValueError( "the 'c' engine does not support skipfooter" ) eval_io( fn_name="read_csv", expected_exception=expected_exception, check_kwargs_callable=not callable(converters), # read_csv kwargs filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], dtype=dtype, engine=engine, converters=converters, skipfooter=skipfooter, ) @pytest.mark.parametrize("header", ["infer", None, 0]) @pytest.mark.parametrize( "skiprows", [ 2, lambda x: x % 2, lambda x: x > 25, lambda x: x > 128, np.arange(10, 50), np.arange(10, 50, 2), ], ) @pytest.mark.parametrize("nrows", [35, None]) @pytest.mark.parametrize( "names", [ [f"c{col_number}" for col_number in range(4)], [f"c{col_number}" for col_number in range(6)], None, ], ) @pytest.mark.parametrize("encoding", ["latin1", "windows-1251", None]) def test_read_csv_parsing_2( self, make_csv_file, request, header, skiprows, nrows, names, encoding, ): if encoding: unique_filename = make_csv_file(encoding=encoding) else: unique_filename = pytest.csvs_names["test_read_csv_regular"] kwargs = { "filepath_or_buffer": unique_filename, "header": header, "skiprows": skiprows, "nrows": nrows, "names": names, "encoding": encoding, } if Engine.get() != "Python": df = pandas.read_csv(**dict(kwargs, nrows=1)) # in that case first partition will contain str if df[df.columns[0]][df.index[0]] in ["c1", "col1", "c3", "col3"]: pytest.xfail("read_csv incorrect output with float data - issue #2634") eval_io( fn_name="read_csv", expected_exception=None, check_kwargs_callable=not callable(skiprows), # read_csv kwargs **kwargs, ) 
# NOTE(review): every function below that takes ``self`` (one of them calls
# ``self._has_pandas_fallback_reason()``) is presumably a method of a test class
# whose header precedes this chunk; only ``_check_relative_io`` takes no ``self``.
# Re-indent the methods under that class when splicing - TODO confirm.


@pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None])
@pytest.mark.parametrize("false_values", [["No"], ["No", "false"], None])
@pytest.mark.parametrize("skipfooter", [0, 10])
@pytest.mark.parametrize("nrows", [35, None])
def test_read_csv_parsing_3(
    self,
    true_values,
    false_values,
    skipfooter,
    nrows,
):
    """Check ``read_csv`` with ``true_values``/``false_values``/``skipfooter``/``nrows``."""
    # TODO: Check #2446 as it was closed
    xfail_case = (false_values or true_values) and Engine.get() != "Python"
    if xfail_case:
        pytest.xfail("modin and pandas dataframes differs - issue #2446")

    expected_exception = None
    if skipfooter != 0 and nrows is not None:
        expected_exception = ValueError("'skipfooter' not supported with 'nrows'")
    eval_io(
        fn_name="read_csv",
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
        true_values=true_values,
        false_values=false_values,
        skipfooter=skipfooter,
        nrows=nrows,
    )


def test_read_csv_skipinitialspace(self):
    """Check ``read_csv(skipinitialspace=True)`` on data with spaces after delimiters."""
    with ensure_clean(".csv") as unique_filename:
        str_initial_spaces = (
            "col1,col2,col3,col4\n"
            + "five, six, seven, eight\n"
            + " five, six, seven, eight\n"
            + "five, six, seven, eight\n"
        )

        eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True)


# NA and Missing Data Handling tests
@pytest.mark.parametrize("na_values", ["custom_nan", "73"])
@pytest.mark.parametrize("keep_default_na", [True, False])
@pytest.mark.parametrize("na_filter", [True, False])
@pytest.mark.parametrize("verbose", [True, False])
@pytest.mark.parametrize("skip_blank_lines", [True, False])
def test_read_csv_nans_handling(
    self,
    na_values,
    keep_default_na,
    na_filter,
    verbose,
    skip_blank_lines,
):
    """Check the NA-related ``read_csv`` parameters against pandas."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        verbose=verbose,
        skip_blank_lines=skip_blank_lines,
    )


# Datetime Handling tests
@pytest.mark.parametrize(
    "parse_dates", [True, False, ["col2"], ["col2", "col4"], [1, 3]]
)
@pytest.mark.parametrize("infer_datetime_format", [True, False])
@pytest.mark.parametrize("keep_date_col", [True, False])
@pytest.mark.parametrize(
    "date_parser",
    [lib.no_default, lambda x: pandas.to_datetime(x, format="%Y-%m-%d")],
    ids=["default", "format-Ymd"],
)
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize("cache_dates", [True, False])
def test_read_csv_datetime(
    self,
    parse_dates,
    infer_datetime_format,
    keep_date_col,
    date_parser,
    dayfirst,
    cache_dates,
    request,
):
    """Check the datetime-related ``read_csv`` parameters against pandas.

    The ``format-Ymd`` parser combined with the ``parse_dates3`` or
    ``parse_dates4`` parametrization is expected to raise ``ValueError``.
    """
    expected_exception = None
    callspec_id = request.node.callspec.id
    if "format-Ymd" in callspec_id and (
        "parse_dates3" in callspec_id or "parse_dates4" in callspec_id
    ):
        msg = (
            'time data "00:00:00" doesn\'t match format "%Y-%m-%d", at position 0. You might want to try:\n'
            + " - passing `format` if your strings have a consistent format;\n"
            + " - passing `format='ISO8601'` if your strings are all ISO8601 "
            + "but not necessarily in exactly the same format;\n"
            + " - passing `format='mixed'`, and the format will be inferred "
            + "for each element individually. You might want to use `dayfirst` "
            + "alongside this."
        )
        expected_exception = ValueError(msg)
    eval_io(
        fn_name="read_csv",
        check_kwargs_callable=not callable(date_parser),
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        parse_dates=parse_dates,
        infer_datetime_format=infer_datetime_format,
        keep_date_col=keep_date_col,
        date_parser=date_parser,
        dayfirst=dayfirst,
        cache_dates=cache_dates,
    )


@pytest.mark.parametrize("date", ["2023-01-01 00:00:01.000000000", "2023"])
@pytest.mark.parametrize("dtype", [None, "str", {"id": "int64"}])
@pytest.mark.parametrize("parse_dates", [None, [], ["date"], [1]])
def test_read_csv_dtype_parse_dates(self, date, dtype, parse_dates):
    """Check ``read_csv`` when ``dtype`` and ``parse_dates`` are combined."""
    with ensure_clean(".csv") as filename:
        with open(filename, "w") as file:
            file.write(f"id,date\n1,{date}")
        eval_io(
            fn_name="read_csv",
            # read_csv kwargs
            filepath_or_buffer=filename,
            dtype=dtype,
            parse_dates=parse_dates,
        )


# Iteration tests
@pytest.mark.parametrize("iterator", [True, False])
def test_read_csv_iteration(self, iterator):
    """Check iteration, ``get_chunk`` and ``read`` on the chunked ``read_csv`` reader."""
    filename = pytest.csvs_names["test_read_csv_regular"]

    # Tests __next__ and correctness of reader as an iterator
    # Use larger chunksize to read through file quicker
    rdf_reader = pd.read_csv(filename, chunksize=500, iterator=iterator)
    pd_reader = pandas.read_csv(filename, chunksize=500, iterator=iterator)

    for modin_df, pd_df in zip(rdf_reader, pd_reader):
        df_equals(modin_df, pd_df)

    # Tests that get_chunk works correctly
    rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
    pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)

    modin_df = rdf_reader.get_chunk(1)
    pd_df = pd_reader.get_chunk(1)

    df_equals(modin_df, pd_df)

    # Tests that read works correctly
    rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
    pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)

    modin_df = rdf_reader.read()
    pd_df = pd_reader.read()

    df_equals(modin_df, pd_df)

    # Tests #6553
    if iterator:
        rdf_reader = pd.read_csv(filename, iterator=iterator)
        pd_reader = pandas.read_csv(filename, iterator=iterator)

        modin_df = rdf_reader.read()
        pd_df = pd_reader.read()

        df_equals(modin_df, pd_df)


@pytest.mark.parametrize("pathlike", [False, True])
def test_read_csv_encoding_976(self, pathlike):
    """Check reading a ``windows-1251`` file given as ``str`` or ``Path``."""
    file_name = "modin/tests/pandas/data/issue_976.csv"
    if pathlike:
        file_name = Path(file_name)
    names = [str(i) for i in range(11)]

    kwargs = {
        "sep": ";",
        "names": names,
        "encoding": "windows-1251",
    }
    df1 = pd.read_csv(file_name, **kwargs)
    df2 = pandas.read_csv(file_name, **kwargs)
    # these columns contain data of various types in partitions
    # see #1931 for details;
    df1 = df1.drop(["4", "5"], axis=1)
    df2 = df2.drop(["4", "5"], axis=1)

    df_equals(df1, df2)


# Quoting, Compression parameters tests
@pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
@pytest.mark.parametrize("encoding", [None, "latin8", "utf16"])
@pytest.mark.parametrize("engine", [None, "python", "c", "pyarrow"])
def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
    """Check ``read_csv`` over compression/encoding/engine combinations."""
    unique_filename = make_csv_file(encoding=encoding, compression=compression)
    expected_exception = None
    if encoding == "utf16" and compression in ("bz2", "xz"):
        expected_exception = UnicodeError("UTF-16 stream does not start with BOM")

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        compression=compression,
        encoding=encoding,
        engine=engine,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize(
    "encoding",
    [
        None,
        "ISO-8859-1",
        "latin1",
        "iso-8859-1",
        "cp1252",
        "utf8",
        pytest.param(
            "unicode_escape",
            marks=pytest.mark.skipif(
                condition=sys.version_info < (3, 9),
                reason="https://bugs.python.org/issue45461",
            ),
        ),
        "raw_unicode_escape",
        "utf_16_le",
        "utf_16_be",
        "utf32",
        "utf_32_le",
        "utf_32_be",
        "utf-8-sig",
    ],
)
def test_read_csv_encoding(self, make_csv_file, encoding):
    """Check ``read_csv`` on files written with each listed encoding."""
    unique_filename = make_csv_file(encoding=encoding)
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        encoding=encoding,
    )


@pytest.mark.parametrize("thousands", [None, ",", "_", " "])
@pytest.mark.parametrize("decimal", [".", "_"])
@pytest.mark.parametrize("lineterminator", [None, "x", "\n"])
@pytest.mark.parametrize("escapechar", [None, "d", "x"])
@pytest.mark.parametrize("dialect", ["test_csv_dialect", "use_dialect_name", None])
def test_read_csv_file_format(
    self,
    make_csv_file,
    thousands,
    decimal,
    lineterminator,
    escapechar,
    dialect,
):
    """Check the file-format ``read_csv`` parameters, with and without a dialect."""
    if dialect:
        test_csv_dialect_params = {
            "delimiter": "_",
            "doublequote": False,
            "escapechar": "\\",
            "quotechar": "d",
            "quoting": csv.QUOTE_ALL,
        }
        csv.register_dialect(dialect, **test_csv_dialect_params)
        if dialect != "use_dialect_name":
            # otherwise try with dialect name instead of `_csv.Dialect` object
            dialect = csv.get_dialect(dialect)
        unique_filename = make_csv_file(**test_csv_dialect_params)
    else:
        unique_filename = make_csv_file(
            thousands_separator=thousands,
            decimal_separator=decimal,
            escapechar=escapechar,
            lineterminator=lineterminator,
        )

    expected_exception = None
    if dialect is None:
        # FIXME: https://github.com/modin-project/modin/issues/7035
        expected_exception = False

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        thousands=thousands,
        decimal=decimal,
        lineterminator=lineterminator,
        escapechar=escapechar,
        dialect=dialect,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize(
    "quoting",
    [csv.QUOTE_ALL, csv.QUOTE_MINIMAL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE],
)
@pytest.mark.parametrize("quotechar", ['"', "_", "d"])
@pytest.mark.parametrize("doublequote", [True, False])
@pytest.mark.parametrize("comment", [None, "#", "x"])
def test_read_csv_quoting(
    self,
    make_csv_file,
    quoting,
    quotechar,
    doublequote,
    comment,
):
    """Check the quoting-related ``read_csv`` parameters against pandas."""
    # in these cases escapechar should be set, otherwise an error occurs:
    # "_csv.Error: need to escape, but no escapechar set"
    use_escapechar = (
        not doublequote and quotechar != '"' and quoting != csv.QUOTE_NONE
    )
    escapechar = "\\" if use_escapechar else None
    unique_filename = make_csv_file(
        quoting=quoting,
        quotechar=quotechar,
        doublequote=doublequote,
        escapechar=escapechar,
        comment_col_char=comment,
    )

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        quoting=quoting,
        quotechar=quotechar,
        doublequote=doublequote,
        escapechar=escapechar,
        comment=comment,
    )


# Error Handling parameters tests
@pytest.mark.skip(reason="https://github.com/modin-project/modin/issues/6239")
@pytest.mark.parametrize("on_bad_lines", ["error", "warn", "skip", None])
def test_read_csv_error_handling(self, on_bad_lines):
    """Check ``read_csv(on_bad_lines=...)`` on a file with malformed lines."""
    # in that case exceptions are raised both by Modin and pandas
    # and tests pass
    raise_exception_case = on_bad_lines is not None
    # TODO: Check #2500 as it was closed
    if not raise_exception_case and Engine.get() != "Python":
        pytest.xfail("read_csv doesn't raise `bad lines` exceptions - issue #2500")
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_bad_lines"],
        on_bad_lines=on_bad_lines,
    )


@pytest.mark.parametrize("float_precision", [None, "high", "legacy", "round_trip"])
def test_python_engine_float_precision_except(self, float_precision):
    """Expect ``ValueError`` when ``float_precision`` is set with the python engine."""
    expected_exception = None
    if float_precision is not None:
        expected_exception = ValueError(
            "The 'float_precision' option is not supported with the 'python' engine"
        )
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        engine="python",
        float_precision=float_precision,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("low_memory", [False, True])
def test_python_engine_low_memory_except(self, low_memory):
    """Expect ``ValueError`` when ``low_memory=False`` is used with the python engine."""
    expected_exception = None
    if not low_memory:
        expected_exception = ValueError(
            "The 'low_memory' option is not supported with the 'python' engine"
        )
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        engine="python",
        low_memory=low_memory,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_delim_whitespace(self, delim_whitespace, tmp_path):
    """Check ``read_csv(delim_whitespace=...)`` on whitespace-separated data."""
    str_delim_whitespaces = "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n"
    unique_filename = get_unique_filename(data_dir=tmp_path)
    eval_io_from_str(
        str_delim_whitespaces,
        unique_filename,
        delim_whitespace=delim_whitespace,
    )


# Internal parameters tests
@pytest.mark.parametrize("engine", ["c"])
@pytest.mark.parametrize("delimiter", [",", " "])
@pytest.mark.parametrize("low_memory", [True, False])
@pytest.mark.parametrize("memory_map", [True, False])
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
def test_read_csv_internal(
    self,
    make_csv_file,
    engine,
    delimiter,
    low_memory,
    memory_map,
    float_precision,
):
    """Check the internal ``read_csv`` parameters with the C engine."""
    unique_filename = make_csv_file(delimiter=delimiter)
    eval_io(
        filepath_or_buffer=unique_filename,
        fn_name="read_csv",
        engine=engine,
        delimiter=delimiter,
        low_memory=low_memory,
        memory_map=memory_map,
        float_precision=float_precision,
    )


# Issue related, specific or corner cases
@pytest.mark.parametrize("nrows", [2, None])
def test_read_csv_bad_quotes(self, nrows):
    """Check ``read_csv`` on data containing an unterminated quote."""
    csv_bad_quotes = (
        '1, 2, 3, 4\none, two, three, four\nfive, "six", seven, "eight\n'
    )

    with ensure_clean(".csv") as unique_filename:
        eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows)


def test_read_csv_categories(self):
    """Check ``read_csv`` with a ``category`` dtype column."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/test_categories.csv",
        names=["one", "two"],
        dtype={"one": "int64", "two": "category"},
    )


@pytest.mark.parametrize("encoding", [None, "utf-8"])
@pytest.mark.parametrize("encoding_errors", ["strict", "ignore"])
@pytest.mark.parametrize(
    "parse_dates",
    # `param_id` rather than `id`, so the builtin is not shadowed
    [
        pytest.param(value, id=param_id)
        for param_id, value in parse_dates_values_by_id.items()
    ],
)
@pytest.mark.parametrize("index_col", [None, 0, 5])
@pytest.mark.parametrize("header", ["infer", 0])
@pytest.mark.parametrize(
    "names",
    [
        None,
        [
            "timestamp",
            "year",
            "month",
            "date",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
    ],
)
@pytest.mark.exclude_in_sanity
def test_read_csv_parse_dates(
    self,
    names,
    header,
    index_col,
    parse_dates,
    encoding,
    encoding_errors,
    request,
):
    """Check ``read_csv(parse_dates=...)`` together with names/header/index_col.

    The parametrization ids containing ``nonexistent_int_column`` and
    ``nonexistent_string_column`` are expected to raise ``IndexError`` and
    ``ValueError`` respectively.
    """
    if names is not None and header == "infer":
        pytest.xfail(
            "read_csv with Ray engine works incorrectly with date data and names parameter provided - issue #2509"
        )

    expected_exception = None
    callspec_id = request.node.callspec.id
    if "nonexistent_int_column" in callspec_id:
        expected_exception = IndexError("list index out of range")
    elif "nonexistent_string_column" in callspec_id:
        expected_exception = ValueError(
            "Missing column provided to 'parse_dates': 'z'"
        )
    eval_io(
        fn_name="read_csv",
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer=time_parsing_csv_path,
        names=names,
        header=header,
        index_col=index_col,
        parse_dates=parse_dates,
        encoding=encoding,
        encoding_errors=encoding_errors,
    )


@pytest.mark.parametrize(
    "storage_options",
    [{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}, None],
)
@pytest.mark.xfail(
    reason="S3 file gone missing, see https://github.com/modin-project/modin/issues/4875"
)
def test_read_csv_s3(self, storage_options):
    """Check ``read_csv`` from an S3 URL with various ``storage_options``."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="s3://noaa-ghcn-pds/csv/1788.csv",
        storage_options=storage_options,
    )


@pytest.mark.xfail(
    reason="S3 file gone missing, see https://github.com/modin-project/modin/issues/7571"
)
def test_read_csv_s3_issue4658(self):
    """Check ``read_csv`` of the first rows of an anonymous-access S3 file."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv",
        nrows=10,
        storage_options={"anon": True},
    )


@pytest.mark.parametrize("names", [list("XYZ"), None])
@pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
def test_read_csv_skiprows_names(self, names, skiprows):
    """Check ``read_csv`` when ``skiprows`` and ``names`` are combined."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/issue_2239.csv",
        names=names,
        skiprows=skiprows,
    )


def _has_pandas_fallback_reason(self):
    """Return True unless the current engine is ``"Python"``."""
    # The Python engine does not use custom IO dispatchers, so specialized error messages
    # won't appear
    return Engine.get() != "Python"


def test_read_csv_default_to_pandas(self):
    """Check that reading from an in-memory buffer warns about defaulting to pandas."""
    warning_suffix = "buffers" if self._has_pandas_fallback_reason() else ""
    with warns_that_defaulting_to_pandas_if(
        not current_execution_is_native(), suffix=warning_suffix
    ):
        # This tests that we default to pandas on a buffer
        with open(pytest.csvs_names["test_read_csv_regular"], "r") as _f:
            pd.read_csv(StringIO(_f.read()))


def test_read_csv_url(self):
    """Check ``read_csv`` from an HTTPS URL."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="https://raw.githubusercontent.com/modin-project/modin/main/modin/tests/pandas/data/blah.csv",
    )


@pytest.mark.parametrize("nrows", [21, 5, None])
@pytest.mark.parametrize("skiprows", [4, 1, 500, None])
def test_read_csv_newlines_in_quotes(self, nrows, skiprows):
    """Check ``read_csv`` with ``nrows``/``skiprows`` on the ``newlines.csv`` data file.

    ``skiprows=500`` is expected to raise ``EmptyDataError``.
    """
    expected_exception = None
    if skiprows == 500:
        expected_exception = pandas.errors.EmptyDataError(
            "No columns to parse from file"
        )
    eval_io(
        fn_name="read_csv",
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/newlines.csv",
        nrows=nrows,
        skiprows=skiprows,
        cast_to_str=True,
    )


@pytest.mark.parametrize("skiprows", [None, 0, [], [1, 2], np.arange(0, 2)])
def test_read_csv_skiprows_with_usecols(self, skiprows):
    """Check ``read_csv`` when ``skiprows`` is combined with ``usecols`` and ``dtype``.

    The ``np.ndarray`` ``skiprows`` case is expected to raise ``ValueError``.
    """
    usecols = {"float_data": "float64"}
    expected_exception = None
    if isinstance(skiprows, np.ndarray):
        expected_exception = ValueError(
            "Usecols do not match columns, columns expected but not found: ['float_data']"
        )
    eval_io(
        fn_name="read_csv",
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/issue_4543.csv",
        skiprows=skiprows,
        usecols=usecols.keys(),
        dtype=usecols,
    )


def test_read_csv_sep_none(self):
    """Check ``read_csv(sep=None)``, which is expected to emit ``ParserWarning``."""
    eval_io(
        fn_name="read_csv",
        modin_warning=ParserWarning,
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        sep=None,
    )


def test_read_csv_incorrect_data(self):
    """Check ``read_csv`` when pointed at a ``.json`` data file."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/test_categories.json",
    )


@pytest.mark.parametrize(
    "kwargs",
    [
        {"names": [5, 1, 3, 4, 2, 6]},
        {"names": [0]},
        {"names": None, "usecols": [1, 0, 2]},
        {"names": [3, 1, 2, 5], "usecols": [4, 1, 3, 2]},
    ],
)
def test_read_csv_names_neq_num_cols(self, kwargs):
    """Check ``read_csv`` with the listed ``names``/``usecols`` combinations."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/issue_2074.csv",
        **kwargs,
    )


def test_read_csv_wrong_path(self):
    """Expect ``FileNotFoundError`` when the path does not exist."""
    expected_exception = FileNotFoundError(2, "No such file or directory")
    eval_io(
        fn_name="read_csv",
        expected_exception=expected_exception,
        # read_csv kwargs
        filepath_or_buffer="/some/wrong/path.csv",
    )


@pytest.mark.parametrize("extension", [None, "csv", "csv.gz"])
@pytest.mark.parametrize("sep", [" "])
@pytest.mark.parametrize("header", [False, True, "sfx-"])
@pytest.mark.parametrize("mode", ["w", "wb+"])
@pytest.mark.parametrize("idx_name", [None, "Index"])
@pytest.mark.parametrize("index", [True, False, "New index"])
@pytest.mark.parametrize("index_label", [None, False, "New index"])
@pytest.mark.parametrize("columns", [None, ["col1", "col3", "col5"]])
@pytest.mark.exclude_in_sanity
@pytest.mark.skipif(
    condition=Engine.get() == "Unidist" and os.name == "nt",
    reason="https://github.com/modin-project/modin/issues/6846",
)
def test_to_csv(
    self,
    tmp_path,
    extension,
    sep,
    header,
    mode,
    idx_name,
    index,
    index_label,
    columns,
):
    """Check ``DataFrame.to_csv`` over the parametrized writer options."""
    pandas_df = generate_dataframe(idx_name=idx_name)
    modin_df = pd.DataFrame(pandas_df)

    if isinstance(header, str):
        # A string `header` is used as a prefix for every written column name.
        written_columns = modin_df.columns if columns is None else columns
        header = [f"{header}{c}" for c in written_columns]

    eval_to_csv_file(
        tmp_path,
        modin_obj=modin_df,
        pandas_obj=pandas_df,
        extension=extension,
        sep=sep,
        header=header,
        mode=mode,
        index=index,
        index_label=index_label,
        columns=columns,
    )


@pytest.mark.skipif(
    condition=Engine.get() == "Unidist" and os.name == "nt",
    reason="https://github.com/modin-project/modin/issues/6846",
)
def test_dataframe_to_csv(self, tmp_path):
    """Check ``DataFrame.to_csv`` with default options."""
    pandas_df = pandas.read_csv(pytest.csvs_names["test_read_csv_regular"])
    modin_df = pd.DataFrame(pandas_df)
    eval_to_csv_file(
        tmp_path,
        modin_obj=modin_df,
        pandas_obj=pandas_df,
        extension="csv",
    )


@pytest.mark.skipif(
    condition=Engine.get() == "Unidist" and os.name == "nt",
    reason="https://github.com/modin-project/modin/issues/6846",
)
def test_series_to_csv(self, tmp_path):
    """Check ``Series.to_csv`` with default options."""
    pandas_s = pandas.read_csv(
        pytest.csvs_names["test_read_csv_regular"], usecols=["col1"]
    ).squeeze()
    modin_s = pd.Series(pandas_s)
    eval_to_csv_file(
        tmp_path,
        modin_obj=modin_s,
        pandas_obj=pandas_s,
        extension="csv",
    )


def test_read_csv_within_decorator(self):
    """Check that ``read_csv`` works when called from a decorated function."""

    @dummy_decorator()
    def wrapped_read_csv(file, method):
        if method == "pandas":
            return pandas.read_csv(file)

        if method == "modin":
            return pd.read_csv(file)

    pandas_df = wrapped_read_csv(
        pytest.csvs_names["test_read_csv_regular"], method="pandas"
    )
    modin_df = wrapped_read_csv(
        pytest.csvs_names["test_read_csv_regular"], method="modin"
    )

    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize(
    "read_mode",
    [
        "r",
        "rb",
    ],
)
@pytest.mark.parametrize("buffer_start_pos", [0, 10])
@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_csv_file_handle(
    self, read_mode, make_csv_file, buffer_start_pos, set_async_read_mode
):
    """Check ``read_csv`` on an open file handle positioned at ``buffer_start_pos``."""
    unique_filename = make_csv_file()
    with open(unique_filename, mode=read_mode) as buffer:
        buffer.seek(buffer_start_pos)
        pandas_df = pandas.read_csv(buffer)
        # rewind so that Modin reads from the same position pandas did
        buffer.seek(buffer_start_pos)
        modin_df = pd.read_csv(buffer)
    df_equals(modin_df, pandas_df)


@pytest.mark.skipif(
    current_execution_is_native(),
    reason="no partitions",
)
def test_unnamed_index(self):
    """Check index names stored in the first partition after ``read_csv(index_col=...)``."""

    def get_internal_df(df):
        # Use the argument itself rather than a variable captured from the
        # enclosing scope, so the helper inspects the frame it was given.
        partition = df._query_compiler._modin_frame._partitions[0][0]
        return partition.to_pandas()

    path = "modin/tests/pandas/data/issue_3119.csv"
    read_df = pd.read_csv(path, index_col=0)
    assert get_internal_df(read_df).index.name is None
    read_df = pd.read_csv(path, index_col=[0, 1])
    # Compare the whole list so a shorter `names` cannot pass vacuously.
    assert list(get_internal_df(read_df).index.names) == [None, "a"]


def test_read_csv_empty_frame(self):
    """Check ``read_csv`` when the only selected column is also the index."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        usecols=["col1"],
        index_col="col1",
    )


@pytest.mark.parametrize(
    "skiprows",
    [
        list(range(10)),
        list(range(5, 20)),
        list(range(0, 10, 2)),
        list(range(5, 20, 2)),
        lambda x: x % 2,
        lambda x: x > 20,
        lambda x: x < 20,
        lambda x: True,
        lambda x: x in [10, 20],
        lambda x: x << 10,
    ],
)
@pytest.mark.parametrize("header", ["infer", None, 0, 1, 150])
def test_read_csv_skiprows_corner_cases(self, skiprows, header):
    """Check ``read_csv`` with list-like and callable ``skiprows`` and various ``header``."""
    eval_io(
        fn_name="read_csv",
        check_kwargs_callable=not callable(skiprows),
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
        skiprows=skiprows,
        header=header,
        dtype="str",  # to avoid issues with heterogeneous data
        # FIXME: https://github.com/modin-project/modin/issues/7035
        expected_exception=False,
    )


def test_to_csv_with_index(self, tmp_path):
    """Check ``to_csv`` for a wide random frame whose ``key`` column is the index."""
    cols = 100
    arows = 20000
    keyrange = 100
    values = np.vstack(
        [
            np.random.choice(keyrange, size=(arows)),
            np.random.normal(size=(cols, arows)),
        ]
    ).transpose()
    # Built once and shared by both frames so the two cannot drift apart.
    columns = ["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)]
    modin_df = pd.DataFrame(values, columns=columns).set_index("key")
    pandas_df = pandas.DataFrame(values, columns=columns).set_index("key")
    eval_to_csv_file(tmp_path, modin_df, pandas_df, "csv")


@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_csv_issue_5150(self, set_async_read_mode):
    """Check ``read_csv(index_col=False)`` round-trip, in sync and async read modes."""
    with ensure_clean(".csv") as unique_filename:
        pandas_df = pandas.DataFrame(np.random.randint(0, 100, size=(2**6, 2**6)))
        pandas_df.to_csv(unique_filename, index=False)
        expected_pandas_df = pandas.read_csv(unique_filename, index_col=False)
        modin_df = pd.read_csv(unique_filename, index_col=False)
        actual_pandas_df = modin_df._to_pandas()
        if AsyncReadMode.get():
            # If read operations are asynchronous, then the dataframes
            # check should be inside `ensure_clean` context
            # because the file may be deleted before actual reading starts
            df_equals(expected_pandas_df, actual_pandas_df)
    if not AsyncReadMode.get():
        df_equals(expected_pandas_df, actual_pandas_df)


@pytest.mark.parametrize("usecols", [None, [0, 1, 2, 3, 4]])
def test_read_csv_1930(self, usecols):
    """Check ``read_csv`` with explicit ``names`` and positional ``usecols``."""
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/tests/pandas/data/issue_1930.csv",
        names=["c1", "c2", "c3", "c4", "c5"],
        usecols=usecols,
    )


def _check_relative_io(fn_name, unique_filename, path_arg, storage_default=()):
    """Check that reader ``fn_name`` handles a path given relative to ``~``.

    Parameters
    ----------
    fn_name : str
        Name of the reader function; passed to ``eval_io`` and looked up on ``pandas``.
    unique_filename : str
        Path of an existing file; its directory is used as the pinned home directory.
    path_arg : str
        Name of the keyword argument through which the reader receives the path.
    storage_default : container of str, default: ()
        Storage formats for which a default-to-pandas warning is expected.
    """
    # Windows can be funny at where it searches for ~; besides, Python >= 3.8 no longer honors %HOME%
    dirname, basename = os.path.split(unique_filename)
    pinned_home = {envvar: dirname for envvar in ("HOME", "USERPROFILE", "HOMEPATH")}
    should_default = Engine.get() == "Python" or StorageFormat.get() in storage_default
    with mock.patch.dict(os.environ, pinned_home):
        with warns_that_defaulting_to_pandas_if(should_default):
            eval_io(
                fn_name=fn_name,
                **{path_arg: f"~/{basename}"},
            )
        # check that when read without $HOME patched we have equivalent results
        # (the pandas reader is applied to the `~`-relative and the original path)
        eval_general(
            f"~/{basename}",
            unique_filename,
            lambda fname: getattr(pandas, fn_name)(**{path_arg: fname}),
        )


# Leave this test apart from the test classes, which skip the default to pandas
# warning check. We want to make sure we are NOT defaulting to pandas for a
# path relative to user home.
# TODO(https://github.com/modin-project/modin/issues/3655): Get rid of this
# comment once we turn all default to pandas messages into errors.
def test_read_csv_relative_to_user_home(make_csv_file): unique_filename = make_csv_file() _check_relative_io("read_csv", unique_filename, "filepath_or_buffer") @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestTable: def test_read_table(self, make_csv_file): unique_filename = make_csv_file(delimiter="\t") eval_io( fn_name="read_table", # read_table kwargs filepath_or_buffer=unique_filename, ) @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True) def test_read_table_within_decorator(self, make_csv_file, set_async_read_mode): @dummy_decorator() def wrapped_read_table(file, method): if method == "pandas": return pandas.read_table(file) if method == "modin": return pd.read_table(file) unique_filename = make_csv_file(delimiter="\t") pandas_df = wrapped_read_table(unique_filename, method="pandas") modin_df = wrapped_read_table(unique_filename, method="modin") df_equals(modin_df, pandas_df) def test_read_table_empty_frame(self, make_csv_file): unique_filename = make_csv_file(delimiter="\t") eval_io( fn_name="read_table", # read_table kwargs filepath_or_buffer=unique_filename, usecols=["col1"], index_col="col1", ) @pytest.mark.parametrize("engine", ["pyarrow", "fastparquet"]) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestParquet: @pytest.mark.parametrize("columns", [None, ["col1"]]) @pytest.mark.parametrize("row_group_size", [None, 100, 1000, 10_000]) @pytest.mark.parametrize("path_type", [Path, str]) def test_read_parquet( self, engine, make_parquet_file, columns, row_group_size, path_type ): self._test_read_parquet( engine=engine, make_parquet_file=make_parquet_file, columns=columns, filters=None, row_group_size=row_group_size, path_type=path_type, ) def _test_read_parquet( self, engine, make_parquet_file, columns, filters, row_group_size, path_type=str, range_index_start=0, range_index_step=1, range_index_name=None, expected_exception=None, ): if engine == "pyarrow" and filters == [] and os.name == 
"nt": # pyarrow, and therefore pandas using pyarrow, errors in this case. # Modin correctly replicates this behavior; however error cases # cause race conditions with ensure_clean on Windows. # TODO: Remove this once https://github.com/modin-project/modin/issues/6460 is fixed. pytest.xfail( "Skipping empty filters error case to avoid race condition - see #6460" ) with ensure_clean(".parquet") as unique_filename: unique_filename = path_type(unique_filename) make_parquet_file( filename=unique_filename, row_group_size=row_group_size, range_index_start=range_index_start, range_index_step=range_index_step, range_index_name=range_index_name, ) eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=unique_filename, columns=columns, filters=filters, expected_exception=expected_exception, ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_parquet_dtype_backend(self, engine, make_parquet_file, dtype_backend): with ensure_clean(".parquet") as unique_filename: make_parquet_file(filename=unique_filename, row_group_size=100) def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) expected_exception = None if engine == "fastparquet": expected_exception = ValueError( "The 'dtype_backend' argument is not supported for the fastparquet engine" ) eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=unique_filename, dtype_backend=dtype_backend, comparator=comparator, expected_exception=expected_exception, ) # Tests issue #6778 def test_read_parquet_no_extension(self, engine, make_parquet_file): with ensure_clean(".parquet") as unique_filename: # Remove the .parquet extension no_ext_fname = unique_filename[: unique_filename.index(".parquet")] make_parquet_file(filename=no_ext_fname) eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=no_ext_fname, ) @pytest.mark.parametrize( "filters", [None, [], [("col1", "==", 5)], [("col1", "<=", 215), 
("col2", ">=", 35)]], ) def test_read_parquet_filters(self, engine, make_parquet_file, filters): expected_exception = None if filters == [] and engine == "pyarrow": expected_exception = ValueError("Malformed filters") self._test_read_parquet( engine=engine, make_parquet_file=make_parquet_file, columns=None, filters=filters, row_group_size=100, path_type=str, expected_exception=expected_exception, ) @pytest.mark.parametrize("columns", [None, ["col1"]]) @pytest.mark.parametrize( "filters", [None, [("col1", "<=", 1_000_000)], [("col1", "<=", 75), ("col2", ">=", 35)]], ) @pytest.mark.parametrize( "range_index_start", [0, 5_000], ) @pytest.mark.parametrize( "range_index_step", [1, 10], ) @pytest.mark.parametrize( "range_index_name", [None, "my_index"], ) def test_read_parquet_range_index( self, engine, make_parquet_file, columns, filters, range_index_start, range_index_step, range_index_name, ): self._test_read_parquet( engine=engine, make_parquet_file=make_parquet_file, columns=columns, filters=filters, row_group_size=100, path_type=str, range_index_start=range_index_start, range_index_step=range_index_step, range_index_name=range_index_name, ) def test_read_parquet_list_of_files_5698(self, engine, make_parquet_file): if engine == "fastparquet" and os.name == "nt": pytest.xfail(reason="https://github.com/pandas-dev/pandas/issues/51720") with ensure_clean(".parquet") as f1, ensure_clean( ".parquet" ) as f2, ensure_clean(".parquet") as f3: for f in [f1, f2, f3]: make_parquet_file(filename=f) eval_io(fn_name="read_parquet", path=[f1, f2, f3], engine=engine) def test_read_parquet_indexing_by_column(self, tmp_path, engine, make_parquet_file): # Test indexing into a column of Modin with various parquet file row lengths. 
# Specifically, tests for https://github.com/modin-project/modin/issues/3527 # which fails when min_partition_size < nrows < min_partition_size * (num_partitions - 1) nrows = ( MinRowPartitionSize.get() + 1 ) # Use the minimal guaranteed failing value for nrows. unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path) make_parquet_file(filename=unique_filename, nrows=nrows) parquet_df = pd.read_parquet(unique_filename, engine=engine) for col in parquet_df.columns: parquet_df[col] @pytest.mark.parametrize("columns", [None, ["col1"]]) @pytest.mark.parametrize( "filters", [None, [("col1", "<=", 3_215), ("col2", ">=", 35)]], ) @pytest.mark.parametrize("row_group_size", [None, 100, 1000, 10_000]) @pytest.mark.parametrize( "rows_per_file", [[1000] * 40, [0, 0, 40_000], [10_000, 10_000] + [100] * 200] ) @pytest.mark.exclude_in_sanity def test_read_parquet_directory( self, engine, make_parquet_dir, columns, filters, row_group_size, rows_per_file ): self._test_read_parquet_directory( engine=engine, make_parquet_dir=make_parquet_dir, columns=columns, filters=filters, range_index_start=0, range_index_step=1, range_index_name=None, row_group_size=row_group_size, rows_per_file=rows_per_file, ) def _test_read_parquet_directory( self, engine, make_parquet_dir, columns, filters, range_index_start, range_index_step, range_index_name, row_group_size, rows_per_file, ): num_cols = DATASET_SIZE_DICT.get( TestDatasetSize.get(), DATASET_SIZE_DICT["Small"] ) dfs_by_filename = {} start_row = 0 for i, length in enumerate(rows_per_file): end_row = start_row + length df = pandas.DataFrame( {f"col{x + 1}": np.arange(start_row, end_row) for x in range(num_cols)}, ) index = pandas.RangeIndex( start=range_index_start, stop=range_index_start + (length * range_index_step), step=range_index_step, name=range_index_name, ) if ( range_index_start == 0 and range_index_step == 1 and range_index_name is None ): assert df.index.equals(index) else: df.index = index 
dfs_by_filename[f"{i}.parquet"] = df start_row = end_row path = make_parquet_dir(dfs_by_filename, row_group_size) # There are specific files that PyArrow will try to ignore by default # in a parquet directory. One example are files that start with '_'. Our # previous implementation tried to read all files in a parquet directory, # but we now make use of PyArrow to ensure the directory is valid. with open(os.path.join(path, "_committed_file"), "w+") as f: f.write("testingtesting") eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=path, columns=columns, filters=filters, ) @pytest.mark.parametrize( "filters", [None, [("col1", "<=", 1_000_000)], [("col1", "<=", 75), ("col2", ">=", 35)]], ) @pytest.mark.parametrize( "range_index_start", [0, 5_000], ) @pytest.mark.parametrize( "range_index_step", [1, 10], ) @pytest.mark.parametrize( "range_index_name", [None, "my_index"], ) @pytest.mark.parametrize("row_group_size", [None, 20]) def test_read_parquet_directory_range_index( self, engine, make_parquet_dir, filters, range_index_start, range_index_step, range_index_name, row_group_size, ): self._test_read_parquet_directory( engine=engine, make_parquet_dir=make_parquet_dir, columns=None, filters=filters, range_index_start=range_index_start, range_index_step=range_index_step, range_index_name=range_index_name, row_group_size=row_group_size, # We don't vary rows_per_file, but we choose a # tricky option: uneven with some empty files, # none divisible by the row_group_size. # We use a smaller total size than in other tests # to make this test run faster. 
rows_per_file=([250] + [0] * 10 + [25] * 10), ) @pytest.mark.parametrize( "filters", [None, [("col1", "<=", 1_000_000)], [("col1", "<=", 75), ("col2", ">=", 35)]], ) @pytest.mark.parametrize( "range_index_start", [0, 5_000], ) @pytest.mark.parametrize( "range_index_step", [1, 10], ) @pytest.mark.parametrize( "range_index_name", [None, "my_index"], ) def test_read_parquet_directory_range_index_consistent_metadata( self, engine, filters, range_index_start, range_index_step, range_index_name, tmp_path, ): num_cols = DATASET_SIZE_DICT.get( TestDatasetSize.get(), DATASET_SIZE_DICT["Small"] ) df = pandas.DataFrame( {f"col{x + 1}": np.arange(0, 500) for x in range(num_cols)}, ) index = pandas.RangeIndex( start=range_index_start, stop=range_index_start + (len(df) * range_index_step), step=range_index_step, name=range_index_name, ) if ( range_index_start == 0 and range_index_step == 1 and range_index_name is None ): assert df.index.equals(index) else: df.index = index path = get_unique_filename(extension=None, data_dir=tmp_path) table = pa.Table.from_pandas(df) pyarrow.dataset.write_dataset( table, path, format="parquet", max_rows_per_group=35, max_rows_per_file=100, ) # There are specific files that PyArrow will try to ignore by default # in a parquet directory. One example are files that start with '_'. Our # previous implementation tried to read all files in a parquet directory, # but we now make use of PyArrow to ensure the directory is valid. 
with open(os.path.join(path, "_committed_file"), "w+") as f: f.write("testingtesting") eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=path, filters=filters, ) @pytest.mark.parametrize("columns", [None, ["col1"]]) @pytest.mark.parametrize( "filters", [None, [], [("col1", "==", 5)], [("col1", "<=", 215), ("col2", ">=", 35)]], ) @pytest.mark.parametrize( "range_index_start", [0, 5_000], ) @pytest.mark.parametrize( "range_index_step", [1, 10], ) def test_read_parquet_partitioned_directory( self, tmp_path, make_parquet_file, columns, filters, range_index_start, range_index_step, engine, ): unique_filename = get_unique_filename(extension=None, data_dir=tmp_path) make_parquet_file( filename=unique_filename, partitioned_columns=["col1"], range_index_start=range_index_start, range_index_step=range_index_step, range_index_name="my_index", ) expected_exception = None if filters == [] and engine == "pyarrow": expected_exception = ValueError("Malformed filters") eval_io( fn_name="read_parquet", # read_parquet kwargs engine=engine, path=unique_filename, columns=columns, filters=filters, expected_exception=expected_exception, ) @pytest.mark.parametrize( "filters", [ None, [], [("B", "==", "a")], [ ("B", "==", "a"), ("A", ">=", 50_000), ("idx", "<=", 30_000), ("idx_categorical", "==", "y"), ], ], ) def test_read_parquet_pandas_index(self, engine, filters): if ( version.parse(pa.__version__) >= version.parse("12.0.0") and version.parse(pd.__version__) < version.parse("2.0.0") and engine == "pyarrow" ): pytest.xfail("incompatible versions; see #6072") # Ensure modin can read parquet files written by pandas with a non-RangeIndex object pandas_df = pandas.DataFrame( { "idx": np.random.randint(0, 100_000, size=2000), "idx_categorical": pandas.Categorical(["y", "z"] * 1000), # Can't do interval index right now because of this bug fix that is planned # to be apart of the pandas 1.5.0 release: https://github.com/pandas-dev/pandas/pull/46034 # "idx_interval": 
pandas.interval_range(start=0, end=2000), "idx_periodrange": pandas.period_range( start="2017-01-01", periods=2000 ), "A": np.random.randint(0, 100_000, size=2000), "B": ["a", "b"] * 1000, "C": ["c"] * 2000, } ) # Older versions of pyarrow do not support Arrow to Parquet # schema conversion for duration[ns] # https://issues.apache.org/jira/browse/ARROW-6780 if version.parse(pa.__version__) >= version.parse("8.0.0"): pandas_df["idx_timedelta"] = pandas.timedelta_range( start="1 day", periods=2000 ) # There is a non-deterministic bug in the fastparquet engine when we # try to set the index to the datetime column. Please see: # https://github.com/dask/fastparquet/issues/796 if engine == "pyarrow": pandas_df["idx_datetime"] = pandas.date_range( start="1/1/2018", periods=2000 ) for col in pandas_df.columns: if col.startswith("idx"): # Before this commit, first released in version 2023.1.0, fastparquet relied # on pandas private APIs to handle Categorical indices. # These private APIs broke in pandas 2. 
# https://github.com/dask/fastparquet/commit/cf60ae0e9a9ca57afc7a8da98d8c0423db1c0c53 if ( col == "idx_categorical" and engine == "fastparquet" and version.parse(fastparquet.__version__) < version.parse("2023.1.0") ): continue with ensure_clean(".parquet") as unique_filename: pandas_df.set_index(col).to_parquet(unique_filename) # read the same parquet using modin.pandas eval_io( "read_parquet", # read_parquet kwargs path=unique_filename, engine=engine, filters=filters, ) with ensure_clean(".parquet") as unique_filename: pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename) eval_io( "read_parquet", # read_parquet kwargs path=unique_filename, engine=engine, filters=filters, ) @pytest.mark.parametrize( "filters", [ None, [], [("B", "==", "a")], [("B", "==", "a"), ("A", ">=", 5), ("idx", "<=", 30_000)], ], ) def test_read_parquet_pandas_index_partitioned(self, tmp_path, engine, filters): # Ensure modin can read parquet files written by pandas with a non-RangeIndex object pandas_df = pandas.DataFrame( { "idx": np.random.randint(0, 100_000, size=2000), "A": np.random.randint(0, 10, size=2000), "B": ["a", "b"] * 1000, "C": ["c"] * 2000, } ) unique_filename = get_unique_filename(extension="parquet", data_dir=tmp_path) pandas_df.set_index("idx").to_parquet(unique_filename, partition_cols=["A"]) expected_exception = None if filters == [] and engine == "pyarrow": expected_exception = ValueError("Malformed filters") # read the same parquet using modin.pandas eval_io( "read_parquet", # read_parquet kwargs path=unique_filename, engine=engine, filters=filters, expected_exception=expected_exception, ) def test_read_parquet_hdfs(self, engine): eval_io( fn_name="read_parquet", # read_parquet kwargs path="modin/tests/pandas/data/hdfs.parquet", engine=engine, ) @pytest.mark.parametrize( "path_type", ["object", "directory", "url"], ) def test_read_parquet_s3(self, s3_resource, path_type, engine, s3_storage_options): s3_path = "s3://modin-test/modin-bugs/test_data.parquet" if 
path_type == "object": import s3fs fs = s3fs.S3FileSystem( endpoint_url=s3_storage_options["client_kwargs"]["endpoint_url"] ) with fs.open(s3_path, "rb") as file_obj: eval_io("read_parquet", path=file_obj, engine=engine) elif path_type == "directory": s3_path = "s3://modin-test/modin-bugs/test_data_dir.parquet" eval_io( "read_parquet", path=s3_path, storage_options=s3_storage_options, engine=engine, ) else: eval_io( "read_parquet", path=s3_path, storage_options=s3_storage_options, engine=engine, ) @pytest.mark.parametrize( "filters", [None, [], [("idx", "<=", 30_000)], [("idx", "<=", 30_000), ("A", ">=", 5)]], ) def test_read_parquet_without_metadata(self, tmp_path, engine, filters): """Test that Modin can read parquet files not written by pandas.""" from pyarrow import csv, parquet parquet_fname = get_unique_filename(extension="parquet", data_dir=tmp_path) csv_fname = get_unique_filename(extension="parquet", data_dir=tmp_path) pandas_df = pandas.DataFrame( { "idx": np.random.randint(0, 100_000, size=2000), "A": np.random.randint(0, 10, size=2000), "B": ["a", "b"] * 1000, "C": ["c"] * 2000, } ) pandas_df.to_csv(csv_fname, index=False) # read into pyarrow table and write it to a parquet file t = csv.read_csv(csv_fname) parquet.write_table(t, parquet_fname) expected_exception = None if filters == [] and engine == "pyarrow": expected_exception = ValueError("Malformed filters") eval_io( "read_parquet", # read_parquet kwargs path=parquet_fname, engine=engine, filters=filters, expected_exception=expected_exception, ) def test_read_empty_parquet_file(self, tmp_path, engine): test_df = pandas.DataFrame() path = tmp_path / "data" path.mkdir() test_df.to_parquet(path / "part-00000.parquet", engine=engine) eval_io(fn_name="read_parquet", path=path, engine=engine) @pytest.mark.parametrize( "compression_kwargs", [ pytest.param({}, id="no_compression_kwargs"), pytest.param({"compression": None}, id="compression=None"), pytest.param({"compression": "gzip"}, 
id="compression=gzip"), pytest.param({"compression": "snappy"}, id="compression=snappy"), pytest.param({"compression": "brotli"}, id="compression=brotli"), ], ) @pytest.mark.parametrize("extension", ["parquet", ".gz", ".bz2", ".zip", ".xz"]) def test_to_parquet(self, tmp_path, engine, compression_kwargs, extension): modin_df, pandas_df = create_test_dfs(TEST_DATA) parquet_eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_parquet", extension=extension, engine=engine, **compression_kwargs, ) def test_to_parquet_keep_index(self, tmp_path, engine): data = {"c0": [0, 1] * 1000, "c1": [2, 3] * 1000} modin_df, pandas_df = create_test_dfs(data) modin_df.index.name = "foo" pandas_df.index.name = "foo" parquet_eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_parquet", extension="parquet", index=True, engine=engine, ) def test_to_parquet_s3(self, s3_resource, engine, s3_storage_options): # use utils_test_data because it spans multiple partitions modin_path = "s3://modin-test/modin-dir/modin_df.parquet" mdf, pdf = create_test_dfs(utils_test_data["int_data"]) pdf.to_parquet( "s3://modin-test/pandas-dir/pandas_df.parquet", engine=engine, storage_options=s3_storage_options, ) mdf.to_parquet(modin_path, engine=engine, storage_options=s3_storage_options) df_equals( pandas.read_parquet( "s3://modin-test/pandas-dir/pandas_df.parquet", storage_options=s3_storage_options, ), pd.read_parquet(modin_path, storage_options=s3_storage_options), ) # check we're not creating local file: # https://github.com/modin-project/modin/issues/5888 assert not os.path.isdir(modin_path) def test_read_parquet_2462(self, tmp_path, engine): test_df = pandas.DataFrame({"col1": [["ad_1", "ad_2"], ["ad_3"]]}) path = tmp_path / "data" path.mkdir() test_df.to_parquet(path / "part-00000.parquet", engine=engine) read_df = pd.read_parquet(path, engine=engine) df_equals(test_df, read_df) def test_read_parquet_5767(self, tmp_path, engine): test_df = pandas.DataFrame({"a": 
[1, 2, 3, 4], "b": [1, 1, 2, 2]}) path = tmp_path / "data" path.mkdir() file_name = "modin_issue#0000.parquet" test_df.to_parquet(path / file_name, engine=engine, partition_cols=["b"]) read_df = pd.read_parquet(path / file_name) # both Modin and pandas read column "b" as a category df_equals(test_df, read_df.astype("int64")) @pytest.mark.parametrize("index", [False, True]) def test_read_parquet_6855(self, tmp_path, engine, index): if engine == "fastparquet": pytest.skip("integer columns aren't supported") test_df = pandas.DataFrame(np.random.rand(10**2, 10)) path = tmp_path / "data" path.mkdir() file_name = "issue6855.parquet" test_df.to_parquet(path / file_name, index=index, engine=engine) read_df = pd.read_parquet(path / file_name, engine=engine) if not index: # In that case pyarrow cannot preserve index dtype read_df.columns = pandas.Index(read_df.columns).astype("int64").to_list() df_equals(test_df, read_df) def test_read_parquet_s3_with_column_partitioning( self, s3_resource, engine, s3_storage_options ): # https://github.com/modin-project/modin/issues/4636 s3_path = "s3://modin-test/modin-bugs/issue5159.parquet" eval_io( fn_name="read_parquet", path=s3_path, engine=engine, storage_options=s3_storage_options, ) # Leave this test apart from the test classes, which skip the default to pandas # warning check. We want to make sure we are NOT defaulting to pandas for a # path relative to user home. # TODO(https://github.com/modin-project/modin/issues/3655): Get rid of this # commment once we turn all default to pandas messages into errors. 
def test_read_parquet_relative_to_user_home(make_parquet_file): with ensure_clean(".parquet") as unique_filename: make_parquet_file(filename=unique_filename) _check_relative_io("read_parquet", unique_filename, "path") @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestJson: @pytest.mark.parametrize("pathlike", [False, True]) @pytest.mark.parametrize("lines", [False, True]) def test_read_json(self, make_json_file, lines, pathlike): unique_filename = make_json_file(lines=lines) eval_io( fn_name="read_json", # read_json kwargs path_or_buf=Path(unique_filename) if pathlike else unique_filename, lines=lines, ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_json_dtype_backend(self, make_json_file, dtype_backend): def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_json", # read_json kwargs path_or_buf=make_json_file(lines=True), lines=True, dtype_backend=dtype_backend, comparator=comparator, ) @pytest.mark.parametrize( "storage_options_extra", [{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}], ) def test_read_json_s3(self, s3_resource, s3_storage_options, storage_options_extra): s3_path = "s3://modin-test/modin-bugs/test_data.json" expected_exception = None if "anon" in storage_options_extra: expected_exception = PermissionError("Forbidden") eval_io( fn_name="read_json", path_or_buf=s3_path, lines=True, orient="records", storage_options=s3_storage_options | storage_options_extra, expected_exception=expected_exception, ) def test_read_json_categories(self): eval_io( fn_name="read_json", # read_json kwargs path_or_buf="modin/tests/pandas/data/test_categories.json", dtype={"one": "int64", "two": "category"}, ) def test_read_json_different_columns(self): with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): eval_io( fn_name="read_json", # read_json kwargs 
path_or_buf="modin/tests/pandas/data/test_different_columns_in_rows.json", lines=True, ) @pytest.mark.parametrize( "data", [json_short_string, json_short_bytes, json_long_string, json_long_bytes], ) def test_read_json_string_bytes(self, data): with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): modin_df = pd.read_json(data) # For I/O objects we need to rewind to reuse the same object. if hasattr(data, "seek"): data.seek(0) df_equals(modin_df, pandas.read_json(data)) def test_to_json(self, tmp_path): modin_df, pandas_df = create_test_dfs(TEST_DATA) eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_json", extension="json", ) @pytest.mark.parametrize( "read_mode", [ "r", "rb", ], ) def test_read_json_file_handle(self, make_json_file, read_mode): with open(make_json_file(), mode=read_mode) as buf: df_pandas = pandas.read_json(buf) buf.seek(0) df_modin = pd.read_json(buf) df_equals(df_pandas, df_modin) @pytest.mark.skipif( current_execution_is_native(), reason="no partitions", ) def test_read_json_metadata(self, make_json_file): # `lines=True` is for triggering Modin implementation, # `orient="records"` should be set if `lines=True` df = pd.read_json( make_json_file(ncols=80, lines=True), lines=True, orient="records" ) parts_width_cached = df._query_compiler._modin_frame._column_widths_cache num_splits = len(df._query_compiler._modin_frame._partitions[0]) parts_width_actual = [ len(df._query_compiler._modin_frame._partitions[0][i].get().columns) for i in range(num_splits) ] assert parts_width_cached == parts_width_actual @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestExcel: @check_file_leaks @pytest.mark.parametrize("pathlike", [False, True]) def test_read_excel(self, pathlike, make_excel_file): unique_filename = make_excel_file() eval_io( fn_name="read_excel", # read_excel kwargs io=Path(unique_filename) if pathlike else unique_filename, ) @check_file_leaks @pytest.mark.parametrize("skiprows", [2, 
[1, 3], lambda x: x in [0, 2]]) def test_read_excel_skiprows(self, skiprows, make_excel_file): eval_io( fn_name="read_excel", # read_excel kwargs io=make_excel_file(), skiprows=skiprows, check_kwargs_callable=False, ) @check_file_leaks @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_excel_dtype_backend(self, make_excel_file, dtype_backend): def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_excel", # read_csv kwargs io=make_excel_file(), dtype_backend=dtype_backend, comparator=comparator, ) @check_file_leaks def test_read_excel_engine(self, make_excel_file): eval_io( fn_name="read_excel", modin_warning=(UserWarning if StorageFormat.get() == "Pandas" else None), # read_excel kwargs io=make_excel_file(), engine="openpyxl", ) @check_file_leaks def test_read_excel_index_col(self, make_excel_file): eval_io( fn_name="read_excel", modin_warning=(UserWarning if StorageFormat.get() == "Pandas" else None), # read_excel kwargs io=make_excel_file(), index_col=0, ) @check_file_leaks def test_read_excel_all_sheets(self, make_excel_file): unique_filename = make_excel_file() pandas_df = pandas.read_excel(unique_filename, sheet_name=None) modin_df = pd.read_excel(unique_filename, sheet_name=None) assert isinstance(pandas_df, dict) assert isinstance(modin_df, type(pandas_df)) assert pandas_df.keys() == modin_df.keys() for key in pandas_df.keys(): df_equals(modin_df.get(key), pandas_df.get(key)) # TODO: Check pandas gh-#39250 as it was fixed @pytest.mark.xfail( (StorageFormat.get() == "Pandas" and Engine.get() != "Python"), reason="pandas throws the exception. 
See pandas issue #39250 for more info", ) @check_file_leaks def test_read_excel_sheetname_title(self): eval_io( fn_name="read_excel", # read_excel kwargs io="modin/tests/pandas/data/excel_sheetname_title.xlsx", # FIXME: https://github.com/modin-project/modin/issues/7036 expected_exception=False, ) @check_file_leaks def test_excel_empty_line(self): path = "modin/tests/pandas/data/test_emptyline.xlsx" modin_df = pd.read_excel(path) assert str(modin_df) @check_file_leaks def test_read_excel_empty_rows(self): # Test parsing empty rows in middle of excel dataframe as NaN values eval_io( fn_name="read_excel", io="modin/tests/pandas/data/test_empty_rows.xlsx", ) @check_file_leaks def test_read_excel_border_rows(self): # Test parsing border rows as NaN values in excel dataframe eval_io( fn_name="read_excel", io="modin/tests/pandas/data/test_border_rows.xlsx", ) @check_file_leaks def test_read_excel_every_other_nan(self): # Test for reading excel dataframe with every other row as a NaN value eval_io( fn_name="read_excel", io="modin/tests/pandas/data/every_other_row_nan.xlsx", ) @check_file_leaks def test_read_excel_header_none(self): eval_io( fn_name="read_excel", io="modin/tests/pandas/data/every_other_row_nan.xlsx", header=None, ) @pytest.mark.parametrize( "sheet_name", [ "Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3, ], ) @check_file_leaks def test_read_excel_sheet_name(self, sheet_name): eval_io( fn_name="read_excel", # read_excel kwargs io="modin/tests/pandas/data/modin_error_book.xlsx", sheet_name=sheet_name, # https://github.com/modin-project/modin/issues/5965 comparator_kwargs={"check_dtypes": False}, ) def test_ExcelFile(self, make_excel_file): unique_filename = make_excel_file() modin_excel_file = pd.ExcelFile(unique_filename) pandas_excel_file = pandas.ExcelFile(unique_filename) try: df_equals(modin_excel_file.parse(), pandas_excel_file.parse()) assert modin_excel_file.io == unique_filename finally: modin_excel_file.close() 
pandas_excel_file.close() def test_ExcelFile_bytes(self, make_excel_file): unique_filename = make_excel_file() with open(unique_filename, mode="rb") as f: content = f.read() modin_excel_file = pd.ExcelFile(content) pandas_excel_file = pandas.ExcelFile(content) df_equals(modin_excel_file.parse(), pandas_excel_file.parse()) def test_read_excel_ExcelFile(self, make_excel_file): unique_filename = make_excel_file() with open(unique_filename, mode="rb") as f: content = f.read() modin_excel_file = pd.ExcelFile(content) pandas_excel_file = pandas.ExcelFile(content) df_equals(pd.read_excel(modin_excel_file), pandas.read_excel(pandas_excel_file)) @pytest.mark.parametrize("use_bytes_io", [False, True]) def test_read_excel_bytes(self, use_bytes_io, make_excel_file): unique_filename = make_excel_file() with open(unique_filename, mode="rb") as f: io_bytes = f.read() if use_bytes_io: io_bytes = BytesIO(io_bytes) eval_io( fn_name="read_excel", # read_excel kwargs io=io_bytes, ) def test_read_excel_file_handle(self, make_excel_file): unique_filename = make_excel_file() with open(unique_filename, mode="rb") as f: eval_io( fn_name="read_excel", # read_excel kwargs io=f, ) @pytest.mark.xfail(strict=False, reason="Flaky test, defaults to pandas") def test_to_excel(self, tmp_path): modin_df, pandas_df = create_test_dfs(TEST_DATA) unique_filename_modin = get_unique_filename(extension="xlsx", data_dir=tmp_path) unique_filename_pandas = get_unique_filename( extension="xlsx", data_dir=tmp_path ) modin_writer = pandas.ExcelWriter(unique_filename_modin) pandas_writer = pandas.ExcelWriter(unique_filename_pandas) modin_df.to_excel(modin_writer) pandas_df.to_excel(pandas_writer) modin_writer.save() pandas_writer.save() assert assert_files_eq(unique_filename_modin, unique_filename_pandas) @check_file_leaks def test_read_excel_empty_frame(self, make_excel_file): eval_io( fn_name="read_excel", modin_warning=(UserWarning if StorageFormat.get() == "Pandas" else None), # read_excel kwargs 
io=make_excel_file(), usecols=[0], index_col=0, ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestHdf: @pytest.mark.parametrize("format", [None, "table"]) def test_read_hdf(self, make_hdf_file, format): eval_io( fn_name="read_hdf", # read_hdf kwargs path_or_buf=make_hdf_file(format=format), key="df", ) def test_HDFStore(self, tmp_path): unique_filename_modin = get_unique_filename(extension="hdf", data_dir=tmp_path) unique_filename_pandas = get_unique_filename(extension="hdf", data_dir=tmp_path) modin_store = pd.HDFStore(unique_filename_modin) pandas_store = pandas.HDFStore(unique_filename_pandas) modin_df, pandas_df = create_test_dfs(TEST_DATA) modin_store["foo"] = modin_df pandas_store["foo"] = pandas_df modin_df = modin_store.get("foo") pandas_df = pandas_store.get("foo") df_equals(modin_df, pandas_df) modin_store.close() pandas_store.close() modin_df = pandas.read_hdf(unique_filename_modin, key="foo", mode="r") pandas_df = pandas.read_hdf(unique_filename_pandas, key="foo", mode="r") df_equals(modin_df, pandas_df) assert isinstance(modin_store, pd.HDFStore) with ensure_clean(".hdf5") as hdf_file: with pd.HDFStore(hdf_file, mode="w") as store: store.append("data/df1", pd.DataFrame(np.random.randn(5, 5))) store.append("data/df2", pd.DataFrame(np.random.randn(4, 4))) modin_df = pd.read_hdf(hdf_file, key="data/df1", mode="r") pandas_df = pandas.read_hdf(hdf_file, key="data/df1", mode="r") df_equals(modin_df, pandas_df) def test_HDFStore_in_read_hdf(self): with ensure_clean(".hdf") as filename: dfin = pd.DataFrame(np.random.rand(8, 8)) dfin.to_hdf(filename, "/key") with pd.HDFStore(filename) as h: modin_df = pd.read_hdf(h, "/key") with pandas.HDFStore(filename) as h: pandas_df = pandas.read_hdf(h, "/key") df_equals(modin_df, pandas_df) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestSql: @pytest.mark.parametrize("read_sql_engine", ["Pandas", "Connectorx"]) def test_read_sql(self, tmp_path, make_sql_connection, 
read_sql_engine): filename = get_unique_filename(".db") table = "test_read_sql" conn = make_sql_connection(tmp_path / filename, table) query = f"select * from {table}" eval_io( fn_name="read_sql", # read_sql kwargs sql=query, con=conn, ) eval_io( fn_name="read_sql", # read_sql kwargs sql=query, con=conn, index_col="index", ) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.read_sql_query(query, conn) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.read_sql_table(table, conn) # Test SQLAlchemy engine sqlalchemy_engine = sa.create_engine(conn) eval_io( fn_name="read_sql", # read_sql kwargs sql=query, con=sqlalchemy_engine, ) # Test SQLAlchemy Connection sqlalchemy_connection = sqlalchemy_engine.connect() eval_io( fn_name="read_sql", # read_sql kwargs sql=query, con=sqlalchemy_connection, ) old_sql_engine = ReadSqlEngine.get() ReadSqlEngine.put(read_sql_engine) if ReadSqlEngine.get() == "Connectorx": modin_df = pd.read_sql(sql=query, con=conn) else: modin_df = pd.read_sql( sql=query, con=ModinDatabaseConnection("sqlalchemy", conn) ) ReadSqlEngine.put(old_sql_engine) pandas_df = pandas.read_sql(sql=query, con=sqlalchemy_connection) df_equals(modin_df, pandas_df) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_sql_dtype_backend(self, tmp_path, make_sql_connection, dtype_backend): filename = get_unique_filename(extension="db") table = "test_read_sql_dtype_backend" conn = make_sql_connection(tmp_path / filename, table) query = f"select * from {table}" def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_sql", # read_sql kwargs sql=query, con=conn, dtype_backend=dtype_backend, comparator=comparator, ) @pytest.mark.skipif( not TestReadFromSqlServer.get(), reason="Skip the test when the test SQL server is not set up.", ) def test_read_sql_from_sql_server(self): table_name = "test_1000x256" query = f"SELECT * 
FROM {table_name}" sqlalchemy_connection_string = ( "mssql+pymssql://sa:Strong.Pwd-123@0.0.0.0:1433/master" ) pandas_df_to_read = pandas.DataFrame( np.arange( 1000 * 256, ).reshape(1000, 256) ).add_prefix("col") pandas_df_to_read.to_sql( table_name, sqlalchemy_connection_string, if_exists="replace" ) modin_df = pd.read_sql( query, ModinDatabaseConnection("sqlalchemy", sqlalchemy_connection_string), ) pandas_df = pandas.read_sql(query, sqlalchemy_connection_string) df_equals(modin_df, pandas_df) @pytest.mark.skipif( not TestReadFromPostgres.get(), reason="Skip the test when the postgres server is not set up.", ) def test_read_sql_from_postgres(self): table_name = "test_1000x256" query = f"SELECT * FROM {table_name}" connection = "postgresql://sa:Strong.Pwd-123@localhost:2345/postgres" pandas_df_to_read = pandas.DataFrame( np.arange( 1000 * 256, ).reshape(1000, 256) ).add_prefix("col") pandas_df_to_read.to_sql(table_name, connection, if_exists="replace") modin_df = pd.read_sql( query, ModinDatabaseConnection("psycopg2", connection), ) pandas_df = pandas.read_sql(query, connection) df_equals(modin_df, pandas_df) def test_invalid_modin_database_connections(self): with pytest.raises(UnsupportedDatabaseException): ModinDatabaseConnection("unsupported_database") def test_read_sql_with_chunksize(self, make_sql_connection): filename = get_unique_filename(extension="db") table = "test_read_sql_with_chunksize" conn = make_sql_connection(filename, table) query = f"select * from {table}" pandas_gen = pandas.read_sql(query, conn, chunksize=10) modin_gen = pd.read_sql(query, conn, chunksize=10) for modin_df, pandas_df in zip(modin_gen, pandas_gen): df_equals(modin_df, pandas_df) @pytest.mark.parametrize("index", [False, True]) @pytest.mark.parametrize("conn_type", ["str", "sqlalchemy", "sqlalchemy+connect"]) def test_to_sql(self, tmp_path, make_sql_connection, index, conn_type): table_name = f"test_to_sql_{str(index)}" modin_df, pandas_df = create_test_dfs(TEST_DATA) # We do not 
pass the table name so the fixture won't generate a table conn = make_sql_connection(tmp_path / f"{table_name}_modin.db") if conn_type.startswith("sqlalchemy"): conn = sa.create_engine(conn) if conn_type == "sqlalchemy+connect": conn = conn.connect() modin_df.to_sql(table_name, conn, index=index) df_modin_sql = pandas.read_sql( table_name, con=conn, index_col="index" if index else None ) # We do not pass the table name so the fixture won't generate a table conn = make_sql_connection(tmp_path / f"{table_name}_pandas.db") if conn_type.startswith("sqlalchemy"): conn = sa.create_engine(conn) if conn_type == "sqlalchemy+connect": conn = conn.connect() pandas_df.to_sql(table_name, conn, index=index) df_pandas_sql = pandas.read_sql( table_name, con=conn, index_col="index" if index else None ) assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index()) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestHtml: @pytest.mark.skipif( platform.system() == "Windows", reason="https://github.com/modin-project/modin/issues/7497", ) def test_read_html(self, make_html_file): eval_io(fn_name="read_html", io=make_html_file()) def test_to_html(self, tmp_path): modin_df, pandas_df = create_test_dfs(TEST_DATA) eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_html", extension="html", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestFwf: @pytest.mark.parametrize("pathlike", [False, True]) def test_fwf_file(self, make_fwf_file, pathlike): fwf_data = ( "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3\n" ) unique_filename = make_fwf_file(fwf_data=fwf_data) colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] df = pd.read_fwf( Path(unique_filename) if pathlike else unique_filename, colspecs=colspecs, header=None, index_col=0, ) assert isinstance(df, pd.DataFrame) 
@pytest.mark.parametrize( "kwargs", [ { "colspecs": [ (0, 11), (11, 15), (19, 24), (27, 32), (35, 40), (43, 48), (51, 56), (59, 64), (67, 72), (75, 80), (83, 88), (91, 96), (99, 104), (107, 112), ], "names": ["stationID", "year", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "na_values": ["-9999"], "index_col": ["stationID", "year"], }, { "widths": [20, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], "names": ["id", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "index_col": [0], }, ], ) def test_fwf_file_colspecs_widths(self, make_fwf_file, kwargs): unique_filename = make_fwf_file() modin_df = pd.read_fwf(unique_filename, **kwargs) pandas_df = pd.read_fwf(unique_filename, **kwargs) df_equals(modin_df, pandas_df) @pytest.mark.parametrize( "usecols", [ ["a"], ["a", "b", "d"], [0, 1, 3], ], ) def test_fwf_file_usecols(self, make_fwf_file, usecols): fwf_data = ( "a b c d\n" + "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3\n" ) eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=make_fwf_file(fwf_data=fwf_data), usecols=usecols, ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_fwf_dtype_backend(self, make_fwf_file, dtype_backend): unique_filename = make_fwf_file() def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_fwf", # read_csv kwargs filepath_or_buffer=unique_filename, dtype_backend=dtype_backend, comparator=comparator, ) def test_fwf_file_chunksize(self, make_fwf_file): unique_filename = make_fwf_file() # Tests __next__ and correctness of reader as an iterator rdf_reader = pd.read_fwf(unique_filename, chunksize=5) pd_reader = pandas.read_fwf(unique_filename, chunksize=5) for modin_df, pd_df in zip(rdf_reader, pd_reader): df_equals(modin_df, pd_df) # Tests that get_chunk works correctly rdf_reader = 
pd.read_fwf(unique_filename, chunksize=1) pd_reader = pandas.read_fwf(unique_filename, chunksize=1) modin_df = rdf_reader.get_chunk(1) pd_df = pd_reader.get_chunk(1) df_equals(modin_df, pd_df) # Tests that read works correctly rdf_reader = pd.read_fwf(unique_filename, chunksize=1) pd_reader = pandas.read_fwf(unique_filename, chunksize=1) modin_df = rdf_reader.read() pd_df = pd_reader.read() df_equals(modin_df, pd_df) @pytest.mark.parametrize("nrows", [13, None]) def test_fwf_file_skiprows(self, make_fwf_file, nrows): unique_filename = make_fwf_file() eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=unique_filename, skiprows=2, nrows=nrows, ) eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=unique_filename, usecols=[0, 4, 7], skiprows=[2, 5], nrows=nrows, ) def test_fwf_file_index_col(self, make_fwf_file): fwf_data = ( "a b c d\n" + "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3\n" ) eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=make_fwf_file(fwf_data=fwf_data), index_col="c", ) def test_fwf_file_skipfooter(self, make_fwf_file): eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=make_fwf_file(), skipfooter=2, ) def test_fwf_file_parse_dates(self, make_fwf_file): dates = pandas.date_range("2000", freq="h", periods=10) fwf_data = "col1 col2 col3 col4" for i in range(10, 20): fwf_data = fwf_data + "\n{col1} {col2} {col3} {col4}".format( col1=str(i), col2=str(dates[i - 10].date()), col3=str(i), col4=str(dates[i - 10].time()), ) unique_filename = make_fwf_file(fwf_data=fwf_data) eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=unique_filename, parse_dates=[["col2", "col4"]], ) eval_io( fn_name="read_fwf", # read_fwf kwargs filepath_or_buffer=unique_filename, parse_dates={"time": ["col2", "col4"]}, ) @pytest.mark.parametrize( 
"read_mode", [ "r", "rb", ], ) def test_read_fwf_file_handle(self, make_fwf_file, read_mode): with open(make_fwf_file(), mode=read_mode) as buffer: df_pandas = pandas.read_fwf(buffer) buffer.seek(0) df_modin = pd.read_fwf(buffer) df_equals(df_modin, df_pandas) def test_read_fwf_empty_frame(self, make_fwf_file): kwargs = { "usecols": [0], "index_col": 0, } unique_filename = make_fwf_file() modin_df = pd.read_fwf(unique_filename, **kwargs) pandas_df = pandas.read_fwf(unique_filename, **kwargs) df_equals(modin_df, pandas_df) @pytest.mark.parametrize( "storage_options_extra", [{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}], ) def test_read_fwf_s3(self, s3_resource, s3_storage_options, storage_options_extra): expected_exception = None if "anon" in storage_options_extra: expected_exception = PermissionError("Forbidden") eval_io( fn_name="read_fwf", filepath_or_buffer="s3://modin-test/modin-bugs/test_data.fwf", storage_options=s3_storage_options | storage_options_extra, expected_exception=expected_exception, ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestGbq: @pytest.mark.skip(reason="Can not pass without GBQ access") def test_read_gbq(self): # Test API, but do not supply credentials until credits can be secured. with pytest.raises( ValueError, match="Could not determine project ID and one was not supplied." ): pd.read_gbq("SELECT 1") @pytest.mark.skip(reason="Can not pass without GBQ access") def test_to_gbq(self): modin_df, _ = create_test_dfs(TEST_DATA) # Test API, but do not supply credentials until credits can be secured. with pytest.raises( ValueError, match="Could not determine project ID and one was not supplied." 
): modin_df.to_gbq("modin.table") def test_read_gbq_mock(self): test_args = ("fake_query",) test_kwargs = inspect.signature(pd.read_gbq).parameters.copy() test_kwargs.update(project_id="test_id", dialect="standart") test_kwargs.pop("query", None) with mock.patch( "pandas.read_gbq", return_value=pandas.DataFrame([]) ) as read_gbq: pd.read_gbq(*test_args, **test_kwargs) read_gbq.assert_called_once_with(*test_args, **test_kwargs) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestStata: def test_read_stata(self, make_stata_file): eval_io( fn_name="read_stata", # read_stata kwargs filepath_or_buffer=make_stata_file(), ) def test_to_stata(self, tmp_path): modin_df, pandas_df = create_test_dfs(TEST_DATA) eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_stata", extension="stata", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestSas: def test_read_sas(self): eval_io( fn_name="read_sas", # read_sas kwargs filepath_or_buffer="modin/tests/pandas/data/airline.sas7bdat", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestFeather: def test_read_feather(self, make_feather_file): eval_io( fn_name="read_feather", # read_feather kwargs path=make_feather_file(), ) @pytest.mark.parametrize( "dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"] ) def test_read_feather_dtype_backend(self, make_feather_file, dtype_backend): def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) eval_io( fn_name="read_feather", # read_feather kwargs path=make_feather_file(), dtype_backend=dtype_backend, comparator=comparator, ) @pytest.mark.parametrize( "storage_options_extra", [{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}], ) def test_read_feather_s3( self, s3_resource, s3_storage_options, storage_options_extra ): expected_exception = None if "anon" in storage_options_extra: expected_exception = PermissionError("Forbidden") eval_io( fn_name="read_feather", 
path="s3://modin-test/modin-bugs/test_data.feather", storage_options=s3_storage_options | storage_options_extra, expected_exception=expected_exception, ) def test_read_feather_path_object(self, make_feather_file): eval_io( fn_name="read_feather", path=Path(make_feather_file()), ) def test_to_feather(self, tmp_path): modin_df, pandas_df = create_test_dfs(TEST_DATA) eval_to_file( tmp_path, modin_obj=modin_df, pandas_obj=pandas_df, fn="to_feather", extension="feather", ) def test_read_feather_with_index_metadata(self, tmp_path): # see: https://github.com/modin-project/modin/issues/6212 df = pandas.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2]) assert not isinstance(df.index, pandas.RangeIndex) path = get_unique_filename(extension=".feather", data_dir=tmp_path) df.to_feather(path) eval_io( fn_name="read_feather", path=path, ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestClipboard: @pytest.mark.skip(reason="No clipboard in CI") def test_read_clipboard(self): setup_clipboard() eval_io(fn_name="read_clipboard") @pytest.mark.skip(reason="No clipboard in CI") def test_to_clipboard(self): modin_df, pandas_df = create_test_dfs(TEST_DATA) modin_df.to_clipboard() modin_as_clip = pandas.read_clipboard() pandas_df.to_clipboard() pandas_as_clip = pandas.read_clipboard() assert modin_as_clip.equals(pandas_as_clip) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestPickle: def test_read_pickle(self, make_pickle_file): eval_io( fn_name="read_pickle", # read_pickle kwargs filepath_or_buffer=make_pickle_file(), ) def test_to_pickle(self, tmp_path): modin_df, _ = create_test_dfs(TEST_DATA) unique_filename_modin = get_unique_filename(extension="pkl", data_dir=tmp_path) modin_df.to_pickle(unique_filename_modin) recreated_modin_df = pd.read_pickle(unique_filename_modin) df_equals(modin_df, recreated_modin_df) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestXml: @pytest.mark.skipif( platform.system() == "Windows", 
reason="https://github.com/modin-project/modin/issues/7497", ) def test_read_xml(self): # example from pandas data = """ square 360 4.0 circle 360 triangle 180 3.0 """ eval_io("read_xml", path_or_buffer=data) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestOrc: # It's not easy to add infrastructure for `orc` format. # In case of defaulting to pandas, it's enough # to check that the parameters are passed to pandas. def test_read_orc(self): test_args = ("fake_path",) test_kwargs = dict( columns=["A"], dtype_backend=lib.no_default, filesystem=None, fake_kwarg="some_pyarrow_parameter", ) with mock.patch( "pandas.read_orc", return_value=pandas.DataFrame([]) ) as read_orc: pd.read_orc(*test_args, **test_kwargs) read_orc.assert_called_once_with(*test_args, **test_kwargs) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) class TestSpss: # It's not easy to add infrastructure for `spss` format. # In case of defaulting to pandas, it's enough # to check that the parameters are passed to pandas. 
def test_read_spss(self): test_args = ("fake_path",) test_kwargs = dict( usecols=["A"], convert_categoricals=False, dtype_backend=lib.no_default ) with mock.patch( "pandas.read_spss", return_value=pandas.DataFrame([]) ) as read_spss: pd.read_spss(*test_args, **test_kwargs) read_spss.assert_called_once_with(*test_args, **test_kwargs) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_json_normalize(): # example from pandas data = [ {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, {"name": {"given": "Mark", "family": "Regner"}}, {"id": 2, "name": "Faye Raker"}, ] eval_io("json_normalize", data=data) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_from_arrow(): _, pandas_df = create_test_dfs(TEST_DATA) modin_df = from_arrow(pa.Table.from_pandas(pandas_df)) df_equals(modin_df, pandas_df) @pytest.mark.skipif( condition=Engine.get() != "Ray", reason="Distributed 'from_pandas' is only available for Ray engine", ) @pytest.mark.parametrize("modify_config", [{AsyncReadMode: True}], indirect=True) def test_distributed_from_pandas(modify_config): pandas_df = pandas.DataFrame({f"col{i}": np.arange(200_000) for i in range(64)}) modin_df = pd.DataFrame(pandas_df) df_equals(modin_df, pandas_df) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_from_spmatrix(): data = sparse.eye(3) with pytest.warns(UserWarning, match="defaulting to pandas.*"): modin_df = pd.DataFrame.sparse.from_spmatrix(data) pandas_df = pandas.DataFrame.sparse.from_spmatrix(data) df_equals(modin_df, pandas_df) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_dense(): data = {"col1": pandas.arrays.SparseArray([0, 1, 0])} modin_df, pandas_df = create_test_dfs(data) df_equals(modin_df.sparse.to_dense(), pandas_df.sparse.to_dense()) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_dict_dataframe(): modin_df, _ = create_test_dfs(TEST_DATA) assert modin_df.to_dict() == to_pandas(modin_df).to_dict() 
@pytest.mark.parametrize( "kwargs", [ pytest.param({}, id="no_kwargs"), pytest.param({"into": dict}, id="into_dict"), pytest.param({"into": defaultdict(list)}, id="into_defaultdict"), ], ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_dict_series(kwargs): eval_general( *[df.iloc[:, 0] for df in create_test_dfs(utils_test_data["int_data"])], lambda df: df.to_dict(**kwargs), ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_latex(): modin_df, _ = create_test_dfs(TEST_DATA) assert modin_df.to_latex() == to_pandas(modin_df).to_latex() @pytest.mark.filterwarnings(default_to_pandas_ignore_string) @pytest.mark.skipif( platform.system() == "Windows", reason="https://github.com/modin-project/modin/issues/7497", ) def test_to_xml(): # `lxml` is a required dependency for `to_xml`, but optional for Modin. # For some engines we do not install it. pytest.importorskip("lxml") modin_df, _ = create_test_dfs(TEST_DATA) assert modin_df.to_xml() == to_pandas(modin_df).to_xml() @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_to_period(): index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index) df_equals(modin_df.to_period(), pandas_df.to_period()) @pytest.mark.xfail( Engine.get() == "Ray" and version.parse(ray.__version__) <= version.parse("2.9.3"), reason="Ray-2.9.3 has a problem using pandas 2.2.0. 
It will be resolved in the next release of Ray.", ) @pytest.mark.skipif( condition=Engine.get() != "Ray", reason="Modin Dataframe can only be converted to a Ray Dataset if Modin uses a Ray engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_df_to_ray(): index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index) ray_dataset = modin_df.modin.to_ray() df_equals(ray_dataset.to_pandas(), pandas_df) @pytest.mark.xfail( Engine.get() == "Ray" and version.parse(ray.__version__) <= version.parse("2.9.3"), reason="Ray-2.9.3 has a problem using pandas 2.2.0. It will be resolved in the next release of Ray.", ) @pytest.mark.skipif( condition=Engine.get() != "Ray", reason="Modin Dataframe can only be converted to a Ray Dataset if Modin uses a Ray engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_series_to_ray(): index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) # A Pandas DataFrame with column names of non-str types is not supported by Ray Dataset. index = [str(x) for x in index] pandas_df = pandas.DataFrame(TEST_DATA, index=index) pandas_s = pandas_df.iloc[0] modin_s = pd.Series(pandas_s) ray_dataset = modin_s.modin.to_ray() df_equals(ray_dataset.to_pandas().squeeze(), pandas_s) @pytest.mark.xfail( Engine.get() == "Ray" and version.parse(ray.__version__) <= version.parse("2.9.3"), reason="Ray-2.9.3 has a problem using pandas 2.2.0. 
It will be resolved in the next release of Ray.", ) @pytest.mark.skipif( condition=Engine.get() != "Ray", reason="Ray Dataset can only be converted to a Modin Dataframe if Modin uses a Ray engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_from_ray(): index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index) ray_df = ray.data.from_pandas(pandas_df) result_df = from_ray(ray_df) df_equals(result_df, modin_df) @pytest.mark.skipif( condition=Engine.get() != "Dask", reason="Modin DataFrame can only be converted to a Dask DataFrame if Modin uses a Dask engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_df_to_dask(): index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index) dask_df = modin_df.modin.to_dask() df_equals(dask_df.compute(), pandas_df) @pytest.mark.skipif( condition=Engine.get() != "Dask", reason="Modin DataFrame can only be converted to a Dask DataFrame if Modin uses a Dask engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_series_to_dask(): modin_s, pandas_s = create_test_series(TEST_DATA["col1"]) dask_series = modin_s.modin.to_dask() df_equals(dask_series.compute(), pandas_s) @pytest.mark.skipif( condition=Engine.get() != "Dask", reason="Dask DataFrame can only be converted to a Modin DataFrame if Modin uses a Dask engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_from_dask(): import dask.dataframe as dd index = pandas.DatetimeIndex( pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"])) ) modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index) dask_df = dd.from_pandas(pandas_df, npartitions=NPartitions.get()) result_df = from_dask(dask_df) df_equals(result_df, modin_df) @pytest.mark.skipif( 
condition=Engine.get() not in ("Ray", "Dask", "Unidist"), reason="Modin DataFrame can only be created from map if Modin uses Ray, Dask or MPI engine.", ) @pytest.mark.filterwarnings(default_to_pandas_ignore_string) def test_from_map(): factor = 3 data = [1] * factor + [2] * factor + [3] * factor expected_df = pd.DataFrame(data, index=[0, 1, 2] * factor) def map_func(x, factor): return [x] * factor result_df = from_map(map_func, [1, 2, 3], 3) df_equals(result_df, expected_df) ================================================ FILE: modin/tests/pandas/test_repartition.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import re import numpy as np import pytest import modin.pandas as pd from modin.config import context from modin.core.storage_formats.pandas.native_query_compiler import ( _NO_REPARTITION_ON_NATIVE_EXECUTION_EXCEPTION_MESSAGE, ) from modin.tests.test_utils import current_execution_is_native from modin.utils import get_current_execution @pytest.fixture(autouse=True) def set_npartitions(): with context(NPartitions=4): yield @pytest.mark.skipif( current_execution_is_native(), reason="Native execution does not have partitions." 
) @pytest.mark.skipif( get_current_execution() == "BaseOnPython", reason="BaseOnPython chooses partition numbers differently", ) @pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize("dtype", ["DataFrame", "Series"]) def test_repartition(axis, dtype): if axis in (1, None) and dtype == "Series": # no sense for Series return df = pd.DataFrame({"col1": [1, 2], "col2": [5, 6]}) df2 = pd.DataFrame({"col3": [9, 4]}) df = pd.concat([df, df2], axis=1) df = pd.concat([df, df], axis=0) obj = df if dtype == "DataFrame" else df["col1"] source_shapes = { "DataFrame": (2, 2), "Series": (2, 1), } # check that the test makes sense assert obj._query_compiler._modin_frame._partitions.shape == source_shapes[dtype] kwargs = {"axis": axis} if dtype == "DataFrame" else {} obj = obj._repartition(**kwargs) if dtype == "DataFrame": results = { None: (1, 1), 0: (1, 2), 1: (2, 1), } else: results = { None: (1, 1), 0: (1, 1), 1: (2, 1), } assert obj._query_compiler._modin_frame._partitions.shape == results[axis] @pytest.mark.skipif( current_execution_is_native(), reason="Native execution does not have partitions." ) def test_repartition_7170(): with context(MinColumnPartitionSize=102, NPartitions=5): df = pd.DataFrame(np.random.rand(10000, 100)) _ = df._repartition(axis=1).to_numpy() @pytest.mark.skipif( not current_execution_is_native(), reason="This is a native execution test." ) def test_repartition_not_valid_on_native_execution(): df = pd.DataFrame() with pytest.raises( Exception, match=re.escape(_NO_REPARTITION_ON_NATIVE_EXECUTION_EXCEPTION_MESSAGE), ): df._repartition() ================================================ FILE: modin/tests/pandas/test_reshape.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. 
The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import contextlib import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import StorageFormat from modin.tests.test_utils import ( current_execution_is_native, df_or_series_using_native_execution, warns_that_defaulting_to_pandas_if, ) from .utils import df_equals, test_data_values def test_get_dummies(): s = pd.Series(list("abca")) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(s) s1 = ["a", "b", np.nan] with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(s1) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(s1, dummy_na=True) data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]} modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = pd.get_dummies(modin_df, prefix=["col1", "col2"]) pandas_result = pandas.get_dummies(pandas_df, prefix=["col1", "col2"]) df_equals(modin_result, pandas_result) assert modin_result._to_pandas().columns.equals(pandas_result.columns) assert modin_result.shape == pandas_result.shape modin_result = pd.get_dummies(pd.DataFrame(pd.Series(list("abcdeabac")))) pandas_result = pandas.get_dummies( pandas.DataFrame(pandas.Series(list("abcdeabac"))) ) df_equals(modin_result, pandas_result) assert modin_result._to_pandas().columns.equals(pandas_result.columns) assert modin_result.shape 
== pandas_result.shape with pytest.raises(NotImplementedError): pd.get_dummies(modin_df, prefix=["col1", "col2"], sparse=True) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(pd.Series(list("abcaa"))) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(pd.Series(list("abc")), dtype=float) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): pd.get_dummies(1) # test from #5184 pandas_df = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["7", "8", "9"]}) modin_df = pd.DataFrame(pandas_df) pandas_result = pandas.get_dummies(pandas_df, columns=["a", "b"]) modin_result = pd.get_dummies(modin_df, columns=["a", "b"]) df_equals(modin_result, pandas_result) def test_melt(): data = test_data_values[0] with ( pytest.warns( UserWarning, match=r"`melt` implementation has mismatches with pandas" ) if StorageFormat.get() == "Pandas" else contextlib.nullcontext() ): pd.melt(pd.DataFrame(data)) def test_crosstab(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], dtype=object, ) b = np.array( ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], dtype=object, ) c = np.array( [ "dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny", ], dtype=object, ) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) assert isinstance(df, pd.DataFrame) foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.crosstab(foo, bar) assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas_if(not 
current_execution_is_native()): df = pd.crosstab(foo, bar, dropna=False) assert isinstance(df, pd.DataFrame) def test_lreshape(): data = pd.DataFrame( { "hr1": [514, 573], "hr2": [545, 526], "team": ["Red Sox", "Yankees"], "year1": [2007, 2008], "year2": [2008, 2008], } ) with warns_that_defaulting_to_pandas_if(not current_execution_is_native()): df = pd.lreshape(data, {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]}) assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.lreshape(data.to_numpy(), {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]}) def test_wide_to_long(): data = pd.DataFrame( { "hr1": [514, 573], "hr2": [545, 526], "team": ["Red Sox", "Yankees"], "year1": [2007, 2008], "year2": [2008, 2008], } ) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(data) ): df = pd.wide_to_long(data, ["hr", "year"], "team", "index") assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.wide_to_long(data.to_numpy(), ["hr", "year"], "team", "index") ================================================ FILE: modin/tests/pandas/test_rolling.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
import numpy as np
import pandas
import pandas._libs.lib as lib
import pytest

import modin.pandas as pd
from modin.config import NPartitions

from .utils import (
    create_test_dfs,
    create_test_series,
    default_to_pandas_ignore_string,
    df_equals,
    eval_general,
    test_data_keys,
    test_data_values,
)

NPartitions.put(4)

# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, but some test modules, like this one,
# have too many such instances.
# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances
# of defaulting to pandas.
pytestmark = [
    pytest.mark.filterwarnings(default_to_pandas_ignore_string),
    # TO MAKE SURE ALL FUTUREWARNINGS ARE CONSIDERED
    pytest.mark.filterwarnings("error::FutureWarning"),
    # ... except for this expected Ray warning due to https://github.com/ray-project/ray/issues/54868
    pytest.mark.filterwarnings(
        "ignore:.*In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None:FutureWarning"
    ),
    # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT
    pytest.mark.filterwarnings(
        "ignore:Support for axis=1 in DataFrame.rolling is deprecated:FutureWarning"
    ),
    # FIXME: these cases inconsistent between modin and pandas
    pytest.mark.filterwarnings(
        "ignore:.*In a future version of pandas, the provided callable will be used directly.*:FutureWarning"
    ),
]


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
@pytest.mark.parametrize("axis", [lib.no_default, 1])
@pytest.mark.parametrize(
    "method, kwargs",
    [
        ("count", {}),
        ("sum", {}),
        ("mean", {}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 0}),
        ("min", {}),
        ("max", {}),
        ("skew", {}),
        ("kurt", {}),
        ("apply", {"func": np.sum}),
        ("rank", {}),
        ("sem", {"ddof": 0}),
        ("quantile", {"q": 0.1}),
        ("median", {}),
    ],
)
def test_dataframe_rolling(data, window, min_periods, axis, method, kwargs):
    """Run each centered ``DataFrame.rolling(...).<method>(**kwargs)`` on modin and pandas via ``eval_general``."""
    # Testing of Rolling class
    modin_df, pandas_df = create_test_dfs(data)
    # Clamp the window to the number of rows of the test frame.
    if window > len(pandas_df):
        window = len(pandas_df)
    eval_general(
        modin_df,
        pandas_df,
        # `method` is looked up by name on the Rolling object and called with `kwargs`.
        lambda df: getattr(
            df.rolling(
                window=window,
                min_periods=min_periods,
                win_type=None,
                center=True,
                axis=axis,
            ),
            method,
        )(**kwargs),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
@pytest.mark.parametrize("axis", [lib.no_default, 1])
def test_dataframe_agg(data, window, min_periods, axis):
    """Check ``Rolling.aggregate`` with one function and, off axis 1, with a list of functions."""
    modin_df, pandas_df = create_test_dfs(data)
    if window > len(pandas_df):
        window = len(pandas_df)
    modin_rolled = modin_df.rolling(
        window=window, min_periods=min_periods, win_type=None, center=True, axis=axis
    )
    pandas_rolled = pandas_df.rolling(
        window=window, min_periods=min_periods, win_type=None, center=True, axis=axis
    )
    df_equals(pandas_rolled.aggregate(np.sum), modin_rolled.aggregate(np.sum))
    # TODO(https://github.com/modin-project/modin/issues/4260): Once pandas
    # allows us to rolling aggregate a list of functions over axis 1, test
    # that, too.
    if axis != 1:
        df_equals(
            pandas_rolled.aggregate([np.sum, np.mean]),
            modin_rolled.aggregate([np.sum, np.mean]),
        )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
@pytest.mark.parametrize("axis", [lib.no_default, 1])
@pytest.mark.parametrize(
    "method, kwargs",
    [
        ("sum", {}),
        ("mean", {}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 0}),
    ],
)
def test_dataframe_window(data, window, min_periods, axis, method, kwargs):
    """Same as ``test_dataframe_rolling`` but with ``win_type="triang"`` and fewer methods."""
    # Testing of Window class
    modin_df, pandas_df = create_test_dfs(data)
    if window > len(pandas_df):
        window = len(pandas_df)
    eval_general(
        modin_df,
        pandas_df,
        lambda df: getattr(
            df.rolling(
                window=window,
                min_periods=min_periods,
                win_type="triang",
                center=True,
                axis=axis,
            ),
            method,
        )(**kwargs),
    )


@pytest.mark.parametrize("axis", [lib.no_default, "columns"])
@pytest.mark.parametrize("on", [None, "DateCol"])
@pytest.mark.parametrize("closed", ["both", "right"])
@pytest.mark.parametrize("window", [3, "3s"])
def test_dataframe_dt_index(axis, on, closed, window):
    """Check rolling over a datetime-indexed frame for integer and "3s" offset windows.

    Integer windows exercise ``corr``/``cov``; the offset window exercises
    ``count``, ``skew``, ``apply``, ``aggregate`` and ``quantile``.
    """
    index = pandas.date_range("31/12/2000", periods=12, freq="min")
    data = {"A": range(12), "B": range(12)}
    pandas_df = pandas.DataFrame(data, index=index)
    modin_df = pd.DataFrame(data, index=index)
    # The `on` column is only added for the default axis with an offset window;
    # every other parameter combination rolls on the index (`on` reset to None).
    if on is not None and axis == lib.no_default and isinstance(window, str):
        pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="min")
        modin_df[on] = pd.date_range("22/06/1941", periods=12, freq="min")
    else:
        on = None
    # For axis="columns" the frames are transposed before rolling.
    if axis == "columns":
        pandas_df = pandas_df.T
        modin_df = modin_df.T
    pandas_rolled = pandas_df.rolling(window=window, on=on, axis=axis, closed=closed)
    modin_rolled = modin_df.rolling(window=window, on=on, axis=axis, closed=closed)
    if isinstance(window, int):
        # These functions are very slow for data from test_rolling
        df_equals(
            modin_rolled.corr(modin_df, True), pandas_rolled.corr(pandas_df, True)
        )
        df_equals(
            modin_rolled.corr(modin_df, False), pandas_rolled.corr(pandas_df, False)
        )
        df_equals(modin_rolled.cov(modin_df, True), pandas_rolled.cov(pandas_df, True))
        df_equals(
            modin_rolled.cov(modin_df, False), pandas_rolled.cov(pandas_df, False)
        )
        # Pairing against a single column is only checked on the default axis.
        if axis == lib.no_default:
            df_equals(
                modin_rolled.cov(modin_df[modin_df.columns[0]], True),
                pandas_rolled.cov(pandas_df[pandas_df.columns[0]], True),
            )
            df_equals(
                modin_rolled.corr(modin_df[modin_df.columns[0]], True),
                pandas_rolled.corr(pandas_df[pandas_df.columns[0]], True),
            )
    else:
        df_equals(modin_rolled.count(), pandas_rolled.count())
        df_equals(modin_rolled.skew(), pandas_rolled.skew())
        df_equals(
            modin_rolled.apply(np.sum, raw=True),
            pandas_rolled.apply(np.sum, raw=True),
        )
        df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
        df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
@pytest.mark.parametrize(
    "method, kwargs",
    [
        ("count", {}),
        ("sum", {}),
        ("mean", {}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 0}),
        ("min", {}),
        ("max", {}),
        ("skew", {}),
        ("kurt", {}),
        ("apply", {"func": np.sum}),
        ("rank", {}),
        ("sem", {"ddof": 0}),
        ("aggregate", {"func": np.sum}),
        ("agg", {"func": [np.sum, np.mean]}),
        ("quantile", {"q": 0.1}),
        ("median", {}),
    ],
)
def test_series_rolling(data, window, min_periods, method, kwargs):
    """Run each centered ``Series.rolling(...).<method>(**kwargs)`` on modin and pandas via ``eval_general``."""
    # Test of Rolling class
    modin_series, pandas_series = create_test_series(data)
    # Clamp the window to the length of the test series.
    if window > len(pandas_series):
        window = len(pandas_series)
    eval_general(
        modin_series,
        pandas_series,
        lambda series: getattr(
            series.rolling(
                window=window,
                min_periods=min_periods,
                win_type=None,
                center=True,
            ),
            method,
        )(**kwargs),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
def test_series_corr_cov(data, window, min_periods):
    """Check rolling ``corr`` and ``cov`` of a series against itself."""
    modin_series, pandas_series = create_test_series(data)
    if window > len(pandas_series):
        window = len(pandas_series)
    modin_rolled = modin_series.rolling(
        window=window, min_periods=min_periods, win_type=None, center=True
    )
    pandas_rolled = pandas_series.rolling(
        window=window, min_periods=min_periods, win_type=None, center=True
    )
    df_equals(modin_rolled.corr(modin_series), pandas_rolled.corr(pandas_series))
    df_equals(
        modin_rolled.cov(modin_series, True), pandas_rolled.cov(pandas_series, True)
    )
    df_equals(
        modin_rolled.cov(modin_series, False), pandas_rolled.cov(pandas_series, False)
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("window", [5, 100])
@pytest.mark.parametrize("min_periods", [None, 5])
@pytest.mark.parametrize(
    "method, kwargs",
    [
        ("sum", {}),
        ("mean", {}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 0}),
    ],
)
def test_series_window(data, window, min_periods, method, kwargs):
    """Same as ``test_series_rolling`` but with ``win_type="triang"`` and fewer methods."""
    # Test of Window class
    modin_series, pandas_series = create_test_series(data)
    if window > len(pandas_series):
        window = len(pandas_series)
    eval_general(
        modin_series,
        pandas_series,
        lambda series: getattr(
            series.rolling(
                window=window,
                min_periods=min_periods,
                win_type="triang",
                center=True,
            ),
            method,
        )(**kwargs),
    )


@pytest.mark.parametrize("closed", ["both", "right"])
def test_series_dt_index(closed):
    """Check a "3s" offset window over a datetime-indexed series for several reductions."""
    index = pandas.date_range("1/1/2000", periods=12, freq="min")
    pandas_series = pandas.Series(range(12), index=index)
    modin_series = pd.Series(range(12), index=index)

    pandas_rolled = pandas_series.rolling("3s", closed=closed)
    modin_rolled = modin_series.rolling("3s", closed=closed)
    df_equals(modin_rolled.count(), pandas_rolled.count())
    df_equals(modin_rolled.skew(), pandas_rolled.skew())
    df_equals(
        modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True)
    )
    df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
    df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))


def test_api_indexer():
    """Check rolling ``sum`` when the window is a ``FixedForwardWindowIndexer``."""
    modin_df, pandas_df = create_test_dfs(test_data_values[0])
    # The same indexer object is handed to both the pandas and the modin frame.
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=3)
    pandas_rolled = pandas_df.rolling(window=indexer)
    modin_rolled = modin_df.rolling(window=indexer)
    df_equals(modin_rolled.sum(), pandas_rolled.sum())


def test_issue_3512():
    """Check rolling ``mean`` on a row slice (``[0:33]``) of a 129-row random frame."""
    data = np.random.rand(129)
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    modin_ans = modin_df[0:33].rolling(window=21).mean()
    pandas_ans = pandas_df[0:33].rolling(window=21).mean()

    df_equals(modin_ans, pandas_ans)


### TEST ROLLING WARNINGS ###


def test_rolling_axis_1_depr():
    """Check that ``DataFrame.rolling(axis=1)`` emits the axis=1 deprecation ``FutureWarning``."""
    index = pandas.date_range("31/12/2000", periods=12, freq="min")
    data = {"A": range(12), "B": range(12)}
    modin_df = pd.DataFrame(data, index=index)
    with pytest.warns(
        FutureWarning,
        match="Support for axis=1 in DataFrame.rolling is deprecated",
    ):
        modin_df.rolling(window=3, axis=1)


================================================
FILE: modin/tests/pandas/test_series.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
from __future__ import annotations

import datetime
import itertools
import json
import sys
import unittest.mock as mock
import warnings

import matplotlib
import numpy as np
import pandas
import pandas._libs.lib as lib
import pytest
from numpy.testing import assert_array_equal
from packaging.version import Version
from pandas.core.indexing import IndexingError
from pandas.errors import PerformanceWarning, SpecificationError

import modin.pandas as pd
from modin.config import Engine, NPartitions, StorageFormat
from modin.core.storage_formats.pandas.query_compiler_caster import (
    _assert_casting_functions_wrap_same_implementation,
)
from modin.pandas.io import to_pandas
from modin.tests.test_utils import (
    current_execution_is_native,
    df_or_series_using_native_execution,
    warns_that_defaulting_to_pandas_if,
)
from modin.utils import get_current_execution, try_cast_to_pandas

from .utils import (
    RAND_HIGH,
    RAND_LOW,
    UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS,
    CustomIntegerForAddition,
    NonCommutativeMultiplyInteger,
    agg_func_except_keys,
    agg_func_except_values,
    agg_func_keys,
    agg_func_values,
    arg_keys,
    bool_arg_keys,
    bool_arg_values,
    categories_equals,
    create_test_dfs,
    create_test_series,
    default_to_pandas_ignore_string,
    df_equals,
    df_equals_with_non_stable_indices,
    encoding_types,
    eval_general,
    generate_multiindex,
    int_arg_keys,
    int_arg_values,
    name_contains,
    no_numeric_dfs,
    numeric_dfs,
    quantiles_keys,
    quantiles_values,
    random_state,
    sort_if_range_partitioning,
    string_na_rep_keys,
    string_na_rep_values,
    string_sep_keys,
    string_sep_values,
    test_data,
    test_data_categorical_keys,
    test_data_categorical_values,
    test_data_diff_dtype,
    test_data_keys,
    test_data_large_categorical_series_keys,
    test_data_large_categorical_series_values,
    test_data_small_keys,
    test_data_small_values,
    test_data_values,
    test_data_with_duplicates_keys,
    test_data_with_duplicates_values,
    test_string_data_keys,
    test_string_data_values,
    test_string_list_data_keys,
    test_string_list_data_values,
)

# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, but some test modules, like this one,
# have too many such instances.
# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances
# of defaulting to pandas.
pytestmark = [
    pytest.mark.filterwarnings(default_to_pandas_ignore_string),
    # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT
    pytest.mark.filterwarnings(
        "ignore:.*bool is now deprecated and will be removed:FutureWarning"
    ),
    pytest.mark.filterwarnings(
        "ignore:first is deprecated and will be removed:FutureWarning"
    ),
    pytest.mark.filterwarnings(
        "ignore:last is deprecated and will be removed:FutureWarning"
    ),
]

NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")

# Initialize the environment
pd.DataFrame()


def get_rop(op):
    """Return the reflected dunder name for `op`, or None if `op` is not a dunder name.

    For example ``"__add__"`` becomes ``"__radd__"``, while ``"add"`` yields ``None``.
    """
    if op.startswith("__") and op.endswith("__"):
        # Insert "r" right after the leading double underscore.
        return "__r" + op[2:]
    else:
        return None


def inter_df_math_helper(
    modin_series, pandas_series, op, comparator_kwargs=None, expected_exception=None
):
    """Exercise binary operation `op` and, for dunder names, its reflected variant.

    Parameters
    ----------
    modin_series, pandas_series
        The modin series under test and its pandas counterpart.
    op : str
        Name of the method to look up on both series.
    comparator_kwargs : dict, optional
        Forwarded to the comparison helpers.
    expected_exception : optional
        Forwarded to ``eval_general``.
    """
    inter_df_math_helper_one_side(
        modin_series, pandas_series, op, comparator_kwargs, expected_exception
    )
    rop = get_rop(op)
    # Only dunder names have a reflected form (see get_rop).
    if rop:
        inter_df_math_helper_one_side(
            modin_series, pandas_series, rop, comparator_kwargs, expected_exception
        )


def inter_df_math_helper_one_side(
    modin_series,
    pandas_series,
    op,
    comparator_kwargs=None,
    expected_exception=None,
):
    """Compare method `op` on modin and pandas for several kinds of `other` operand.

    The operands tried, in order, are the int ``4``, the float ``4.0``, the series
    itself, a random integer array, a series built from that array, and finally a
    multi-index copy with ``level=1``. Whenever the pandas call raises, the modin
    call is required to raise the same exception type.
    """
    if comparator_kwargs is None:
        comparator_kwargs = {}

    # If pandas has no such attribute, modin must fail the lookup the same way.
    try:
        pandas_attr = getattr(pandas_series, op)
    except Exception as err:
        with pytest.raises(type(err)):
            _ = getattr(modin_series, op)
        return
    modin_attr = getattr(modin_series, op)

    # Scalar int operand.
    try:
        pandas_result = pandas_attr(4)
    except Exception as err:
        with pytest.raises(type(err)):
            try_cast_to_pandas(modin_attr(4))  # force materialization
    else:
        modin_result = modin_attr(4)
        df_equals(modin_result, pandas_result, **comparator_kwargs)

    # Scalar float operand.
    try:
        pandas_result = pandas_attr(4.0)
    except Exception as err:
        with pytest.raises(type(err)):
            try_cast_to_pandas(modin_attr(4.0))  # force materialization
    else:
        modin_result = modin_attr(4.0)
        df_equals(modin_result, pandas_result, **comparator_kwargs)

    # These operations don't support non-scalar `other` or have a strange behavior in
    # the testing environment
    if op in [
        "__divmod__",
        "divmod",
        "rdivmod",
        "floordiv",
        "__floordiv__",
        "rfloordiv",
        "__rfloordiv__",
        "mod",
        "__mod__",
        "rmod",
        "__rmod__",
    ]:
        return

    # Series operand: each series is passed to its own library's bound method.
    eval_general(
        modin_series,
        pandas_series,
        lambda df: (pandas_attr if isinstance(df, pandas.Series) else modin_attr)(df),
        comparator_kwargs=comparator_kwargs,
        expected_exception=expected_exception,
    )

    # Array operand of the same length as the series.
    list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_series.shape[0]))
    try:
        pandas_result = pandas_attr(list_test)
    except Exception as err:
        with pytest.raises(type(err)):
            try_cast_to_pandas(modin_attr(list_test))  # force materialization
    else:
        modin_result = modin_attr(list_test)
        df_equals(modin_result, pandas_result, **comparator_kwargs)

    # Series operand built from the random array, sharing the original index.
    series_test_modin = pd.Series(list_test, index=modin_series.index)
    series_test_pandas = pandas.Series(list_test, index=pandas_series.index)

    eval_general(
        series_test_modin,
        series_test_pandas,
        lambda df: (pandas_attr if isinstance(df, pandas.Series) else modin_attr)(df),
        comparator_kwargs=comparator_kwargs,
        expected_exception=expected_exception,
    )

    # Level test
    new_idx = pandas.MultiIndex.from_tuples(
        [(i // 4, i // 2, i) for i in modin_series.index]
    )
    modin_df_multi_level = modin_series.copy()
    modin_df_multi_level.index = new_idx

    # When 'level' parameter is passed, modin's implementation must raise a default-to-pandas warning,
    # here we first detect whether 'op' takes 'level' parameter at all and only then perform the warning check
    # reasoning: https://github.com/modin-project/modin/issues/6893
    try:
        getattr(modin_df_multi_level, op)(modin_df_multi_level, level=1)
    except TypeError:
        # Operation doesn't support 'level' parameter
        pass
    else:
        # Operation supports 'level' parameter, so it makes sense to check for a warning
        with warns_that_defaulting_to_pandas_if(
            not df_or_series_using_native_execution(modin_df_multi_level)
        ):
            getattr(modin_df_multi_level, op)(modin_df_multi_level, level=1)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_to_frame(data):
    """Check ``Series.to_frame(name=...)`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.to_frame(name="miao"), pandas_series.to_frame(name="miao"))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_to_list(data):
    """Check ``Series.to_list`` returns the same type and values (NaN-aware) as pandas."""
    modin_series, pandas_series = create_test_series(data)
    pd_res = pandas_series.to_list()
    md_res = modin_series.to_list()
    assert type(pd_res) is type(md_res)
    assert np.array_equal(pd_res, md_res, equal_nan=True)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_to_json(data):
    """Check ``Series.to_json`` returns the same type and value as pandas."""
    modin_series, pandas_series = create_test_series(data)
    pd_res = pandas_series.to_json()
    md_res = modin_series.to_json()
    assert type(pd_res) is type(md_res)
    assert pd_res == md_res


def test_accessing_index_element_as_property():
    """Check that an index label is reachable as an attribute and a missing one raises."""
    s = pd.Series([10, 20, 30], index=["a", "b", "c"])
    assert s.b == 20
    # NOTE(review): the broad `Exception` accepts any failure; an
    # AttributeError is presumably what is meant here - confirm before narrowing.
    with pytest.raises(Exception):
        _ = s.d


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_callable_key_in_getitem(data):
    """Check ``series[callable]`` with a callable that builds a boolean mask from the index."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(
        modin_series[lambda s: s.index % 2 == 0],
        pandas_series[lambda s: s.index % 2 == 0],
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_T(data):
    """Check ``Series.T`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.T, pandas_series.T)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___abs__(data):
    """Check ``Series.__abs__`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.__abs__(), pandas_series.__abs__())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___add__(data):
    """Run ``__add__`` (and its reflected form) through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__add__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___and__(data, request):
    """Run ``__and__`` through ``inter_df_math_helper`` without dtype checking."""
    modin_series, pandas_series = create_test_series(data)
    expected_exception = None
    if "float_nan_data" in request.node.callspec.id:
        # FIXME: https://github.com/modin-project/modin/issues/7037
        expected_exception = False
    inter_df_math_helper(
        modin_series,
        pandas_series,
        "__and__",
        # https://github.com/modin-project/modin/issues/5966
        comparator_kwargs={"check_dtypes": False},
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("copy_kwargs", ({"copy": True}, {"copy": None}, {}))
@pytest.mark.parametrize(
    "get_array, get_array_name",
    (
        (lambda df, copy_kwargs: df.__array__(**copy_kwargs), "__array__"),
        (lambda df, copy_kwargs: np.array(df, **copy_kwargs), "np.array"),
    ),
)
def test___array__(data, copy_kwargs, get_array, get_array_name):
    """Check array conversion via ``__array__`` and ``np.array`` yields equal arrays."""
    if (
        get_array_name == "np.array"
        and Version(np.__version__) < Version("2")
        and "copy" in copy_kwargs
        and copy_kwargs["copy"] is None
    ):
        pytest.skip(reason="np.array does not support copy=None before numpy 2.0")
    # create_test_series yields the (modin, pandas) pair; both are converted and compared.
    assert_array_equal(*(get_array(df, copy_kwargs) for df in create_test_series(data)))


@pytest.mark.xfail(
    raises=AssertionError, reason="https://github.com/modin-project/modin/issues/4650"
)
def test___array__copy_false_creates_view():
    """Expected failure: mutate the result of ``np.array(series, copy=False)`` in place."""

    def do_in_place_update_via_copy(series):
        array = np.array(series, copy=False)
        array[0] += 1

    eval_general(
        *create_test_series([11]), do_in_place_update_via_copy, __inplace__=True
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___bool__(data):
    """Check ``Series.__bool__`` returns, or raises, the same as pandas."""
    modin_series, pandas_series = create_test_series(data)
    try:
        pandas_result = pandas_series.__bool__()
    except Exception as err:
        # Modin must raise the same exception type as pandas.
        with pytest.raises(type(err)):
            modin_series.__bool__()
    else:
        modin_result = modin_series.__bool__()
        df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___contains__(request, data):
    """Check ``in`` / ``__contains__`` for a missing key and, if non-empty, an existing key."""
    modin_series, pandas_series = create_test_series(data)

    result = False
    key = "Not Exist"
    assert result == modin_series.__contains__(key)
    assert result == (key in modin_series)

    # An existing key can only be taken from a series that has at least one entry.
    if "empty_data" not in request.node.name:
        result = True
        key = pandas_series.keys()[0]
        assert result == modin_series.__contains__(key)
        assert result == (key in modin_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___copy__(data):
    """Check that a copy equals both the original modin series and the pandas one."""
    # NOTE(review): despite the test name, this calls `.copy()` rather than
    # `__copy__()` - confirm whether that is intentional.
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.copy(), modin_series)
    df_equals(modin_series.copy(), pandas_series.copy())
    df_equals(modin_series.copy(), pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___deepcopy__(data):
    """Check that ``__deepcopy__`` equals both the original modin series and the pandas one."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.__deepcopy__(), modin_series)
    df_equals(modin_series.__deepcopy__(), pandas_series.__deepcopy__())
    df_equals(modin_series.__deepcopy__(), pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___delitem__(data):
    """Delete the first, then the last, then the new first label, comparing after each step."""
    modin_series, pandas_series = create_test_series(data)
    del modin_series[modin_series.index[0]]
    del pandas_series[pandas_series.index[0]]
    df_equals(modin_series, pandas_series)

    del modin_series[modin_series.index[-1]]
    del pandas_series[pandas_series.index[-1]]
    df_equals(modin_series, pandas_series)

    del modin_series[modin_series.index[0]]
    del pandas_series[pandas_series.index[0]]
    df_equals(modin_series, pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_divmod(data):
    """Run ``divmod`` through ``inter_df_math_helper`` (scalar operands only, see helper)."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "divmod")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rdivmod(data):
    """Run ``rdivmod`` through ``inter_df_math_helper`` (scalar operands only, see helper)."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "rdivmod")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___eq__(data):
    """Run ``__eq__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__eq__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___floordiv__(data):
    """Run ``__floordiv__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__floordiv__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___ge__(data):
    """Run ``__ge__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__ge__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___getitem__(data):
    """Check ``Series.__getitem__`` for scalar keys, slices, masks, bad keys and an empty series."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series[0], pandas_series[0])
    df_equals(
        modin_series[modin_series.index[-1]], pandas_series[pandas_series.index[-1]]
    )
    # From here on a fixed 1000-element integer series replaces the parametrized data.
    modin_series = pd.Series(list(range(1000)))
    pandas_series = pandas.Series(list(range(1000)))
    df_equals(modin_series[:30], pandas_series[:30])
    df_equals(modin_series[modin_series > 500], pandas_series[pandas_series > 500])
    df_equals(modin_series[::2], pandas_series[::2])

    # Test getting an invalid string key
    # FIXME: https://github.com/modin-project/modin/issues/7038
    eval_general(
        modin_series, pandas_series, lambda s: s["a"], expected_exception=False
    )
    eval_general(
        modin_series, pandas_series, lambda s: s[["a"]], expected_exception=False
    )

    # Test empty series
    df_equals(pd.Series([])[:30], pandas.Series([])[:30])


def test___getitem__1383():
    """Check a slice that runs past the end of the series (regression for #1383)."""
    # see #1383 for more details
    data = ["", "a", "b", "c", "a"]
    modin_series = pd.Series(data)
    pandas_series = pandas.Series(data)
    df_equals(modin_series[3:7], pandas_series[3:7])


@pytest.mark.parametrize("start", [-7, -5, -3, 0, None, 3, 5, 7])
@pytest.mark.parametrize("stop", [-7, -5, -3, 0, None, 3, 5, 7])
def test___getitem_edge_cases(start, stop):
    """Check slicing a 5-element series with negative, ``None`` and out-of-range bounds."""
    data = ["", "a", "b", "c", "a"]
    modin_series = pd.Series(data)
    pandas_series = pandas.Series(data)
    df_equals(modin_series[start:stop], pandas_series[start:stop])


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___gt__(data):
    """Run ``__gt__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__gt__")


@pytest.mark.parametrize("count_elements", [0, 1, 10])
def test___int__(count_elements):
    """Check ``int(series)``; a ``TypeError`` is expected unless the series has exactly one element."""
    expected_exception = None
    if count_elements != 1:
        # NOTE(review): this message ends right after "to " and looks truncated
        # (a type name may be missing) - confirm against the pandas message.
        expected_exception = TypeError("cannot convert the series to ")
    eval_general(
        *create_test_series([1.5] * count_elements),
        int,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("count_elements", [0, 1, 10])
def test___float__(count_elements):
    """Check ``float(series)``; a ``TypeError`` is expected unless the series has exactly one element."""
    expected_exception = None
    if count_elements != 1:
        # NOTE(review): this message ends right after "to " and looks truncated
        # (a type name may be missing) - confirm against the pandas message.
        expected_exception = TypeError("cannot convert the series to ")
    eval_general(
        *create_test_series([1] * count_elements),
        float,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___invert__(data, request):
    """Check ``Series.__invert__`` via ``eval_general``."""
    modin_series, pandas_series = create_test_series(data)
    expected_exception = None
    if "float_nan_data" in request.node.callspec.id:
        # FIXME: https://github.com/modin-project/modin/issues/7081
        expected_exception = False
    eval_general(
        modin_series,
        pandas_series,
        lambda ser: ser.__invert__(),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___iter__(data):
    """Check that iterating the modin series yields the same elements as pandas, pairwise."""
    modin_series, pandas_series = create_test_series(data)
    for m, p in zip(modin_series.__iter__(), pandas_series.__iter__()):
        np.testing.assert_equal(m, p)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___le__(data):
    """Run ``__le__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__le__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
    """Check ``len(series)`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert len(modin_series) == len(pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___long__(data):
    """Check ``__long__`` on the first element returns, or raises, the same as pandas."""
    # NOTE(review): `__long__` is looked up on the scalar element, not on the
    # series; this likely only compares exception types - confirm.
    modin_series, pandas_series = create_test_series(data)
    try:
        pandas_result = pandas_series[0].__long__()
    except Exception as err:
        with pytest.raises(type(err)):
            modin_series[0].__long__()
    else:
        assert modin_series[0].__long__() == pandas_result


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___lt__(data):
    """Run ``__lt__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__lt__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___mod__(data):
    """Run ``__mod__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__mod__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___mul__(data):
    """Run ``__mul__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__mul__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___ne__(data):
    """Run ``__ne__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__ne__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___neg__(data):
    """Check ``Series.__neg__`` via ``eval_general``."""
    modin_series, pandas_series = create_test_series(data)
    eval_general(modin_series, pandas_series, lambda ser: ser.__neg__())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___or__(data, request):
    """Run ``__or__`` through ``inter_df_math_helper`` without dtype checking."""
    modin_series, pandas_series = create_test_series(data)
    expected_exception = None
    if "float_nan_data" in request.node.callspec.id:
        # FIXME: https://github.com/modin-project/modin/issues/7081
        expected_exception = False
    inter_df_math_helper(
        modin_series,
        pandas_series,
        "__or__",
        # https://github.com/modin-project/modin/issues/5966
        comparator_kwargs={"check_dtypes": False},
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___pow__(data):
    """Run ``__pow__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__pow__")


@pytest.mark.parametrize("name", ["Dates", None])
@pytest.mark.parametrize(
    "dt_index", [True, False], ids=["dt_index_true", "dt_index_false"]
)
@pytest.mark.parametrize(
    "data",
    [*test_data_values, "empty"],
    ids=[*test_data_keys, "empty"],
)
def test___repr__(name, dt_index, data):
    """Check ``repr(series)`` equals pandas for named/unnamed, datetime-indexed and empty series."""
    # The string "empty" is a sentinel for a series built with no data at all.
    if data == "empty":
        modin_series, pandas_series = pd.Series(), pandas.Series()
    else:
        modin_series, pandas_series = create_test_series(data)
    pandas_series.name = modin_series.name = name
    if dt_index:
        index = pandas.date_range(
            "1/1/2000", periods=len(pandas_series.index), freq="min"
        )
        pandas_series.index = modin_series.index = index

    assert repr(modin_series) == repr(pandas_series)


def test___repr__4186():
    """Check ``repr`` of a categorical series (test name references issue 4186)."""
    modin_series, pandas_series = create_test_series(
        ["a", "b", "c", "a"], dtype="category"
    )
    assert repr(modin_series) == repr(pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.exclude_in_sanity
def test___round__(data):
    """Check ``round(series)`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(round(modin_series), round(pandas_series))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.exclude_in_sanity
def test___setitem__(data):
    """Assign 0 to every key in turn, comparing against pandas after each assignment."""
    modin_series, pandas_series = create_test_series(data)
    for key in modin_series.keys():
        modin_series[key] = 0
        pandas_series[key] = 0
        df_equals(modin_series, pandas_series)


@pytest.mark.parametrize(
    "key",
    [
        pytest.param(lambda idx: slice(1, 3), id="location_based_slice"),
        pytest.param(lambda idx: slice(idx[1], idx[-1]), id="index_based_slice"),
        pytest.param(lambda idx: [idx[0], idx[2], idx[-1]], id="list_of_labels"),
        pytest.param(
            lambda idx: [True if i % 2 else False for i in range(len(idx))],
            id="boolean_mask",
        ),
    ],
)
@pytest.mark.parametrize(
    "index",
    [
        pytest.param(
            lambda idx_len: [chr(x) for x in range(ord("a"), ord("a") + idx_len)],
            id="str_index",
        ),
        pytest.param(lambda idx_len: list(range(1, idx_len + 1)), id="int_index"),
    ],
)
def test___setitem___non_hashable(key, index):
    """Check ``series[key] = 10`` for slice, label-list and boolean-mask keys."""
    data = np.arange(5)
    # `index` and `key` arrive as factories: build the index from the data
    # length, then build the key from that index.
    index = index(len(data))
    key = key(index)
    md_sr, pd_sr = create_test_series(data, index=index)
    md_sr[key] = 10
    pd_sr[key] = 10
    df_equals(md_sr, pd_sr)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___sizeof__(data):
    """Call ``Series.__sizeof__``, expecting a default-to-pandas warning on non-native execution."""
    modin_series, pandas_series = create_test_series(data)
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(modin_series)
    ):
        modin_series.__sizeof__()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___str__(data):
    """Check ``str(series)`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert str(modin_series) == str(pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___sub__(data):
    """Run ``__sub__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__sub__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___truediv__(data):
    """Run ``__truediv__`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "__truediv__")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___xor__(data, request):
    """Run ``__xor__`` through ``inter_df_math_helper`` without dtype checking."""
    modin_series, pandas_series = create_test_series(data)
    expected_exception = None
    if "float_nan_data" in request.node.callspec.id:
        # FIXME: https://github.com/modin-project/modin/issues/7081
        expected_exception = False
    inter_df_math_helper(
        modin_series,
        pandas_series,
        "__xor__",
        # https://github.com/modin-project/modin/issues/5966
        comparator_kwargs={"check_dtypes": False},
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_abs(data):
    """Check ``Series.abs`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.abs(), pandas_series.abs())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_add(data):
    """Run ``add`` through ``inter_df_math_helper``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "add")


def test_add_does_not_change_original_series_name():
    """Check that ``s1 + s2`` leaves both operands equal to deep copies taken beforehand."""
    # See https://github.com/modin-project/modin/issues/5232
    s1 = pd.Series(1, name=1)
    s2 = pd.Series(2, name=2)
    original_s1 = s1.copy(deep=True)
    original_s2 = s2.copy(deep=True)
    _ = s1 + s2
    df_equals(s1, original_s1)
    df_equals(s2, original_s2)


@pytest.mark.parametrize("axis", [None, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_add_prefix(data, axis):
    """Check ``Series.add_prefix``; ``axis=1`` is expected to raise ``ValueError``."""
    expected_exception = None
    # Of the parametrized values only axis=1 is truthy.
    if axis:
        expected_exception = ValueError("No axis named 1 for object type Series")
    eval_general(
        *create_test_series(data),
        lambda df: df.add_prefix("PREFIX_ADD_", axis=axis),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("axis", [None, 0, 1])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_add_suffix(data, axis):
    """Check ``Series.add_suffix``; ``axis=1`` is expected to raise ``ValueError``."""
    expected_exception = None
    # Of the parametrized values only axis=1 is truthy.
    if axis:
        expected_exception = ValueError("No axis named 1 for object type Series")
    eval_general(
        *create_test_series(data),
        lambda df: df.add_suffix("SUFFIX_ADD_", axis=axis),
        expected_exception=expected_exception,
    )


def test_add_custom_class():
    """Check ``series + CustomIntegerForAddition(4)`` via ``eval_general``."""
    # see https://github.com/modin-project/modin/issues/5236
    # Test that we can add any object that is addable to pandas object data
    # via "+".
    eval_general(
        *create_test_series(test_data["int_data"]),
        lambda df: df + CustomIntegerForAddition(4),
    )


def test_aggregate_alias():
    """Assert that ``Series.aggregate`` and ``Series.agg`` wrap the same implementation."""
    # This check is an optimization: if it fails, Series.agg should be tested explicitly
    _assert_casting_functions_wrap_same_implementation(
        pd.Series.aggregate, pd.Series.agg
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
def test_aggregate(data, func, request):
    """Check ``Series.aggregate(func)`` via ``eval_general`` for each aggregation function."""
    expected_exception = None
    if "should raise AssertionError" in request.node.callspec.id:
        # FIXME: https://github.com/modin-project/modin/issues/7031
        expected_exception = False
    eval_general(
        *create_test_series(data),
        lambda df: df.aggregate(func),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys)
def test_aggregate_except(data, func):
    """Check that the ``agg_func_except_values`` functions raise ``SpecificationError``."""
    # SpecificationError is raised because we treat a Series as a DataFrame.
    # See details in pandas issues 36036.
    with pytest.raises(SpecificationError):
        eval_general(
            *create_test_series(data),
            lambda df: df.aggregate(func),
        )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_aggregate_error_checking(data):
    """Check ``aggregate`` with an attribute name, a valid method name and an unknown name."""
    modin_series, pandas_series = create_test_series(data)

    # "ndim" names an attribute rather than a reduction; both libraries return 1.
    assert pandas_series.aggregate("ndim") == 1
    assert modin_series.aggregate("ndim") == 1

    eval_general(
        modin_series,
        pandas_series,
        lambda series: series.aggregate("cumprod"),
    )
    eval_general(
        modin_series,
        pandas_series,
        lambda series: series.aggregate("NOT_EXISTS"),
        expected_exception=AttributeError(
            "'NOT_EXISTS' is not a valid function for 'Series' object"
        ),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_align(data):
    """Call ``Series.align`` on itself, expecting a default-to-pandas warning on non-native execution."""
    modin_series, _ = create_test_series(data)  # noqa: F841
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(modin_series)
    ):
        modin_series.align(modin_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [False, True]) def test_all(data, skipna):
eval_general(*create_test_series(data), lambda df: df.all(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [False, True]) def test_any(data, skipna): eval_general(*create_test_series(data), lambda df: df.any(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_append(data): modin_series, pandas_series = create_test_series(data) data_to_append = {"append_a": 2, "append_b": 1000} ignore_idx_values = [True, False] for ignore in ignore_idx_values: try: pandas_result = pandas_series.append(data_to_append, ignore_index=ignore) except Exception as err: with pytest.raises(type(err)): modin_series.append(data_to_append, ignore_index=ignore) else: modin_result = modin_series.append(data_to_append, ignore_index=ignore) df_equals(modin_result, pandas_result) try: pandas_result = pandas_series.append(pandas_series.iloc[-1]) except Exception as err: with pytest.raises(type(err)): modin_series.append(modin_series.iloc[-1]) else: modin_result = modin_series.append(modin_series.iloc[-1]) df_equals(modin_result, pandas_result) try: pandas_result = pandas_series.append([pandas_series.iloc[-1]]) except Exception as err: with pytest.raises(type(err)): modin_series.append([modin_series.iloc[-1]]) else: modin_result = modin_series.append([modin_series.iloc[-1]]) df_equals(modin_result, pandas_result) verify_integrity_values = [True, False] for verify_integrity in verify_integrity_values: try: pandas_result = pandas_series.append( [pandas_series, pandas_series], verify_integrity=verify_integrity ) except Exception as err: with pytest.raises(type(err)): modin_series.append( [modin_series, modin_series], verify_integrity=verify_integrity ) else: modin_result = modin_series.append( [modin_series, modin_series], verify_integrity=verify_integrity ) df_equals(modin_result, pandas_result) try: pandas_result = pandas_series.append( pandas_series, verify_integrity=verify_integrity ) 
except Exception as err: with pytest.raises(type(err)): modin_series.append(modin_series, verify_integrity=verify_integrity) else: modin_result = modin_series.append( modin_series, verify_integrity=verify_integrity ) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply(data, func, request): expected_exception = None if "should raise AssertionError" in request.node.callspec.id: # FIXME: https://github.com/modin-project/modin/issues/7031 expected_exception = False elif "df sum" in request.node.callspec.id: _type = "int" if "int_data" in request.node.callspec.id else "float" expected_exception = AttributeError(f"'{_type}' object has no attribute 'sum'") eval_general( *create_test_series(data), lambda df: df.apply(func), expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys) def test_apply_except(data, func): eval_general( *create_test_series(data), lambda df: df.apply(func), expected_exception=pandas.errors.SpecificationError( "Function names must be unique if there is no new column names assigned" ), ) def test_apply_external_lib(): json_string = """ { "researcher": { "name": "Ford Prefect", "species": "Betelgeusian", "relatives": [ { "name": "Zaphod Beeblebrox", "species": "Betelgeusian" } ] } } """ modin_result = pd.DataFrame.from_dict({"a": [json_string]}).a.apply(json.loads) pandas_result = pandas.DataFrame.from_dict({"a": [json_string]}).a.apply(json.loads) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("axis", [None, 0, 1]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", ["count", "all", "kurt", "array", "searchsorted"]) def test_apply_text_func(data, func, axis): func_kwargs = {} if func not in ("count", 
"searchsorted"): func_kwargs["axis"] = axis elif not axis: # FIXME: https://github.com/modin-project/modin/issues/7000 return rows_number = len(next(iter(data.values()))) # length of the first data column level_0 = np.random.choice([0, 1, 2], rows_number) level_1 = np.random.choice([3, 4, 5], rows_number) index = pd.MultiIndex.from_arrays([level_0, level_1]) modin_series, pandas_series = create_test_series(data) modin_series.index = index pandas_series.index = index if func == "searchsorted": # required parameter func_kwargs["value"] = pandas_series[1] eval_general(modin_series, pandas_series, lambda df: df.apply(func, **func_kwargs)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [True, False]) def test_argmax(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.argmax(skipna=skipna), pandas_series.argmax(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [True, False]) def test_argmin(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.argmin(skipna=skipna), pandas_series.argmin(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_argsort(data): modin_series, pandas_series = create_test_series(data) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_series) ): modin_result = modin_series.argsort() df_equals(modin_result, pandas_series.argsort()) def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="min") series = pd.Series([0.0, None, 2.0, 3.0], index=index) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(series) ): # We are only testing that this defaults to pandas, so we will just check for # the warning series.asfreq(freq="30S") @pytest.mark.parametrize( "where", [ 20, 30, [10, 40], [20, 30], [20], 25, [25, 45], [25, 30], 
pandas.Index([20, 30]), pandas.Index([10]), ], ) def test_asof(where): # With NaN: values = [1, 2, np.nan, 4] index = [10, 20, 30, 40] modin_series, pandas_series = ( pd.Series(values, index=index), pandas.Series(values, index=index), ) df_equals(modin_series.asof(where), pandas_series.asof(where)) # No NaN: values = [1, 2, 7, 4] modin_series, pandas_series = ( pd.Series(values, index=index), pandas.Series(values, index=index), ) df_equals(modin_series.asof(where), pandas_series.asof(where)) @pytest.mark.parametrize( "where", [20, 30, [10.5, 40.5], [10], pandas.Index([20, 30]), pandas.Index([10.5])], ) def test_asof_large(where): values = test_data["float_nan_data"]["col1"] index = list(range(len(values))) modin_series, pandas_series = ( pd.Series(values, index=index), pandas.Series(values, index=index), ) df_equals(modin_series.asof(where), pandas_series.asof(where)) @pytest.mark.parametrize( "data", [ test_data["int_data"], test_data["float_nan_data"], ], ids=test_data_keys, ) def test_astype(data, request): modin_series, pandas_series = create_test_series(data) series_name = "test_series" modin_series.name = pandas_series.name = series_name eval_general(modin_series, pandas_series, lambda df: df.astype(str)) expected_exception = None if "float_nan_data" in request.node.callspec.id: expected_exception = pd.errors.IntCastingNaNError( "Cannot convert non-finite values (NA or inf) to integer" ) eval_general( modin_series, pandas_series, lambda ser: ser.astype(np.int64), expected_exception=expected_exception, ) eval_general(modin_series, pandas_series, lambda ser: ser.astype(np.float64)) eval_general( modin_series, pandas_series, lambda ser: ser.astype({series_name: str}) ) # FIXME: https://github.com/modin-project/modin/issues/7039 eval_general( modin_series, pandas_series, lambda ser: ser.astype({"wrong_name": str}), expected_exception=False, ) # TODO(https://github.com/modin-project/modin/issues/4317): Test passing a # dict to astype() for a series with no name. 
@pytest.mark.parametrize("dtype", ["int32", "float32"])
def test_astype_32_types(dtype):
    """Check that casting to a 32-bit dtype keeps that exact dtype."""
    # https://github.com/modin-project/modin/issues/6881
    assert pd.Series([1, 2, 6]).astype(dtype).dtype == dtype


@pytest.mark.parametrize(
    "data", [["A", "A", "B", "B", "A"], [1, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2]]
)
def test_astype_categorical(data):
    """Check ``astype`` to categorical, by name and by explicit dtype."""
    modin_df, pandas_df = create_test_series(data)

    # Cast using the "category" string alias.
    modin_result = modin_df.astype("category")
    pandas_result = pandas_df.astype("category")
    df_equals(modin_result, pandas_result)
    assert modin_result.dtype == pandas_result.dtype

    # Cast using an explicit CategoricalDtype built from the sorted unique values.
    dtype = pd.CategoricalDtype(categories=sorted(set(data)))
    modin_result = modin_df.astype(dtype)
    pandas_result = pandas_df.astype(dtype)
    df_equals(modin_result, pandas_result)
    assert modin_result.dtype == pandas_result.dtype


@pytest.mark.parametrize("data", [["a", "a", "b", "c", "c", "d", "b", "d"]])
@pytest.mark.parametrize(
    "set_min_row_partition_size",
    [2, 4],
    ids=["four_row_partitions", "two_row_partitions"],
    indirect=True,
)
def test_astype_categorical_issue5722(data, set_min_row_partition_size):
    """Check that both halves of a categorical result share categories and codes.

    NOTE(review): ``set_min_row_partition_size`` is an indirect fixture defined
    outside this chunk; judging by the ids it controls how many row partitions
    the 8-element series is split into -- confirm against the fixture.
    """
    modin_series, pandas_series = create_test_series(data)

    modin_result = modin_series.astype("category")
    pandas_result = pandas_series.astype("category")
    df_equals(modin_result, pandas_result)
    assert modin_result.dtype == pandas_result.dtype

    pandas_result1, pandas_result2 = pandas_result.iloc[:4], pandas_result.iloc[4:]
    modin_result1, modin_result2 = modin_result.iloc[:4], modin_result.iloc[4:]

    # check categories
    assert pandas_result1.cat.categories.equals(pandas_result2.cat.categories)
    assert modin_result1.cat.categories.equals(modin_result2.cat.categories)
    assert pandas_result1.cat.categories.equals(modin_result1.cat.categories)
    assert pandas_result2.cat.categories.equals(modin_result2.cat.categories)

    # check codes
    assert_array_equal(pandas_result1.cat.codes.values, modin_result1.cat.codes.values)
    assert_array_equal(pandas_result2.cat.codes.values, modin_result2.cat.codes.values)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_at(data):
    """Check ``Series.at`` for the first and the last index label."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(
        modin_series.at[modin_series.index[0]], pandas_series.at[pandas_series.index[0]]
    )
    # Use ``.at`` on both sides so the same accessor is exercised for pandas
    # and modin (the pandas side previously used plain ``[]`` indexing).
    df_equals(
        modin_series.at[modin_series.index[-1]],
        pandas_series.at[pandas_series.index[-1]],
    )


def test_at_time():
    """Check ``Series.at_time`` on a 12-hourly datetime index."""
    # Lowercase "h" is the non-deprecated hour alias and matches the other
    # date_range / dt.round calls in this file.
    i = pd.date_range("2008-01-01", periods=1000, freq="12h")
    modin_series = pd.Series(list(range(1000)), index=i)
    pandas_series = pandas.Series(list(range(1000)), index=i)
    df_equals(modin_series.at_time("12:00"), pandas_series.at_time("12:00"))
    df_equals(modin_series.at_time("3:00"), pandas_series.at_time("3:00"))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("lag", [1, 2, 3])
def test_autocorr(data, lag):
    """Check ``Series.autocorr`` for several lags."""
    modin_series, pandas_series = create_test_series(data)
    modin_result = modin_series.autocorr(lag=lag)
    pandas_result = pandas_series.autocorr(lag=lag)
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_axes(data):
    """Check that ``Series.axes`` has the same length and first axis as pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert modin_series.axes[0].equals(pandas_series.axes[0])
    assert len(modin_series.axes) == len(pandas_series.axes)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_attrs(data):
    """Check the ``Series.attrs`` property via ``eval_general``."""
    modin_series, pandas_series = create_test_series(data)
    eval_general(modin_series, pandas_series, lambda df: df.attrs)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_array(data):
    """Check the ``Series.array`` property via ``eval_general``."""
    modin_series, pandas_series = create_test_series(data)
    eval_general(modin_series, pandas_series, lambda df: df.array)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_between(data):
    """Check ``Series.between`` with the bounds 1 and 4."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(
        modin_series.between(1, 4),
        pandas_series.between(1, 4),
    )


def test_between_time():
    """Check ``Series.between_time`` for two windows and ``inclusive="right"``."""
    # Lowercase "h" is the non-deprecated hour alias and matches the other
    # date_range / dt.round calls in this file.
    i = pd.date_range("2008-01-01", periods=1000, freq="12h")
    modin_series = pd.Series(list(range(1000)), index=i)
    pandas_series = pandas.Series(list(range(1000)), index=i)
    df_equals(
        modin_series.between_time("12:00", "17:00"),
        pandas_series.between_time("12:00", "17:00"),
    )
    df_equals(
        modin_series.between_time("3:00", "8:00"),
        pandas_series.between_time("3:00", "8:00"),
    )
    df_equals(
        modin_series.between_time("3:00", "8:00", inclusive="right"),
        pandas_series.between_time("3:00", "8:00", inclusive="right"),
    )


def test_add_series_to_timedeltaindex():
    """Check adding and subtracting a ``TimedeltaIndex`` to/from a datetime series."""
    # Make a pandas.core.indexes.timedeltas.TimedeltaIndex
    deltas = pd.to_timedelta([1], unit="h")
    test_series = create_test_series(np.datetime64("2000-12-12"))
    eval_general(*test_series, lambda s: s + deltas)
    eval_general(*test_series, lambda s: s - deltas)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_bfill(data):
    """Check ``Series.bfill`` both as a returning call and in place."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.bfill(), pandas_series.bfill())
    # inplace
    modin_series_cp = modin_series.copy()
    pandas_series_cp = pandas_series.copy()
    modin_series_cp.bfill(inplace=True)
    pandas_series_cp.bfill(inplace=True)
    df_equals(modin_series_cp, pandas_series_cp)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_bool(data):
    """Check that ``bool()`` and ``__bool__()`` raise ``ValueError``.

    ``Series.bool()`` is additionally expected to emit a deprecation
    ``FutureWarning``.
    """
    modin_series, _ = create_test_series(data)

    with pytest.warns(
        FutureWarning, match="bool is now deprecated and will be removed"
    ):
        with pytest.raises(ValueError):
            modin_series.bool()
    with pytest.raises(ValueError):
        modin_series.__bool__()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
def test_clip_scalar(request, data, bound_type):
    """Check ``Series.clip`` with scalar bounds (numeric test data only).

    ``bound_type`` is not read by the body; it only multiplies the
    parametrized cases.
    """
    modin_series, pandas_series = create_test_series(
        data,
    )

    if name_contains(request.node.name, numeric_dfs):
        # set bounds
        lower, upper = np.sort(random_state.randint(RAND_LOW, RAND_HIGH, 2))

        # test only upper scalar bound
        modin_result = modin_series.clip(None, upper)
        pandas_result = pandas_series.clip(None, upper)
        df_equals(modin_result, pandas_result)

        # test lower and upper scalar bound
        modin_result = modin_series.clip(lower, upper)
        pandas_result = pandas_series.clip(lower, upper)
        df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
def test_clip_sequence(request, data, bound_type):
    """Check ``Series.clip`` with per-element bounds (numeric test data only).

    The bounds are random arrays of the series' length, passed either as-is
    or wrapped in a Series depending on ``bound_type``.
    """
    modin_series, pandas_series = create_test_series(
        data,
    )

    if name_contains(request.node.name, numeric_dfs):
        lower = random_state.randint(RAND_LOW, RAND_HIGH, len(pandas_series))
        upper = random_state.randint(RAND_LOW, RAND_HIGH, len(pandas_series))

        if bound_type == "series":
            modin_lower = pd.Series(lower)
            pandas_lower = pandas.Series(lower)
            modin_upper = pd.Series(upper)
            pandas_upper = pandas.Series(upper)
        else:
            modin_lower = pandas_lower = lower
            modin_upper = pandas_upper = upper

        # test lower and upper list bound
        modin_result = modin_series.clip(modin_lower, modin_upper, axis=0)
        pandas_result = pandas_series.clip(pandas_lower, pandas_upper)
        df_equals(modin_result, pandas_result)

        # test only upper list bound
        modin_result = modin_series.clip(np.nan, modin_upper, axis=0)
        pandas_result = pandas_series.clip(np.nan, pandas_upper)
        df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_combine(data):
    """Smoke-test ``Series.combine``; the result is not compared with pandas."""
    modin_series, _ = create_test_series(data)  # noqa: F841
    modin_series2 = modin_series % (max(modin_series) // 2)
    modin_series.combine(modin_series2, lambda s1, s2: s1 if s1 < s2 else s2)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_combine_first(data):
    """Check ``Series.combine_first`` against pandas."""
    modin_series, pandas_series = create_test_series(data)
    modin_series2 = modin_series % (max(modin_series) // 2)
    pandas_series2 = pandas_series % (max(pandas_series) // 2)
    modin_result = modin_series.combine_first(modin_series2)
    pandas_result = pandas_series.combine_first(pandas_series2)
    df_equals(modin_result, pandas_result)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_compress(data):
    """Check that ``Series.compress`` fails or succeeds the same way as pandas."""
    modin_series, pandas_series = create_test_series(data)
    try:
        pandas_series.compress(pandas_series > 30)
    except Exception as exc:
        # pandas raised: modin must raise the same exception type.
        with pytest.raises(type(exc)):
            modin_series.compress(modin_series > 30)
    else:
        modin_series.compress(modin_series > 30)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_constructor(data):
    """Check construction from raw data and from an existing series."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series, pandas_series)
    df_equals(pd.Series(modin_series), pandas.Series(pandas_series))


def test_constructor_columns_and_index():
    """Check re-wrapping a series with a new ``name`` or ``index``."""
    values, labels = [1, 1, 10], [1, 2, 3]
    modin_series = pd.Series(values, index=labels, name="health")
    pandas_series = pandas.Series(values, index=labels, name="health")

    df_equals(modin_series, pandas_series)
    df_equals(pd.Series(modin_series), pandas.Series(pandas_series))
    df_equals(
        pd.Series(modin_series, name="max_speed"),
        pandas.Series(pandas_series, name="max_speed"),
    )
    df_equals(
        pd.Series(modin_series, index=[1, 2]),
        pandas.Series(pandas_series, index=[1, 2]),
    )
    # An index label missing from the source series is expected to be rejected.
    with pytest.raises(NotImplementedError):
        pd.Series(modin_series, index=[1, 2, 99999])


def test_constructor_arrow_extension_array():
    """Check construction from an ``ArrowExtensionArray`` of map type."""
    # example from pandas docs
    pa = pytest.importorskip("pyarrow")
    map_type = pa.map_(pa.string(), pa.string())
    array = pd.arrays.ArrowExtensionArray(
        pa.array([{"1": "2"}, {"10": "20"}, None], type=map_type)
    )
    md_ser, pd_ser = create_test_series(array)
    df_equals(md_ser, pd_ser)
    df_equals(md_ser.dtypes, pd_ser.dtypes)


def test_pyarrow_backed_constructor():
    """Check construction with pyarrow-backed string and list dtypes."""
    pa = pytest.importorskip("pyarrow")

    letters = list("abcd")
    df_equals(*create_test_series(letters, dtype="string[pyarrow]"))
    df_equals(*create_test_series(letters, dtype=pd.ArrowDtype(pa.string())))

    nested = [["hello"], ["there"]]
    list_str_type = pa.list_(pa.string())
    df_equals(*create_test_series(nested, dtype=pd.ArrowDtype(list_str_type)))


def test_pyarrow_backed_functions():
    """Check several operations on a ``float32[pyarrow]`` series, dtypes included."""
    pytest.importorskip("pyarrow")
    modin_series, pandas_series = create_test_series(
        [-1.545, 0.211, None], dtype="float32[pyarrow]"
    )
    df_equals(modin_series.mean(), pandas_series.mean())

    def comparator(df1, df2):
        # Compare both the values and the resulting dtypes.
        df_equals(df1, df2)
        df_equals(df1.dtypes, df2.dtypes)

    operations = (
        # Add the series to its own-library counterpart.
        lambda ser: ser
        + (modin_series if isinstance(ser, pd.Series) else pandas_series),
        lambda ser: ser > (ser + 1),
        lambda ser: ser.dropna(),
        lambda ser: ser.isna(),
        lambda ser: ser.fillna(0),
    )
    for operation in operations:
        eval_general(modin_series, pandas_series, operation, comparator=comparator)


def test_pyarrow_array_retrieve():
    """Check that ``pyarrow.array`` accepts a ``uint8[pyarrow]`` series."""
    pa = pytest.importorskip("pyarrow")
    modin_series, pandas_series = create_test_series(
        [1, 2, None], dtype="uint8[pyarrow]"
    )
    eval_general(modin_series, pandas_series, lambda ser: pa.array(ser))


def test___arrow_array__():
    """Check ``pa.Table.from_pandas`` on a modin frame against its pandas form."""
    # https://github.com/modin-project/modin/issues/6808
    pa = pytest.importorskip("pyarrow")
    first_part = pd.DataFrame({"a": ["1", "2", "3"], "b": ["4", "5", "6"]})
    second_part = pd.DataFrame({"a": ["7", "8", "9"], "b": ["10", "11", "12"]})
    test_df = pd.concat([first_part, second_part])

    table_from_modin = pa.Table.from_pandas(df=test_df)
    table_from_pandas = pa.Table.from_pandas(df=test_df._to_pandas())
    assert table_from_modin.equals(table_from_pandas)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_copy(data):
    """Check that ``Series.copy`` equals the source and the pandas series."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series, modin_series.copy())
    df_equals(modin_series.copy(), pandas_series)
    df_equals(modin_series.copy(), pandas_series.copy())


def test_copy_empty_series():
    """Check that copying an empty slice keeps the source dtype."""
    source = pd.Series(range(3))
    empty_copy = source[:0].copy()
    assert empty_copy.dtype == source.dtype


@pytest.mark.parametrize("method", ["pearson", "kendall"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_corr(data, method):
    """Check ``Series.corr`` of a series with itself."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(
        modin_series.corr(modin_series, method=method),
        pandas_series.corr(pandas_series, method=method),
    )


@pytest.mark.parametrize(
    "data",
    test_data_values + test_data_large_categorical_series_values,
    ids=test_data_keys + test_data_large_categorical_series_keys,
)
def test_count(data):
    """Check ``Series.count``."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.count(), pandas_series.count())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_cov(data):
    """Check ``Series.cov`` of a series with itself."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.cov(modin_series), pandas_series.cov(pandas_series))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_cummax(data, skipna):
    """Check ``Series.cummax``, including the exception type on failure."""
    modin_series, pandas_series = create_test_series(data)
    try:
        expected = pandas_series.cummax(skipna=skipna)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.cummax(skipna=skipna)
    else:
        df_equals(modin_series.cummax(skipna=skipna), expected)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_cummin(data, skipna):
    """Check ``Series.cummin``, including the exception type on failure."""
    modin_series, pandas_series = create_test_series(data)
    try:
        expected = pandas_series.cummin(skipna=skipna)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.cummin(skipna=skipna)
    else:
        df_equals(modin_series.cummin(skipna=skipna), expected)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_cumprod(data, skipna):
    """Check ``Series.cumprod``, including the exception type on failure."""
    modin_series, pandas_series = create_test_series(data)
    try:
        expected = pandas_series.cumprod(skipna=skipna)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.cumprod(skipna=skipna)
    else:
        df_equals(modin_series.cumprod(skipna=skipna), expected)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_cumsum(data, skipna):
    """Check ``Series.cumsum``, including the exception type on failure."""
    modin_series, pandas_series = create_test_series(data)
    try:
        expected = pandas_series.cumsum(skipna=skipna)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.cumsum(skipna=skipna)
    else:
        df_equals(modin_series.cumsum(skipna=skipna), expected)


def test_cumsum_6771():
    """Smoke-test ``cumsum`` on an ``Int64`` series and its pandas conversion."""
    _ = to_pandas(pd.Series([1, 2, 3], dtype="Int64").cumsum())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_describe(data):
    """Check ``Series.describe`` with percentiles and include/exclude filters."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.describe(), pandas_series.describe())

    percentiles = [0.10, 0.11, 0.44, 0.78, 0.99]
    df_equals(
        modin_series.describe(percentiles=percentiles),
        pandas_series.describe(percentiles=percentiles),
    )

    # ``exclude`` given as a list.
    try:
        expected = pandas_series.describe(exclude=[np.float64])
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.describe(exclude=[np.float64])
    else:
        actual = modin_series.describe(exclude=[np.float64])
        df_equals(actual, expected)

    # ``exclude`` given as a single type.
    try:
        expected = pandas_series.describe(exclude=np.float64)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.describe(exclude=np.float64)
    else:
        actual = modin_series.describe(exclude=np.float64)
        df_equals(actual, expected)

    # ``include`` given as a list of non-numeric types.
    try:
        expected = pandas_series.describe(
            include=[np.timedelta64, np.datetime64, np.object_, np.bool_]
        )
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.describe(
                include=[np.timedelta64, np.datetime64, np.object_, np.bool_]
            )
    else:
        actual = modin_series.describe(
            include=[np.timedelta64, np.datetime64, np.object_, np.bool_]
        )
        df_equals(actual, expected)

    # ``include`` given as the series' own dtype name.
    actual = modin_series.describe(include=str(modin_series.dtypes))
    expected = pandas_series.describe(include=str(pandas_series.dtypes))
    df_equals(actual, expected)

    actual = modin_series.describe(include=[np.number])
    expected = pandas_series.describe(include=[np.number])
    df_equals(actual, expected)

    df_equals(
        modin_series.describe(include="all"), pandas_series.describe(include="all")
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys)
)
def test_diff(data, periods):
    """Check ``Series.diff`` on the series and on its transpose."""
    modin_series, pandas_series = create_test_series(data)

    try:
        expected = pandas_series.diff(periods=periods)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.diff(periods=periods)
    else:
        actual = modin_series.diff(periods=periods)
        df_equals(actual, expected)

    try:
        expected = pandas_series.T.diff(periods=periods)
    except Exception as exc:
        with pytest.raises(type(exc)):
            modin_series.T.diff(periods=periods)
    else:
        actual = modin_series.T.diff(periods=periods)
        df_equals(actual, expected)


def test_diff_with_dates():
    """Check ``Series.diff`` on datetime values and on the resulting timedeltas."""
    timestamps = pandas.date_range("2018-01-01", periods=15, freq="h").values
    pandas_series = pandas.Series(timestamps)
    modin_series = pd.Series(pandas_series)

    # Check that `diff` with datetime types works correctly.
    pandas_result = pandas_series.diff()
    modin_result = modin_series.diff()
    df_equals(modin_result, pandas_result)

    # Check that `diff` with timedelta types works correctly.
    td_pandas_result = pandas_result.diff()
    td_modin_result = modin_result.diff()
    df_equals(td_modin_result, td_pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_div(data):
    """Check ``Series.div`` through ``inter_df_math_helper``."""
    series_pair = create_test_series(data)
    inter_df_math_helper(series_pair[0], series_pair[1], "div")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_divide(data):
    """Check ``Series.divide`` through ``inter_df_math_helper``."""
    series_pair = create_test_series(data)
    inter_df_math_helper(series_pair[0], series_pair[1], "divide")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_dot(data):
    """Check ``Series.dot`` with array, DataFrame and Series operands."""
    modin_series, pandas_series = create_test_series(data)
    ind_len = len(modin_series)

    # Test 1D array input
    vector = np.arange(ind_len)
    df_equals(modin_series.dot(vector), pandas_series.dot(vector))

    # Test 2D array input
    matrix = np.arange(ind_len * 2).reshape(ind_len, 2)
    assert_array_equal(modin_series.dot(matrix), pandas_series.dot(matrix))

    # Test bad dimensions
    with pytest.raises(ValueError):
        modin_series.dot(np.arange(ind_len + 10))

    # Test dataframe input
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_series.dot(modin_df), pandas_series.dot(pandas_df))

    # Test series input
    modin_series_2 = pd.Series(np.arange(ind_len), index=modin_series.index)
    pandas_series_2 = pandas.Series(np.arange(ind_len), index=pandas_series.index)
    df_equals(modin_series.dot(modin_series_2), pandas_series.dot(pandas_series_2))

    # Test when input series index doesn't line up with columns
    with pytest.raises(ValueError):
        modin_series.dot(
            pd.Series(np.arange(ind_len), index=["a"] * len(modin_series.index))
        )

    # Test case when left series has size (1 x 1)
    # and right dataframe has size (1 x n)
    modin_result = pd.Series([1]).dot(pd.DataFrame(modin_series).T)
    pandas_result = pandas.Series([1]).dot(pandas.DataFrame(pandas_series).T)
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_matmul(data):
    """Check the ``@`` operator with array, DataFrame and Series operands."""
    modin_series, pandas_series = create_test_series(data)
    ind_len = len(modin_series)

    # Test 1D array input
    vector = np.arange(ind_len)
    df_equals(modin_series @ vector, pandas_series @ vector)

    # Test 2D array input
    matrix = np.arange(ind_len * 2).reshape(ind_len, 2)
    assert_array_equal(modin_series @ matrix, pandas_series @ matrix)

    # Test bad dimensions
    with pytest.raises(ValueError):
        modin_series @ np.arange(ind_len + 10)

    # Test dataframe input
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_series @ modin_df, pandas_series @ pandas_df)

    # Test series input
    modin_series_2 = pd.Series(np.arange(ind_len), index=modin_series.index)
    pandas_series_2 = pandas.Series(np.arange(ind_len), index=pandas_series.index)
    df_equals(modin_series @ modin_series_2, pandas_series @ pandas_series_2)

    # Test when input series index doesn't line up with columns
    with pytest.raises(ValueError):
        modin_series @ pd.Series(
            np.arange(ind_len), index=["a"] * len(modin_series.index)
        )


@pytest.mark.xfail(reason="Using pandas Series.")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_drop(data):
    """Expected-failure test: ``drop`` with all-``None`` arguments."""
    # NOTE(review): every other test unpacks ``create_test_series`` into a
    # (modin, pandas) pair; here the result is used as-is -- confirm whether
    # that is intentional for this xfail test.
    modin_series = create_test_series(data)
    with pytest.raises(NotImplementedError):
        modin_series.drop(None, None, None, None)


@pytest.mark.parametrize(
    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
)
@pytest.mark.parametrize(
    "keep", ["last", "first", False], ids=["last", "first", "False"]
)
@pytest.mark.parametrize("inplace", [True, False], ids=["True", "False"])
def test_drop_duplicates(data, keep, inplace):
    """Run ``Series.drop_duplicates`` and hand the outcome to the shared helper.

    NOTE(review): the comparison itself is presumably performed inside
    ``sort_if_range_partitioning`` (defined outside this chunk) -- confirm.
    """
    modin_series, pandas_series = create_test_series(data)
    modin_res = modin_series.drop_duplicates(keep=keep, inplace=inplace)
    pandas_res = pandas_series.drop_duplicates(keep=keep, inplace=inplace)
    # In-place calls mutate the inputs, so those are what gets checked.
    modin_checked, pandas_checked = (
        (modin_series, pandas_series) if inplace else (modin_res, pandas_res)
    )
    sort_if_range_partitioning(modin_checked, pandas_checked)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"])
def test_dropna(data, how):
    """Check ``Series.dropna`` for both ``how`` modes."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.dropna(how=how), pandas_series.dropna(how=how))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_dropna_inplace(data):
    """Check in-place ``Series.dropna``, with and without explicit ``how``."""
    modin_series, pandas_series = create_test_series(data)
    expected = pandas_series.dropna()
    modin_series.dropna(inplace=True)
    df_equals(modin_series, expected)

    # Fresh series: both sides are now mutated in place.
    modin_series, pandas_series = create_test_series(data)
    pandas_series.dropna(how="any", inplace=True)
    modin_series.dropna(how="any", inplace=True)
    df_equals(modin_series, pandas_series)


def test_dtype_empty():
    """Check that an empty series has the same dtype as in pandas."""
    modin_series = pd.Series()
    pandas_series = pandas.Series()
    assert modin_series.dtype == pandas_series.dtype


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_dtype(data):
    """Check that ``dtype`` agrees with ``dtypes`` and with pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.dtype, modin_series.dtypes)
    df_equals(modin_series.dtype, pandas_series.dtype)
    df_equals(modin_series.dtype, pandas_series.dtypes)


# Bug https://github.com/modin-project/modin/issues/4436 in
# Series.dt.to_pydatetime is only reproducible when the date range out of which
# the frame is created has timezone None, so that its dtype is datetime64[ns]
# as opposed to, e.g. datetime64[ns, Europe/Berlin]. To reproduce that bug, we
# use timezones None and Europe/Berlin.
@pytest.mark.parametrize(
    "timezone",
    [
        pytest.param(None),
        pytest.param("Europe/Berlin"),
    ],
)
def test_dt(timezone):
    """Compare the ``Series.dt`` accessor with pandas.

    Covers datetime data (naive or tz-aware, depending on ``timezone``),
    timedelta data, period data, and a series built through ``concat`` /
    ``dropna`` / ``squeeze``.
    """
    data = pd.date_range("2016-12-31", periods=128, freq="D", tz=timezone)
    modin_series = pd.Series(data)
    pandas_series = pandas.Series(data)

    df_equals(modin_series.dt.date, pandas_series.dt.date)
    df_equals(modin_series.dt.time, pandas_series.dt.time)
    df_equals(modin_series.dt.timetz, pandas_series.dt.timetz)
    df_equals(modin_series.dt.year, pandas_series.dt.year)
    df_equals(modin_series.dt.month, pandas_series.dt.month)
    df_equals(modin_series.dt.day, pandas_series.dt.day)
    df_equals(modin_series.dt.hour, pandas_series.dt.hour)
    df_equals(modin_series.dt.minute, pandas_series.dt.minute)
    df_equals(modin_series.dt.second, pandas_series.dt.second)
    df_equals(modin_series.dt.microsecond, pandas_series.dt.microsecond)
    df_equals(modin_series.dt.nanosecond, pandas_series.dt.nanosecond)
    df_equals(modin_series.dt.dayofweek, pandas_series.dt.dayofweek)
    df_equals(modin_series.dt.day_of_week, pandas_series.dt.day_of_week)
    df_equals(modin_series.dt.weekday, pandas_series.dt.weekday)
    df_equals(modin_series.dt.dayofyear, pandas_series.dt.dayofyear)
    df_equals(modin_series.dt.day_of_year, pandas_series.dt.day_of_year)
    df_equals(modin_series.dt.unit, pandas_series.dt.unit)
    df_equals(modin_series.dt.as_unit("s"), pandas_series.dt.as_unit("s"))
    df_equals(modin_series.dt.isocalendar(), pandas_series.dt.isocalendar())
    df_equals(modin_series.dt.quarter, pandas_series.dt.quarter)
    df_equals(modin_series.dt.is_month_start, pandas_series.dt.is_month_start)
    df_equals(modin_series.dt.is_month_end, pandas_series.dt.is_month_end)
    df_equals(modin_series.dt.is_quarter_start, pandas_series.dt.is_quarter_start)
    df_equals(modin_series.dt.is_quarter_end, pandas_series.dt.is_quarter_end)
    df_equals(modin_series.dt.is_year_start, pandas_series.dt.is_year_start)
    df_equals(modin_series.dt.is_year_end, pandas_series.dt.is_year_end)
    df_equals(modin_series.dt.is_leap_year, pandas_series.dt.is_leap_year)
    df_equals(modin_series.dt.daysinmonth, pandas_series.dt.daysinmonth)
    df_equals(modin_series.dt.days_in_month, pandas_series.dt.days_in_month)
    assert modin_series.dt.tz == pandas_series.dt.tz
    assert modin_series.dt.freq == pandas_series.dt.freq
    df_equals(modin_series.dt.to_period("W"), pandas_series.dt.to_period("W"))
    assert_array_equal(
        modin_series.dt.to_pydatetime(), pandas_series.dt.to_pydatetime()
    )
    df_equals(
        modin_series.dt.tz_localize(None),
        pandas_series.dt.tz_localize(None),
    )
    # ``tz_convert`` is only exercised for the tz-aware parametrization.
    if timezone:
        df_equals(
            modin_series.dt.tz_convert(tz="Europe/Berlin"),
            pandas_series.dt.tz_convert(tz="Europe/Berlin"),
        )

    df_equals(modin_series.dt.normalize(), pandas_series.dt.normalize())
    df_equals(
        modin_series.dt.strftime("%B %d, %Y, %r"),
        pandas_series.dt.strftime("%B %d, %Y, %r"),
    )
    df_equals(modin_series.dt.round("h"), pandas_series.dt.round("h"))
    df_equals(modin_series.dt.floor("h"), pandas_series.dt.floor("h"))
    df_equals(modin_series.dt.ceil("h"), pandas_series.dt.ceil("h"))
    df_equals(modin_series.dt.month_name(), pandas_series.dt.month_name())
    df_equals(modin_series.dt.day_name(), pandas_series.dt.day_name())

    # Timedelta-valued series
    modin_series = pd.Series(pd.to_timedelta(np.arange(128), unit="d"))
    pandas_series = pandas.Series(pandas.to_timedelta(np.arange(128), unit="d"))

    assert_array_equal(
        modin_series.dt.to_pytimedelta(), pandas_series.dt.to_pytimedelta()
    )
    df_equals(modin_series.dt.total_seconds(), pandas_series.dt.total_seconds())
    df_equals(modin_series.dt.days, pandas_series.dt.days)
    df_equals(modin_series.dt.seconds, pandas_series.dt.seconds)
    df_equals(modin_series.dt.microseconds, pandas_series.dt.microseconds)
    df_equals(modin_series.dt.nanoseconds, pandas_series.dt.nanoseconds)
    df_equals(modin_series.dt.components, pandas_series.dt.components)

    # Period-valued series
    data_per = pd.date_range("1/1/2012", periods=128, freq="M")
    pandas_series = pandas.Series(data_per, index=data_per).dt.to_period()
    modin_series = pd.Series(data_per, index=data_per).dt.to_period()

    df_equals(modin_series.dt.qyear, pandas_series.dt.qyear)
    df_equals(modin_series.dt.start_time, pandas_series.dt.start_time)
    df_equals(modin_series.dt.end_time, pandas_series.dt.end_time)
    df_equals(modin_series.dt.to_timestamp(), pandas_series.dt.to_timestamp())

    def dt_with_empty_partition(lib):
        # For context, see https://github.com/modin-project/modin/issues/5112
        # Build the object with ``lib`` so that eval_general really compares a
        # modin result with a pandas result; building with ``pd`` on both calls
        # would compare modin with itself.
        df = (
            lib.concat(
                [lib.DataFrame([None]), lib.DataFrame([lib.to_timedelta(1)])], axis=1
            )
            .dropna(axis=1)
            .squeeze(1)
        )
        # BaseOnPython had a single partition after the concat, and it
        # maintains that partition after dropna and squeeze. In other execution modes,
        # the series should have two column partitions, one of which is empty.
        # NOTE(review): this guard tests for a modin DataFrame although ``df`` has
        # just been squeezed -- confirm the partition check is actually reached.
        if isinstance(df, pd.DataFrame) and get_current_execution() != "BaseOnPython":
            assert df._query_compiler._modin_frame._partitions.shape == (1, 2)
        return df.dt.days

    eval_general(pd, pandas, dt_with_empty_partition)

    if timezone is None:
        data = pd.period_range("2016-12-31", periods=128, freq="D")
        modin_series = pd.Series(data)
        pandas_series = pandas.Series(data)
        df_equals(modin_series.dt.asfreq("min"), pandas_series.dt.asfreq("min"))


@pytest.mark.parametrize(
    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
)
@pytest.mark.parametrize(
    "keep", ["last", "first", False], ids=["last", "first", "False"]
)
def test_duplicated(data, keep):
    """Compare ``duplicated(keep=...)`` between modin and pandas."""
    modin_series, pandas_series = create_test_series(data)
    modin_result = modin_series.duplicated(keep=keep)
    df_equals(modin_result, pandas_series.duplicated(keep=keep))


def test_duplicated_keeps_name_issue_7375():
    """Regression test: the result of ``duplicated`` keeps the series name."""
    # Ensure that the name property of a series is preserved across duplicated
    modin_series, pandas_series = create_test_series([1, 2, 3, 1], name="a")
    df_equals(modin_series.duplicated(), pandas_series.duplicated())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_empty(data):
    """``Series.empty`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert modin_series.empty == pandas_series.empty


def
test_empty_series(): modin_series = pd.Series() assert modin_series.empty


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_eq(data):
    """Run the shared binary-operation helper for ``eq``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "eq")


@pytest.mark.parametrize(
    "series1_data,series2_data,expected_pandas_equals",
    [
        pytest.param([1], [0], False, id="single_unequal_values"),
        pytest.param([None], [None], True, id="single_none_values"),
        pytest.param(
            pandas.Series(1, name="series1"),
            pandas.Series(1, name="series2"),
            True,
            id="different_names",
        ),
        pytest.param(
            pandas.Series([1], index=[1]),
            pandas.Series([1], index=[1.0]),
            True,
            id="different_index_types",
        ),
        pytest.param(
            pandas.Series([1], index=[1]),
            pandas.Series([1], index=[2]),
            False,
            id="different_index_values",
        ),
        pytest.param([1], [1.0], False, id="different_value_types"),
        pytest.param(
            [1, 2],
            [1, 2],
            True,
            id="equal_series_of_length_two",
        ),
        pytest.param(
            [1, 2],
            [1, 3],
            False,
            id="unequal_series_of_length_two",
        ),
        pytest.param(
            [[1, 2]],
            [[1]],
            False,
            id="different_lengths",
        ),
    ],
)
def test_equals(series1_data, series2_data, expected_pandas_equals):
    """``Series.equals`` gives the pandas verdict for modin and pandas operands."""
    md_first, pd_first = create_test_series(series1_data)
    md_second, pd_second = create_test_series(series2_data)

    # First make sure the parametrized expectation matches what pandas says.
    pandas_verdict = pd_first.equals(pd_second)
    assert pandas_verdict == expected_pandas_equals, (
        "Test expected pandas to say the series were"
        + f"{'' if expected_pandas_equals else ' not'} equal, but they were"
        + f"{' not' if expected_pandas_equals else ''} equal."
    )

    assert md_first.equals(md_second) == pandas_verdict
    assert md_first.equals(pd_second) == pandas_verdict


def test_equals_several_partitions():
    """Concatenated series that differ only in where the ``None`` sits are unequal."""
    left = pd.concat([pd.Series([0, 1]), pd.Series([None, 1])])
    right = pd.concat([pd.Series([0, 1]), pd.Series([1, None])])
    assert not left.equals(right)


def test_equals_with_nans():
    """A partly-null pyarrow series is not equal to an all-null one."""
    partly_null = pd.Series([0, 1, None], dtype="uint8[pyarrow]")
    all_null = pd.Series([None, None, None], dtype="uint8[pyarrow]")
    assert not partly_null.equals(all_null)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ewm(data):
    """Call ``ewm`` under the default-to-pandas warning check."""
    md_ser, _ = create_test_series(data)
    expect_warning = not df_or_series_using_native_execution(md_ser)
    with warns_that_defaulting_to_pandas_if(expect_warning):
        md_ser.ewm(halflife=6)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_expanding(data):
    """Compare ``expanding().sum()`` between modin and pandas."""
    md_ser, pd_ser = create_test_series(data)
    md_total = md_ser.expanding().sum()
    pd_total = pd_ser.expanding().sum()
    df_equals(md_total, pd_total)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_factorize(data):
    """Call ``factorize`` under the default-to-pandas warning check."""
    md_ser, _ = create_test_series(data)
    expect_warning = not df_or_series_using_native_execution(md_ser)
    with warns_that_defaulting_to_pandas_if(expect_warning):
        md_ser.factorize()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ffill(data):
    """Compare ``ffill`` with pandas, both returning a copy and in place."""
    md_ser, pd_ser = create_test_series(data)
    df_equals(md_ser.ffill(), pd_ser.ffill())

    # In-place variant, run on copies of the inputs
    md_copy = md_ser.copy()
    pd_copy = pd_ser.copy()
    md_copy.ffill(inplace=True)
    pd_copy.ffill(inplace=True)
    df_equals(md_copy, pd_copy)


@pytest.mark.parametrize("limit_area", [None, "inside", "outside"]) @pytest.mark.parametrize("method", ["ffill", "bfill"]) def test_ffill_bfill_limit_area(method, limit_area): modin_ser, pandas_ser = create_test_series([1, None, 2, None]) eval_general( modin_ser, pandas_ser, lambda
ser: getattr(ser, method)(limit_area=limit_area) )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("reindex", [None, 2, -2])
@pytest.mark.parametrize("limit", [None, 1, 2, 0.5, -1, -2, 1.5])
@pytest.mark.exclude_in_sanity
def test_fillna(data, reindex, limit):
    """Compare ``fillna`` with pandas for scalar, method, series and dict fills."""
    md_ser, pd_ser = create_test_series(data)
    original_index = pd_ser.index
    pd_filler = original_index.to_series().sample(frac=1)
    md_filler = pd.Series(pd_filler)
    filler_dict = pd_filler.to_dict()

    if reindex is not None:
        # Keep only a head (positive) or tail (negative) slice, then restore
        # the full index.
        window = slice(None, reindex) if reindex > 0 else slice(reindex, None)
        pd_ser = pd_ser[window].reindex(original_index)
        # Because of bug #3178 modin Series has to be created from pandas
        # Series instead of performing the same slice and reindex operations.
        md_ser = pd.Series(pd_ser)

    # Float limits are a fraction of the length; negative ones count from the end.
    if isinstance(limit, float):
        limit = int(len(md_ser) * limit)
    if limit is not None and limit < 0:
        limit = len(md_ser) + limit

    md_filled = md_ser.fillna(0, limit=limit)
    pd_filled = pd_ser.fillna(0, limit=limit)
    df_equals(md_filled, pd_filled)

    md_filled = md_ser.fillna(method="bfill", limit=limit)
    pd_filled = pd_ser.fillna(method="bfill", limit=limit)
    df_equals(md_filled, pd_filled)

    md_filled = md_ser.fillna(method="ffill", limit=limit)
    pd_filled = pd_ser.fillna(method="ffill", limit=limit)
    df_equals(md_filled, pd_filled)

    md_filled = md_ser.fillna(md_filler, limit=limit)
    pd_filled = pd_ser.fillna(pd_filler, limit=limit)
    df_equals(md_filled, pd_filled)

    md_filled = md_ser.fillna(filler_dict, limit=limit)
    pd_filled = pd_ser.fillna(filler_dict, limit=limit)
    df_equals(md_filled, pd_filled)


@pytest.mark.xfail(reason="Using pandas Series.")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_filter(data):
    """Expected-failure test calling ``filter`` with three ``None`` arguments."""
    # NOTE(review): the helper result is not unpacked here, unlike in the sibling
    # tests -- confirm whether that is intentional for this xfail test.
    created = create_test_series(data)
    with pytest.raises(NotImplementedError):
        created.filter(None, None, None)


def test_first(): i = pd.date_range("2010-04-09", periods=400, freq="2D") modin_series = pd.Series(list(range(400)), index=i) pandas_series =
pandas.Series(list(range(400)), index=i) with pytest.warns(FutureWarning, match="first is deprecated and will be removed"): modin_result = modin_series.first("3D") df_equals(modin_result, pandas_series.first("3D")) df_equals(modin_series.first("20D"), pandas_series.first("20D"))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_first_valid_index(data):
    """``first_valid_index`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    md_label = md_ser.first_valid_index()
    pd_label = pd_ser.first_valid_index()
    df_equals(md_label, pd_label)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_floordiv(data):
    """Run the shared binary-operation helper for ``floordiv``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "floordiv")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ge(data):
    """Run the shared binary-operation helper for ``ge``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "ge")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_get(data):
    """``get`` matches pandas for every existing key and for a missing key."""
    md_ser, pd_ser = create_test_series(data)
    for label in md_ser.keys():
        df_equals(md_ser.get(label), pd_ser.get(label))

    # A key that is absent falls back to the supplied default.
    md_missing = md_ser.get("NO_EXIST", "DEFAULT")
    pd_missing = pd_ser.get("NO_EXIST", "DEFAULT")
    df_equals(md_missing, pd_missing)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_gt(data):
    """Run the shared binary-operation helper for ``gt``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "gt")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_hasnans(data):
    """``hasnans`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    assert md_ser.hasnans == pd_ser.hasnans


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) def test_head(data, n): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.head(n), pandas_series.head(n)) df_equals(
modin_series.head(len(modin_series)), pandas_series.head(len(pandas_series)) )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_hist(data):
    """Call ``hist`` under the default-to-pandas warning check."""
    md_ser, _ = create_test_series(data)
    expect_warning = not df_or_series_using_native_execution(md_ser)
    with warns_that_defaulting_to_pandas_if(expect_warning):
        md_ser.hist(None)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iat(data):
    """``iat[0]`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    df_equals(md_ser.iat[0], pd_ser.iat[0])


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_idxmax(data, skipna):
    """``idxmax`` matches pandas on the series and on its transpose."""
    md_ser, pd_ser = create_test_series(data)

    expected = pd_ser.idxmax(skipna=skipna)
    actual = md_ser.idxmax(skipna=skipna)
    df_equals(actual, expected)

    # Same check through ``.T``
    expected = pd_ser.T.idxmax(skipna=skipna)
    actual = md_ser.T.idxmax(skipna=skipna)
    df_equals(actual, expected)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_idxmin(data, skipna):
    """``idxmin`` matches pandas on the series and on its transpose."""
    md_ser, pd_ser = create_test_series(data)

    expected = pd_ser.idxmin(skipna=skipna)
    actual = md_ser.idxmin(skipna=skipna)
    df_equals(actual, expected)

    # Same check through ``.T``
    expected = pd_ser.T.idxmin(skipna=skipna)
    actual = md_ser.T.idxmin(skipna=skipna)
    df_equals(actual, expected)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_iloc(request, data): modin_series, pandas_series = create_test_series(data) if not name_contains(request.node.name, ["empty_data"]): # Scalar np.testing.assert_equal(modin_series.iloc[0], pandas_series.iloc[0]) # Series df_equals(modin_series.iloc[1:], pandas_series.iloc[1:]) df_equals(modin_series.iloc[1:2], pandas_series.iloc[1:2]) df_equals(modin_series.iloc[[1, 2]],
pandas_series.iloc[[1, 2]]) # Write Item modin_series.iloc[[1, 2]] = 42 pandas_series.iloc[[1, 2]] = 42 df_equals(modin_series, pandas_series) with pytest.raises(IndexingError): modin_series.iloc[1:, 1] else: with pytest.raises(IndexError): modin_series.iloc[0]


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_index(data):
    """Reading and assigning ``index`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.index, pandas_series.index)
    # An index with one extra label is expected to be rejected.
    with pytest.raises(ValueError):
        modin_series.index = list(modin_series.index) + [999]

    modin_series.index = modin_series.index.map(str)
    pandas_series.index = pandas_series.index.map(str)
    df_equals(modin_series.index, pandas_series.index)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_interpolate(data):
    """Call ``interpolate`` under the default-to-pandas warning check."""
    modin_series, _ = create_test_series(data)  # noqa: F841
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(modin_series)
    ):
        modin_series.interpolate()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_is_monotonic_decreasing(data):
    """``is_monotonic_decreasing`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert modin_series.is_monotonic_decreasing == pandas_series.is_monotonic_decreasing


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_is_monotonic_increasing(data):
    """``is_monotonic_increasing`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert modin_series.is_monotonic_increasing == pandas_series.is_monotonic_increasing


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_is_unique(data):
    """``is_unique`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert modin_series.is_unique == pandas_series.is_unique


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_isin(data):
    """``isin`` with a list of values matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    val = [1, 2, 3, 4]
    pandas_result = pandas_series.isin(val)
    modin_result = modin_series.isin(val)
    df_equals(modin_result, pandas_result)


def test_isin_with_series():
    """``isin`` with a Series argument matches pandas, whether or not indices align."""
    modin_series1, pandas_series1 = create_test_series([1, 2, 3])
    modin_series2, pandas_series2 = create_test_series([1, 2, 3, 4, 5])

    eval_general(
        (modin_series1, modin_series2),
        (pandas_series1, pandas_series2),
        lambda srs: srs[0].isin(srs[1]),
    )

    # Verify that Series actually behaves like Series and ignores unmatched indices on '.isin'
    modin_series1, pandas_series1 = create_test_series([1, 2, 3], index=[10, 11, 12])

    eval_general(
        (modin_series1, modin_series2),
        (pandas_series1, pandas_series2),
        lambda srs: srs[0].isin(srs[1]),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_isnull(data):
    """``isnull`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.isnull(), pandas_series.isnull())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_items(data):
    """``items`` yields the same number of ``(index, value)`` pairs as pandas, with equal contents."""
    modin_series, pandas_series = create_test_series(data)

    modin_items = list(modin_series.items())
    pandas_items = list(pandas_series.items())
    # ``zip`` stops at the shorter input, so a missing or extra item would go
    # unnoticed without an explicit length check.
    assert len(modin_items) == len(pandas_items)
    for modin_item, pandas_item in zip(modin_items, pandas_items):
        modin_index, modin_scalar = modin_item
        pandas_index, pandas_scalar = pandas_item
        df_equals(modin_scalar, pandas_scalar)
        assert pandas_index == modin_index


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_keys(data):
    """``keys`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.keys(), pandas_series.keys())


def test_kurtosis_alias(): # It's optimization. If failed, Series.kurt should be tested explicitly # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`.
_assert_casting_functions_wrap_same_implementation( pd.Series.kurt, pd.Series.kurtosis )


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [False, True])
def test_kurtosis(axis, skipna):
    """Compare ``kurtosis`` with pandas; axis 1 is expected to raise ``ValueError``."""
    expected_exception = None
    if axis:
        expected_exception = ValueError("No axis named 1 for object type Series")
    eval_general(
        *create_test_series(test_data["float_nan_data"]),
        lambda df: df.kurtosis(axis=axis, skipna=skipna),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("numeric_only", [False, True])
def test_kurtosis_numeric_only(axis, numeric_only):
    """Compare ``kurtosis(numeric_only=...)`` with pandas for string axis names."""
    expected_exception = None
    # Both axis values are non-empty strings, so a plain truthiness test would
    # declare the "columns" error for "rows" as well; compare explicitly.
    if axis == "columns":
        expected_exception = ValueError("No axis named columns for object type Series")
    eval_general(
        *create_test_series(test_data_diff_dtype),
        lambda df: df.kurtosis(axis=axis, numeric_only=numeric_only),
        expected_exception=expected_exception,
    )


def test_last():
    """``last`` matches pandas and emits the deprecation ``FutureWarning``."""
    modin_index = pd.date_range("2010-04-09", periods=400, freq="2D")
    pandas_index = pandas.date_range("2010-04-09", periods=400, freq="2D")
    modin_series = pd.Series(list(range(400)), index=modin_index)
    pandas_series = pandas.Series(list(range(400)), index=pandas_index)
    with pytest.warns(FutureWarning, match="last is deprecated and will be removed"):
        modin_result = modin_series.last("3D")
    df_equals(modin_result, pandas_series.last("3D"))
    df_equals(modin_series.last("20D"), pandas_series.last("20D"))


@pytest.mark.parametrize("func", ["all", "any", "count"]) def test_index_order(func): # see #1708 and #1869 for details s_modin, s_pandas = create_test_series(test_data["float_nan_data"]) rows_number = len(s_modin.index) level_0 = np.random.choice([x for x in range(10)], rows_number) level_1 = np.random.choice([x for x in range(10)], rows_number) index = pandas.MultiIndex.from_arrays([level_0, level_1]) s_modin.index = index s_pandas.index = index # The result of the operation is not a Series, `.index` is missed df_equals( getattr(s_modin, func)(),
getattr(s_pandas, func)(), )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_last_valid_index(data):
    """``last_valid_index`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    md_label = md_ser.last_valid_index()
    pd_label = pd_ser.last_valid_index()
    assert md_label == pd_label


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_le(data):
    """Run the shared binary-operation helper for ``le``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "le")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc(data):
    """``loc`` matches pandas for labels, open slices, boolean lists and MultiIndex keys."""
    md_ser, pd_ser = create_test_series(data)
    for label in md_ser.index:
        df_equals(md_ser.loc[label], pd_ser.loc[label])
        df_equals(md_ser.loc[label:], pd_ser.loc[label:])

    # Boolean list selecting every third position
    every_third = [pos % 3 == 0 for pos in range(len(md_ser.index))]
    df_equals(md_ser.loc[every_third], pd_ser.loc[every_third])

    # From issue #1988
    multi_index = pd.MultiIndex.from_product(
        [np.arange(10), np.arange(10)], names=["f", "s"]
    )
    values = np.arange(100)
    md_ser = pd.Series(values, index=multi_index).sort_index()
    pd_ser = pandas.Series(values, index=multi_index).sort_index()
    md_selected = md_ser.loc[
        (slice(None), 1),
    ]  # fmt: skip
    pd_selected = pd_ser.loc[
        (slice(None), 1),
    ]  # fmt: skip
    df_equals(md_selected, pd_selected)


def test_loc_with_boolean_series():
    """``loc`` with a boolean Series mask matches pandas."""
    md_ser, pd_ser = create_test_series([1, 2, 3])
    md_mask, pd_mask = create_test_series([True, False, False])
    df_equals(md_ser.loc[md_mask], pd_ser.loc[pd_mask])


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_categorical_series(): modin_series = pd.Series(["a", "b", "c"], dtype="category") pandas_series = pandas.Series(["a", "b", "c"], dtype="category") modin_series.loc[1:3] = "a" pandas_series.loc[1:3] = "a" df_equals(modin_series,
pandas_series)


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_iloc_assigning_scalar_none_to_string_series():
    """Assigning ``None`` through ``iloc`` on a string series matches pandas."""
    values = ["A"]
    md_ser, pd_ser = create_test_series(values, dtype="string")
    md_ser.iloc[0] = None
    pd_ser.iloc[0] = None
    df_equals(md_ser, pd_ser)


def test_set_ordered_categorical_column():
    """Assigning an ordered ``Categorical`` to a column matches pandas."""
    frame_data = {"a": [1, 2, 3], "b": [4, 5, 6]}
    md_frame = pd.DataFrame(frame_data)
    pd_frame = pandas.DataFrame(frame_data)
    md_frame["a"] = pd.Categorical(md_frame["a"], ordered=True)
    pd_frame["a"] = pandas.Categorical(pd_frame["a"], ordered=True)
    df_equals(md_frame, pd_frame)

    # The column dtypes are compared directly as well.
    md_dtype = md_frame["a"].dtype
    pd_dtype = pd_frame["a"].dtype
    assert md_dtype == pd_dtype


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_lt(data):
    """Run the shared binary-operation helper for ``lt``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "lt")


@pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_map(data, na_values):
    """``map`` matches pandas for a callable, a dict, and list-valued results."""
    md_ser, pd_ser = create_test_series(data)

    # Callable mapper
    md_mapped = md_ser.map(str, na_action=na_values)
    pd_mapped = pd_ser.map(str, na_action=na_values)
    df_equals(md_mapped, pd_mapped)

    # Dict mapper
    lookup = {num: str(num) for num in range(100)}
    md_mapped = md_ser.map(lookup, na_action=na_values)
    pd_mapped = pd_ser.map(lookup, na_action=na_values)
    df_equals(
        md_mapped,
        pd_mapped,
        # https://github.com/modin-project/modin/issues/5967
        check_dtypes=False,
    )

    # Return list objects
    md_lists = md_ser.map(lambda item: [item, item, item])
    pd_lists = pd_ser.map(lambda item: [item, item, item])
    df_equals(md_lists, pd_lists)

    # Index into list objects
    md_heads = md_lists.map(lambda triple: triple[0])
    pd_heads = pd_lists.map(lambda triple: triple[0])
    df_equals(md_heads, pd_heads)


def test_mask(): modin_series = pd.Series(np.arange(10)) m = modin_series % 3 == 0 with warns_that_defaulting_to_pandas_if(not df_or_series_using_native_execution(m)): try: modin_series.mask(~m,
-modin_series) except ValueError: pass


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_max(data, skipna):
    """Compare ``max(skipna=...)`` with pandas."""
    md_ser, pd_ser = create_test_series(data)
    eval_general(md_ser, pd_ser, lambda ser: ser.max(skipna=skipna))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_mean(data, skipna):
    """Compare ``mean(skipna=...)`` with pandas."""
    md_ser, pd_ser = create_test_series(data)
    eval_general(md_ser, pd_ser, lambda ser: ser.mean(skipna=skipna))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_median(data, skipna):
    """Compare ``median(skipna=...)`` with pandas."""
    md_ser, pd_ser = create_test_series(data)
    eval_general(md_ser, pd_ser, lambda ser: ser.median(skipna=skipna))


@pytest.mark.parametrize(
    "method", ["median", "skew", "std", "sum", "var", "prod", "sem"]
)
def test_median_skew_std_sum_var_prod_sem_1953(method):
    """Compare several reductions with pandas on a two-level index."""
    # See #1953 for details
    values = [3, 3, 3, 3, 3, 3, 3, 3, 3]
    index_levels = [
        ["1", "1", "1", "2", "2", "2", "3", "3", "3"],
        ["1", "2", "3", "4", "5", "6", "7", "8", "9"],
    ]
    md_ser = pd.Series(values, index=index_levels)
    pd_ser = pandas.Series(values, index=index_levels)
    eval_general(md_ser, pd_ser, lambda ser: getattr(ser, method)())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("index", [True, False], ids=["True", "False"])
def test_memory_usage(data, index):
    """``memory_usage(index=...)`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    md_bytes = md_ser.memory_usage(index=index)
    pd_bytes = pd_ser.memory_usage(index=index)
    df_equals(md_bytes, pd_bytes)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
def test_min(data, skipna):
    """Compare ``min(skipna=...)`` with pandas."""
    md_ser, pd_ser = create_test_series(data)
    eval_general(md_ser, pd_ser, lambda ser: ser.min(skipna=skipna))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_mod(data):
    """Run the shared binary-operation helper for ``mod``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "mod")


@pytest.mark.parametrize("data", test_data_values,
ids=test_data_keys) def test_mode(data): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.mode(), pandas_series.mode())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_mul(data):
    """Run the shared binary-operation helper for ``mul``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "mul")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_multiply(data):
    """Run the shared binary-operation helper for ``multiply``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "multiply")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_name(data):
    """Reading and assigning ``name`` matches pandas and updates the query compiler."""
    md_ser, pd_ser = create_test_series(data)
    assert md_ser.name == pd_ser.name

    renamed = "New_name"
    md_ser.name = renamed
    pd_ser.name = renamed
    assert md_ser.name == pd_ser.name
    assert md_ser._query_compiler.columns == ["New_name"]


def test_tuple_name(): names = [("a", 1), ("a", "b", "c"), "flat"] s = pd.Series(name=names[0]) # The internal representation of the Series stores the name as a column label. # When it is a tuple, this label is a MultiIndex object, and this test ensures that # the Series's name property remains a tuple. assert s.name == names[0] assert isinstance(s.name, tuple) # Setting the name to a tuple of a different level or a non-tuple should not error.
s.name = names[1] assert s.name == names[1] assert isinstance(s.name, tuple) s.name = names[2] assert s.name == names[2] assert isinstance(s.name, str)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_nbytes(data):
    """``nbytes`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    assert md_ser.nbytes == pd_ser.nbytes


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ndim(data):
    """A modin Series reports ``ndim == 1``."""
    md_ser, _ = create_test_series(data)
    assert md_ser.ndim == 1


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_ne(data):
    """Run the shared binary-operation helper for ``ne``."""
    md_ser, pd_ser = create_test_series(data)
    inter_df_math_helper(md_ser, pd_ser, "ne")


@pytest.mark.xfail(reason="Using pandas Series.")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_nlargest(data):
    """Expected-failure test calling ``nlargest(None)``."""
    # NOTE(review): the helper result is not unpacked here, unlike in the sibling
    # tests -- confirm whether that is intentional for this xfail test.
    created = create_test_series(data)
    with pytest.raises(NotImplementedError):
        created.nlargest(None)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_notnull(data):
    """``notnull`` matches pandas."""
    md_ser, pd_ser = create_test_series(data)
    df_equals(md_ser.notnull(), pd_ser.notnull())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_nsmallest(data):
    """``nsmallest`` matches pandas for several ``n`` / ``keep`` combinations."""
    md_ser, pd_ser = create_test_series(data)

    md_out = md_ser.nsmallest(n=5, keep="first")
    pd_out = pd_ser.nsmallest(n=5, keep="first")
    df_equals(md_out, pd_out)

    md_out = md_ser.nsmallest(n=10, keep="first")
    pd_out = pd_ser.nsmallest(n=10, keep="first")
    df_equals(md_out, pd_out)

    md_out = md_ser.nsmallest(n=10, keep="last")
    pd_out = pd_ser.nsmallest(n=10, keep="last")
    df_equals(md_out, pd_out)

    md_out = md_ser.nsmallest(keep="all")
    pd_out = pd_ser.nsmallest(keep="all")
    df_equals(md_out, pd_out)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("dropna", [True, False], ids=["True", "False"]) def test_nunique(data, dropna): modin_series, pandas_series = create_test_series(data)
df_equals(modin_series.nunique(dropna=dropna), pandas_series.nunique(dropna=dropna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pct_change(data): modin_series, pandas_series = create_test_series(data) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_series) ): modin_series.pct_change() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pipe(data): modin_series, pandas_series = create_test_series(data) n = len(modin_series.index) a, b, c = 2 % n, 0, 3 % n def h(x): return x.dropna() def g(x, arg1=0): for _ in range(arg1): x = (pd if isinstance(x, pd.Series) else pandas).concat((x, x)) return x def f(x, arg2=0, arg3=0): return x.drop(x.index[[arg2, arg3]]) df_equals( f(g(h(modin_series), arg1=a), arg2=b, arg3=c), (modin_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), ) df_equals( (modin_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), (pandas_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_plot(request, data): modin_series, pandas_series = create_test_series(data) if name_contains(request.node.name, numeric_dfs): # We have to test this way because equality in plots means same object. 
zipped_plot_lines = zip(modin_series.plot().lines, pandas_series.plot().lines) for left, right in zipped_plot_lines: if isinstance(left.get_xdata(), np.ma.core.MaskedArray) and isinstance( right.get_xdata(), np.ma.core.MaskedArray ): assert all((left.get_xdata() == right.get_xdata()).data) else: assert np.array_equal(left.get_xdata(), right.get_xdata()) if isinstance(left.get_ydata(), np.ma.core.MaskedArray) and isinstance( right.get_ydata(), np.ma.core.MaskedArray ): assert all((left.get_ydata() == right.get_ydata()).data) else: assert np.array_equal(left.get_xdata(), right.get_xdata()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pop(data): modin_series, pandas_series = create_test_series(data) for key in modin_series.keys(): df_equals(modin_series.pop(key), pandas_series.pop(key)) df_equals(modin_series, pandas_series) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pow(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "pow") def test_product_alias(): _assert_casting_functions_wrap_same_implementation( pd.Series.prod, pd.Series.product ) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [False, True]) def test_prod(axis, skipna): expected_exception = None if axis: expected_exception = ValueError("No axis named 1 for object type Series") eval_general( *create_test_series(test_data["float_nan_data"]), lambda s: s.prod(axis=axis, skipna=skipna), expected_exception=expected_exception, ) @pytest.mark.parametrize("numeric_only", [False, True]) @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) def test_prod_specific(min_count, numeric_only): eval_general( *create_test_series(test_data_diff_dtype), lambda df: df.prod(min_count=min_count, numeric_only=numeric_only), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("q", 
quantiles_values, ids=quantiles_keys) def test_quantile(request, data, q): modin_series, pandas_series = create_test_series(data) if not name_contains(request.node.name, no_numeric_dfs): df_equals(modin_series.quantile(q), pandas_series.quantile(q)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_radd(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "radd") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"] ) def test_rank(data, na_option): modin_series, pandas_series = create_test_series(data) try: pandas_result = pandas_series.rank(na_option=na_option) except Exception as err: with pytest.raises(type(err)): modin_series.rank(na_option=na_option) else: modin_result = modin_series.rank(na_option=na_option) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("order", [None, "C", "F", "A", "K"]) def test_ravel(data, order): modin_series, pandas_series = create_test_series(data) np.testing.assert_equal( modin_series.ravel(order=order), pandas_series.ravel(order=order) ) @pytest.mark.parametrize( "data", [ pandas.Categorical(np.arange(1000), ordered=True), pandas.Categorical(np.arange(1000), ordered=False), pandas.Categorical(np.arange(1000), categories=np.arange(500), ordered=True), pandas.Categorical(np.arange(1000), categories=np.arange(500), ordered=False), ], ) @pytest.mark.parametrize("order", [None, "C", "F", "A", "K"]) def test_ravel_category(data, order): modin_series, pandas_series = create_test_series(data) categories_equals(modin_series.ravel(order=order), pandas_series.ravel(order=order)) @pytest.mark.parametrize( "data", [ pandas.Categorical(np.arange(10), ordered=True), pandas.Categorical(np.arange(10), ordered=False), pandas.Categorical(np.arange(10), 
categories=np.arange(5), ordered=True),
        pandas.Categorical(np.arange(10), categories=np.arange(5), ordered=False),
    ],
)
@pytest.mark.parametrize("order", [None, "C", "F", "A", "K"])
def test_ravel_simple_category(data, order):
    """Compare ``Series.ravel`` on 10-element categorical data."""
    modin_series, pandas_series = create_test_series(data)
    categories_equals(modin_series.ravel(order=order), pandas_series.ravel(order=order))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rdiv(data):
    """Run the shared binary-operation helper for ``rdiv``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "rdiv")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_reindex(data):
    """Compare ``Series.reindex`` with a new label, on frame columns, and on a MultiIndex."""
    modin_series, pandas_series = create_test_series(data)
    # Reindex with one label that is not in the existing index.
    pandas_result = pandas_series.reindex(
        list(pandas_series.index) + ["_A_NEW_ROW"], fill_value=0
    )
    modin_result = modin_series.reindex(
        list(modin_series.index) + ["_A_NEW_ROW"], fill_value=0
    )
    df_equals(pandas_result, modin_result)

    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [0, 0, 0, 0],
    }
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    # Series taken from DataFrame columns: reorder, and request missing labels.
    for col in pandas_df.columns:
        modin_series = modin_df[col]
        pandas_series = pandas_df[col]
        df_equals(
            modin_series.reindex([0, 3, 2, 1]), pandas_series.reindex([0, 3, 2, 1])
        )
        df_equals(modin_series.reindex([0, 6, 2]), pandas_series.reindex([0, 6, 2]))
        df_equals(
            modin_series.reindex(index=[0, 1, 5]),
            pandas_series.reindex(index=[0, 1, 5]),
        )

    # MultiIndex
    modin_series, pandas_series = create_test_series(data)
    modin_series.index, pandas_series.index = [
        generate_multiindex(len(pandas_series))
    ] * 2
    pandas_result = pandas_series.reindex(list(reversed(pandas_series.index)))
    modin_result = modin_series.reindex(list(reversed(modin_series.index)))
    df_equals(pandas_result, modin_result)


def test_reindex_like():
    """Compare ``Series.reindex_like`` for a column reindexed to a longer date index."""
    o_data = [
        [24.3, 75.7, "high"],
        [31, 87.8, "high"],
        [22, 71.6, "medium"],
        [35, 95, "medium"],
    ]
    o_columns = ["temp_celsius", "temp_fahrenheit", "windspeed"]
    o_index = pd.date_range(start="2014-02-12", end="2014-02-15", freq="D")
    new_data = [[28, "low"], [30, "low"], [35.1, "medium"]]
    new_columns = ["temp_celsius", "windspeed"]
    new_index = pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"])
    modin_df1 = pd.DataFrame(o_data, columns=o_columns, index=o_index)
    modin_df2 = pd.DataFrame(new_data, columns=new_columns, index=new_index)
    modin_result = modin_df2["windspeed"].reindex_like(modin_df1["windspeed"])

    pandas_df1 = pandas.DataFrame(o_data, columns=o_columns, index=o_index)
    pandas_df2 = pandas.DataFrame(new_data, columns=new_columns, index=new_index)
    pandas_result = pandas_df2["windspeed"].reindex_like(pandas_df1["windspeed"])
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rename(data):
    """Compare ``Series.rename`` with a scalar name, in place, and with a callable."""
    modin_series, pandas_series = create_test_series(data)
    new_name = "NEW_NAME"
    df_equals(modin_series.rename(new_name), pandas_series.rename(new_name))

    modin_series_cp = modin_series.copy()
    pandas_series_cp = pandas_series.copy()
    modin_series_cp.rename(new_name, inplace=True)
    pandas_series_cp.rename(new_name, inplace=True)
    df_equals(modin_series_cp, pandas_series_cp)

    # A callable is passed instead of a scalar name.
    modin_result = modin_series.rename("{}__".format)
    pandas_result = pandas_series.rename("{}__".format)
    df_equals(modin_result, pandas_result)


def test_reorder_levels():
    """Compare ``Series.reorder_levels`` on a three-level MultiIndex."""
    data = np.random.randint(1, 100, 12)
    modin_series = pd.Series(
        data,
        index=pd.MultiIndex.from_tuples(
            [
                (num, letter, color)
                for num in range(1, 3)
                for letter in ["a", "b", "c"]
                for color in ["Red", "Green"]
            ],
            names=["Number", "Letter", "Color"],
        ),
    )
    pandas_series = pandas.Series(
        data,
        index=pandas.MultiIndex.from_tuples(
            [
                (num, letter, color)
                for num in range(1, 3)
                for letter in ["a", "b", "c"]
                for color in ["Red", "Green"]
            ],
            names=["Number", "Letter", "Color"],
        ),
    )
    modin_result = modin_series.reorder_levels(["Letter", "Color", "Number"])
    pandas_result = pandas_series.reorder_levels(["Letter", "Color", "Number"])
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "repeats", [0, 2, 3, 4], ids=["repeats_{}".format(i) for i in [0, 2, 3, 4]]
)
def test_repeat(data, repeats):
    """Compare ``Series.repeat`` for scalar repeat counts."""
    eval_general(pd.Series(data), pandas.Series(data), lambda df: df.repeat(repeats))


@pytest.mark.parametrize("data", [np.arange(256)])
@pytest.mark.parametrize(
    "repeats",
    [
        0,
        2,
        [2],
        np.arange(256),
        [0] * 64 + [2] * 64 + [3] * 32 + [4] * 32 + [5] * 64,
        [2] * 257,
    ],
    ids=["0_case", "scalar", "one-elem-list", "array", "list", "wrong_list"],
)
def test_repeat_lists(data, repeats, request):
    """Compare ``Series.repeat`` for scalar, list and array repeat counts.

    The ``wrong_list`` case has 257 counts for 256 values and expects a
    ``ValueError``.
    """
    expected_exception = None
    if "wrong_list" in request.node.callspec.id:
        expected_exception = ValueError(
            "operands could not be broadcast together with shape (256,) (257,)"
        )
    eval_general(
        *create_test_series(data),
        lambda df: df.repeat(repeats),
        expected_exception=expected_exception,
    )


def test_clip_4485():
    """Compare ``Series.clip`` when the lower bound is given as a list."""
    modin_result = pd.Series([1]).clip([3])
    pandas_result = pandas.Series([1]).clip([3])
    df_equals(modin_result, pandas_result)


def test_replace():
    """Compare ``Series.replace`` with a scalar value and with ``method="bfill"``."""
    modin_series = pd.Series([0, 1, 2, 3, 4])
    pandas_series = pandas.Series([0, 1, 2, 3, 4])
    modin_result = modin_series.replace(0, 5)
    pandas_result = pandas_series.replace(0, 5)
    df_equals(modin_result, pandas_result)

    modin_result = modin_series.replace([1, 2], method="bfill")
    pandas_result = pandas_series.replace([1, 2], method="bfill")
    df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("closed", ["left", "right"])
@pytest.mark.parametrize("label", ["right", "left"])
@pytest.mark.parametrize("level", [None, 1])
@pytest.mark.exclude_in_sanity
def test_resample(closed, label, level):
    """Compare the methods of ``Series.resample`` objects between Modin and pandas.

    When ``level`` is not None, both series get a MultiIndex whose second
    level holds the dates, and the upsampling methods are skipped.
    """
    rule = "5min"
    freq = "h"
    index = pandas.date_range("1/1/2000", periods=12, freq=freq)
    pandas_series = pandas.Series(range(12), index=index)
    modin_series = pd.Series(range(12), index=index)

    if level is not None:
        index = pandas.MultiIndex.from_product(
            [["a", "b", "c"], pandas.date_range("31/12/2000", periods=4, freq=freq)]
        )
        pandas_series.index = index
        modin_series.index = index
    pandas_resampler = pandas_series.resample(
        rule, closed=closed, label=label, level=level
    )
    modin_resampler = modin_series.resample(
        rule, closed=closed, label=label, level=level
    )

    df_equals(modin_resampler.count(), pandas_resampler.count())
    df_equals(modin_resampler.var(0), pandas_resampler.var(0))
    df_equals(modin_resampler.sum(), pandas_resampler.sum())
    df_equals(modin_resampler.std(), pandas_resampler.std())
    df_equals(modin_resampler.sem(), pandas_resampler.sem())
    df_equals(modin_resampler.size(), pandas_resampler.size())
    df_equals(modin_resampler.prod(), pandas_resampler.prod())
    df_equals(modin_resampler.ohlc(), pandas_resampler.ohlc())
    df_equals(modin_resampler.min(), pandas_resampler.min())
    df_equals(modin_resampler.median(), pandas_resampler.median())
    df_equals(modin_resampler.mean(), pandas_resampler.mean())
    df_equals(modin_resampler.max(), pandas_resampler.max())
    df_equals(modin_resampler.last(), pandas_resampler.last())
    df_equals(modin_resampler.first(), pandas_resampler.first())
    df_equals(modin_resampler.nunique(), pandas_resampler.nunique())
    df_equals(
        modin_resampler.pipe(lambda x: x.max() - x.min()),
        pandas_resampler.pipe(lambda x: x.max() - x.min()),
    )
    df_equals(
        modin_resampler.transform(lambda x: (x - x.mean()) / x.std()),
        pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()),
    )
    df_equals(
        modin_resampler.aggregate("max"),
        pandas_resampler.aggregate("max"),
    )
    df_equals(
        modin_resampler.apply("sum"),
        pandas_resampler.apply("sum"),
    )
    df_equals(
        modin_resampler.get_group(name=list(modin_resampler.groups)[0]),
        pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]),
    )
    assert pandas_resampler.indices == modin_resampler.indices
    assert pandas_resampler.groups == modin_resampler.groups
    df_equals(modin_resampler.quantile(), pandas_resampler.quantile())
    # Upsampling from level= or on= selection is not supported
    if level is None:
        df_equals(
            modin_resampler.interpolate(),
            pandas_resampler.interpolate(),
        )
        df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq())
        df_equals(
            modin_resampler.fillna(method="nearest"),
            pandas_resampler.fillna(method="nearest"),
        )
        df_equals(modin_resampler.nearest(), pandas_resampler.nearest())
        df_equals(modin_resampler.bfill(), pandas_resampler.bfill())
        df_equals(modin_resampler.ffill(), pandas_resampler.ffill())
    df_equals(
        modin_resampler.apply(["sum", "mean", "max"]),
        pandas_resampler.apply(["sum", "mean", "max"]),
    )
    df_equals(
        modin_resampler.aggregate(["sum", "mean", "max"]),
        pandas_resampler.aggregate(["sum", "mean", "max"]),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("drop", [True, False], ids=["True", "False"])
@pytest.mark.parametrize("name", [lib.no_default, "Custom name"])
@pytest.mark.parametrize("inplace", [True, False])
def test_reset_index(data, drop, name, inplace):
    """Compare ``Series.reset_index``; ``inplace`` without ``drop`` expects ``TypeError``."""
    expected_exception = None
    if inplace and not drop:
        expected_exception = TypeError(
            "Cannot reset_index inplace on a Series to create a DataFrame"
        )
    eval_general(
        *create_test_series(data),
        lambda df, *args, **kwargs: df.reset_index(*args, **kwargs),
        drop=drop,
        name=name,
        inplace=inplace,
        __inplace__=inplace,
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rfloordiv(data):
    """Run the shared binary-operation helper for ``rfloordiv``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "rfloordiv")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rmod(data):
    """Run the shared binary-operation helper for ``rmod``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "rmod")


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_rmul(data):
    """Run the shared binary-operation helper for ``rmul``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "rmul")


@pytest.mark.parametrize("data",
test_data_values, ids=test_data_keys) def test_round(data): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.round(), pandas_series.round()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_rpow(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "rpow") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_rsub(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "rsub") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_rtruediv(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "rtruediv") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sample(data): modin_series, pandas_series = create_test_series(data) try: pandas_result = pandas_series.sample(frac=0.5, random_state=21019) except Exception as err: with pytest.raises(type(err)): modin_series.sample(frac=0.5, random_state=21019) else: modin_result = modin_series.sample(frac=0.5, random_state=21019) df_equals(pandas_result, modin_result) try: pandas_result = pandas_series.sample(n=12, random_state=21019) except Exception as err: with pytest.raises(type(err)): modin_series.sample(n=12, random_state=21019) else: modin_result = modin_series.sample(n=12, random_state=21019) df_equals(pandas_result, modin_result) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_series) ): df_equals( modin_series.sample(n=0, random_state=21019), pandas_series.sample(n=0, random_state=21019), ) with pytest.raises(ValueError): modin_series.sample(n=-3) @pytest.mark.parametrize("single_value_data", [True, False]) @pytest.mark.parametrize("use_multiindex", [True, False]) @pytest.mark.parametrize("sorter", [True, None]) @pytest.mark.parametrize("values_number", [1, 2, 5]) 
@pytest.mark.parametrize("side", ["left", "right"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.exclude_in_sanity
def test_searchsorted(
    data, side, values_number, sorter, use_multiindex, single_value_data
):
    """Compare ``Series.searchsorted`` for several probe values.

    Without ``sorter`` the series is created pre-sorted; otherwise an argsort
    of the unsorted values is passed as ``sorter``.
    """
    # For the single-value case, use the first element of the first column.
    data = data if not single_value_data else data[next(iter(data.keys()))][0]
    if not sorter:
        modin_series, pandas_series = create_test_series(vals=data, sort=True)
    else:
        modin_series, pandas_series = create_test_series(vals=data)
        sorter = np.argsort(list(modin_series))

    if use_multiindex:
        rows_number = len(modin_series.index)
        level_0_series = random_state.choice([0, 1], rows_number)
        level_1_series = random_state.choice([2, 3], rows_number)
        index_series = pd.MultiIndex.from_arrays(
            [level_0_series, level_1_series], names=["first", "second"]
        )
        modin_series.index = index_series
        pandas_series.index = index_series

    min_sample = modin_series.min(skipna=True)
    max_sample = modin_series.max(skipna=True)

    if single_value_data:
        values = [data]
    else:
        values = []
        # Values sampled from the series itself.
        values.append(pandas_series.sample(n=values_number, random_state=random_state))
        # Values drawn between the minimum and the maximum.
        values.append(
            random_state.uniform(low=min_sample, high=max_sample, size=values_number)
        )
        # Values drawn from [max_sample, 2 * max_sample).
        values.append(
            random_state.uniform(
                low=max_sample, high=2 * max_sample, size=values_number
            )
        )
        # Values drawn from [min_sample - max_sample, min_sample).
        values.append(
            random_state.uniform(
                low=min_sample - max_sample, high=min_sample, size=values_number
            )
        )
        # Scalar probes: one float and its integer truncation.
        pure_float = random_state.uniform(float(min_sample), float(max_sample))
        pure_int = int(pure_float)
        values.append(pure_float)
        values.append(pure_int)

    test_cases = [
        modin_series.searchsorted(value=value, side=side, sorter=sorter)
        == pandas_series.searchsorted(value=value, side=side, sorter=sorter)
        for value in values
    ]
    # Reduce non-bool comparison results to a single truth value.
    test_cases = [
        case.all() if not isinstance(case, bool) else case for case in test_cases
    ]

    for case in test_cases:
        assert case


@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna,
ddof): eval_general( *create_test_series(test_data["float_nan_data"]), lambda df: df.sem(skipna=skipna, ddof=ddof), ) @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) def test_sem_int_only(ddof): eval_general( *create_test_series(test_data["int_data"]), lambda df: df.sem(ddof=ddof), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_set_axis(data): modin_series, _ = create_test_series(data) # noqa: F841 modin_series.set_axis(labels=["{}_{}".format(i, i + 1) for i in modin_series.index]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_shape(data): modin_series, pandas_series = create_test_series(data) assert modin_series.shape == pandas_series.shape @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_size(data): modin_series, pandas_series = create_test_series(data) assert modin_series.size == pandas_series.size @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [False, True]) def test_skew(data, skipna): eval_general(*create_test_series(data), lambda df: df.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray", "has_duplicates"]) @pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) @pytest.mark.parametrize("name", [None, "foo"]) def test_shift(data, index, periods, name): modin_series, pandas_series = create_test_series(data, name=name) if index == "ndarray": data_column_length = len(data[next(iter(data))]) modin_series.index = pandas_series.index = np.arange(2, data_column_length + 2) elif index == "has_duplicates": modin_series.index = pandas_series.index = list(modin_series.index[:-3]) + [ 0, 1, 2, ] df_equals( modin_series.shift(periods=periods), pandas_series.shift(periods=periods), ) df_equals( modin_series.shift(periods=periods, fill_value=777), 
pandas_series.shift(periods=periods, fill_value=777), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("ascending", [False, True]) @pytest.mark.parametrize( "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) ) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) def test_sort_index(data, ascending, sort_remaining, na_position): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda df: df.sort_index( ascending=ascending, sort_remaining=sort_remaining, na_position=na_position, ), ) eval_general( modin_series.copy(), pandas_series.copy(), lambda df: df.sort_index( ascending=ascending, sort_remaining=sort_remaining, na_position=na_position, inplace=True, ), __inplace__=True, ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) def test_sort_values(data, ascending, na_position): modin_series, pandas_series = create_test_series(data) modin_result = modin_series.sort_values( ascending=ascending, na_position=na_position ) pandas_result = pandas_series.sort_values( ascending=ascending, na_position=na_position ) # Note: For `ascending=False` only # For some reason, the indexing of Series and DataFrame differ in the underlying # algorithm. The order of values is the same, but the index values are shuffled. # Since we use `DataFrame.sort_values` even for Series, the index can be different # between `pandas.Series.sort_values`. For this reason, we check that the values are # identical instead of the index as well. 
if ascending: df_equals_with_non_stable_indices(modin_result, pandas_result) else: np.testing.assert_equal(modin_result.values, pandas_result.values) modin_series_cp = modin_series.copy() pandas_series_cp = pandas_series.copy() modin_series_cp.sort_values( ascending=ascending, na_position=na_position, inplace=True ) pandas_series_cp.sort_values( ascending=ascending, na_position=na_position, inplace=True ) # See above about `ascending=False` if ascending: df_equals_with_non_stable_indices(modin_result, pandas_result) else: np.testing.assert_equal(modin_series_cp.values, pandas_series_cp.values) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_squeeze(data): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.squeeze(None), pandas_series.squeeze(None)) df_equals(modin_series.squeeze(0), pandas_series.squeeze(0)) with pytest.raises(ValueError): modin_series.squeeze(1) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) def test_std(request, data, skipna, ddof): modin_series, pandas_series = create_test_series(data) try: pandas_result = pandas_series.std(skipna=skipna, ddof=ddof) except Exception as err: with pytest.raises(type(err)): modin_series.std(skipna=skipna, ddof=ddof) else: modin_result = modin_series.std(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sub(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "sub") def test_6782(): datetime_scalar = datetime.datetime(1970, 1, 1, 0, 0) match = "Adding/subtracting object-dtype array to DatetimeArray not vectorized" with warnings.catch_warnings(): warnings.filterwarnings("error", match, PerformanceWarning) pd.Series([datetime.datetime(2000, 1, 1)]) - 
datetime_scalar @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_subtract(data): modin_series, pandas_series = create_test_series(data) inter_df_math_helper(modin_series, pandas_series, "subtract") @pytest.mark.parametrize( "data", test_data_values + test_data_small_values, ids=test_data_keys + test_data_small_keys, ) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("numeric_only", [False, True]) @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) @pytest.mark.exclude_in_sanity def test_sum(data, skipna, numeric_only, min_count): eval_general( *create_test_series(data), lambda df, *args, **kwargs: df.sum(*args, **kwargs), skipna=skipna, numeric_only=numeric_only, min_count=min_count, ) @pytest.mark.parametrize("operation", ["sum", "shift"]) def test_sum_axis_1_except(operation): eval_general( *create_test_series(test_data["int_data"]), lambda df, *args, **kwargs: getattr(df, operation)(*args, **kwargs), axis=1, expected_exception=ValueError("No axis named 1 for object type Series"), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1, "columns", "index"]) @pytest.mark.parametrize("axis2", [0, 1, "columns", "index"]) def test_swapaxes(data, axis1, axis2): modin_series, pandas_series = create_test_series(data) try: pandas_result = pandas_series.swapaxes(axis1, axis2) except Exception as err: with pytest.raises(type(err)): modin_series.swapaxes(axis1, axis2) else: modin_result = modin_series.swapaxes(axis1, axis2) df_equals(modin_result, pandas_result) def test_swaplevel(): data = np.random.randint(1, 100, 12) modin_s = pd.Series( data, index=pd.MultiIndex.from_tuples( [ (num, letter, color) for num in range(1, 3) for letter in ["a", "b", "c"] for color in ["Red", "Green"] ], names=["Number", "Letter", "Color"], ), ) pandas_s = pandas.Series( data, index=pandas.MultiIndex.from_tuples( [ (num, letter, color) 
for num in range(1, 3)
                for letter in ["a", "b", "c"]
                for color in ["Red", "Green"]
            ],
            names=["Number", "Letter", "Color"],
        ),
    )
    df_equals(
        modin_s.swaplevel("Number", "Color"), pandas_s.swaplevel("Number", "Color")
    )
    df_equals(modin_s.swaplevel(), pandas_s.swaplevel())
    df_equals(modin_s.swaplevel(1, 0), pandas_s.swaplevel(1, 0))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys))
def test_tail(data, n):
    """Compare ``Series.tail`` for parametrized ``n`` and for the full length."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.tail(n), pandas_series.tail(n))
    df_equals(
        modin_series.tail(len(modin_series)), pandas_series.tail(len(pandas_series))
    )


def test_take():
    """Compare ``Series.take`` by position and mirror pandas' error for ``axis=1``."""
    modin_s = pd.Series(["falcon", "parrot", "lion", "cat"], index=[0, 2, 3, 1])
    pandas_s = pandas.Series(["falcon", "parrot", "lion", "cat"], index=[0, 2, 3, 1])
    a = modin_s.take([0, 3])
    df_equals(a, pandas_s.take([0, 3]))
    # NOTE(review): if pandas does not raise here, Modin is not checked at all.
    try:
        pandas_s.take([2], axis=1)
    except Exception as err:
        with pytest.raises(type(err)):
            modin_s.take([2], axis=1)


@pytest.mark.parametrize(
    "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys)
)
def test_explode(ignore_index):
    """Compare ``Series.explode`` on data mixing lists, a string and an empty list."""
    # Some items in this test data are lists that explode() should expand.
    data = [[1, 2, 3], "foo", [], [3, 4]]
    modin_series, pandas_series = create_test_series(data)
    df_equals(
        modin_series.explode(ignore_index=ignore_index),
        pandas_series.explode(ignore_index=ignore_index),
    )


def test_to_period():
    """Call ``Series.to_period`` under the default-to-pandas warning check."""
    idx = pd.date_range("1/1/2012", periods=5, freq="M")
    series = pd.Series(np.random.randint(0, 100, size=(len(idx))), index=idx)
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(series)
    ):
        series.to_period()


@pytest.mark.parametrize(
    "data",
    test_data_values + test_data_large_categorical_series_values,
    ids=test_data_keys + test_data_large_categorical_series_keys,
)
def test_to_numpy(data):
    """Check that ``Series.to_numpy`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert_array_equal(modin_series.to_numpy(), pandas_series.to_numpy())


def test_to_numpy_dtype():
    """Check ``Series.to_numpy(dtype="int64")`` with a strict array comparison."""
    modin_series, pandas_series = create_test_series(test_data["float_nan_data"])
    assert_array_equal(
        modin_series.to_numpy(dtype="int64"),
        pandas_series.to_numpy(dtype="int64"),
        strict=True,
    )


@pytest.mark.parametrize(
    "data",
    test_data_values + test_data_large_categorical_series_values,
    ids=test_data_keys + test_data_large_categorical_series_keys,
)
def test_series_values(data):
    """Check that ``Series.values`` matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    assert_array_equal(modin_series.values, pandas_series.values)


def test_series_empty_values():
    """Check that ``values`` of an empty Series matches pandas."""
    modin_series, pandas_series = pd.Series(), pandas.Series()
    assert_array_equal(modin_series.values, pandas_series.values)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_to_string(request, data):
    """Compare the output of ``Series.to_string``."""
    eval_general(
        *create_test_series(data),
        lambda df: df.to_string(),
    )


def test_to_timestamp():
    """Call ``to_period().to_timestamp()`` under the default-to-pandas warning check."""
    idx = pd.date_range("1/1/2012", periods=5, freq="M")
    series = pd.Series(np.random.randint(0, 100, size=(len(idx))), index=idx)
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(series)
    ):
        series.to_period().to_timestamp()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_to_xarray(data):
    """Call ``Series.to_xarray`` under the default-to-pandas warning check."""
    modin_series, _ = create_test_series(data)  # noqa: F841
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(modin_series)
    ):
        modin_series.to_xarray()


def test_to_xarray_mock():
    """Check that ``to_xarray`` calls ``pandas.Series.to_xarray`` once with the data."""
    modin_series = pd.Series([])

    with mock.patch("pandas.Series.to_xarray") as to_xarray:
        modin_series.to_xarray()
    to_xarray.assert_called_once()
    # The only positional argument is the series passed to the patched method.
    assert len(to_xarray.call_args[0]) == 1
    df_equals(modin_series, to_xarray.call_args[0][0])


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_tolist(data):
    """Call ``Series.tolist`` under the default-to-pandas warning check."""
    modin_series, _ = create_test_series(data)  # noqa: F841
    with warns_that_defaulting_to_pandas_if(
        not df_or_series_using_native_execution(modin_series)
    ):
        modin_series.tolist()


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize(
    "func", [lambda x: x + 1, [np.sqrt, np.exp]], ids=["lambda", "list_udfs"]
)
def test_transform(data, func, request):
    """Compare ``Series.transform``; the ``list_udfs`` case is marked xfail."""
    if "list_udfs" in request.node.callspec.id:
        pytest.xfail(reason="https://github.com/modin-project/modin/issues/6998")
    eval_general(
        *create_test_series(data),
        lambda df: df.transform(func),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys)
def test_transform_except(data, func):
    """Check that ``Series.transform`` raises ``ValueError`` for ``agg_func_except_values``."""
    eval_general(
        *create_test_series(data),
        lambda df: df.transform(func),
        expected_exception=ValueError("Function did not transform"),
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_transpose(data):
    """Check that ``Series.transpose`` equals the original series and matches pandas."""
    modin_series, pandas_series = create_test_series(data)
    df_equals(modin_series.transpose(), modin_series)
    df_equals(modin_series.transpose(), pandas_series.transpose())
    df_equals(modin_series.transpose(), pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_truediv(data):
    """Run the shared binary-operation helper for ``truediv``."""
    modin_series, pandas_series = create_test_series(data)
    inter_df_math_helper(modin_series, pandas_series, "truediv")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_truncate(data):
    """Compare ``truncate`` between Modin and pandas for several (before, after) bounds."""
    modin_series, pandas_series = create_test_series(data)

    bounds = [
        # The upper bound is three less than the series length. The previous
        # spelling ``len(modin_series - 3)`` was just the series length.
        (1, len(modin_series) - 3),
        (1, 3),
        (None, None),
    ]
    for before, after in bounds:
        df_equals(
            modin_series.truncate(before, after),
            pandas_series.truncate(before, after),
        )


def test_tz_convert():
    """Compare ``tz_convert("UTC")`` for a tz-aware index and for level 0 of a MultiIndex."""
    modin_idx = pd.date_range(
        "1/1/2012", periods=400, freq="2D", tz="America/Los_Angeles"
    )
    pandas_idx = pandas.date_range(
        "1/1/2012", periods=400, freq="2D", tz="America/Los_Angeles"
    )
    data = np.random.randint(0, 100, size=len(modin_idx))
    modin_series = pd.Series(data, index=modin_idx)
    pandas_series = pandas.Series(data, index=pandas_idx)
    modin_result = modin_series.tz_convert("UTC", axis=0)
    pandas_result = pandas_series.tz_convert("UTC", axis=0)
    df_equals(modin_result, pandas_result)

    positions = range(len(modin_idx))
    modin_multi = pd.MultiIndex.from_arrays([modin_idx, positions])
    pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, positions])
    modin_series = pd.Series(data, index=modin_multi)
    pandas_series = pandas.Series(data, index=pandas_multi)
    df_equals(
        modin_series.tz_convert("UTC", axis=0, level=0),
        pandas_series.tz_convert("UTC", axis=0, level=0),
    )


def test_tz_localize():
    """Compare ``tz_localize`` between Modin and pandas for two time zones."""
    idx = pd.date_range("1/1/2012", periods=400, freq="2D")
    data = np.random.randint(0, 100, size=len(idx))
    modin_series = pd.Series(data, index=idx)
    pandas_series = pandas.Series(data, index=idx)
    for tz in ("America/Los_Angeles", "UTC"):
        df_equals(modin_series.tz_localize(tz), pandas_series.tz_localize(tz))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unique(data):
    """Compare values, shape and type of ``unique`` results for several kinds of series."""

    def comparator(*args):
        return sort_if_range_partitioning(*args, comparator=assert_array_equal)

    def check_unique(modin_result, pandas_result):
        # Values, shape and the concrete result type must all agree.
        comparator(modin_result, pandas_result)
        assert modin_result.shape == pandas_result.shape
        assert type(modin_result) is type(pandas_result)

    modin_series, pandas_series = create_test_series(data)
    check_unique(modin_series.unique(), pandas_series.unique())

    check_unique(
        pd.Series([2, 1, 3, 3], name="A").unique(),
        pandas.Series([2, 1, 3, 3], name="A").unique(),
    )

    check_unique(
        pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique(),
        pandas.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique(),
    )

    check_unique(
        pd.Series(
            [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)]
        ).unique(),
        pandas.Series(
            [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)]
        ).unique(),
    )

    # The Modin result comes from ``pd`` and the pandas result from ``pandas``;
    # the two constructors were previously swapped in this case.
    check_unique(
        pd.Series(pd.Categorical(list("baabc"))).unique(),
        pandas.Series(pd.Categorical(list("baabc"))).unique(),
    )

    check_unique(
        pd.Series(
            pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
        ).unique(),
        pandas.Series(
            pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
        ).unique(),
    )


def test_unique_pyarrow_dtype():
    """Compare ``unique`` on a ``uint8[pyarrow]`` series, including the result type."""
    # See #6227 for details
    modin_series, pandas_series = create_test_series(
        [1, 0, pd.NA], dtype="uint8[pyarrow]"
    )

    def comparator(df1, df2):
        # Perform our own non-strict version of dtypes equality check
        df_equals(df1, df2)
        # to be sure `unique` return `ArrowExtensionArray`
        assert type(df1) is type(df2)

    eval_general(
        modin_series, pandas_series, lambda df: df.unique(), comparator=comparator
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unstack(data):
    """Compare ``unstack`` at several levels on series indexed by a 4-level MultiIndex."""
    _, pandas_series = create_test_series(data)
    index = generate_multiindex(len(pandas_series), nlevels=4, is_tree_like=True)

    first_column = data[next(iter(data.keys()))]
    modin_series = pd.Series(first_column, index=index)
    pandas_series = pandas.Series(first_column, index=index)

    df_equals(modin_series.unstack(), pandas_series.unstack())
    for level in (0, [0, 1], [0, 1, 2]):
        df_equals(
            modin_series.unstack(level=level), pandas_series.unstack(level=level)
        )


def test_unstack_error_no_multiindex():
    """Assert that ``unstack`` raises ``ValueError`` when the index is not a MultiIndex."""
    modin_series = pd.Series([0, 1, 2])
    with pytest.raises(ValueError, match="index must be a MultiIndex to unstack"):
        modin_series.unstack()


@pytest.mark.parametrize(
    "data, other_data",
    [([1, 2, 3], [4, 5, 6]), ([1, 2, 3], [4, 5, 6, 7, 8]), ([1, 2, 3], [4, np.nan, 6])],
)
def test_update(data, other_data):
    """Compare the in-place effect of ``update`` between Modin and pandas."""
    modin_series, pandas_series = pd.Series(data), pandas.Series(data)
    modin_series.update(pd.Series(other_data))
    pandas_series.update(pandas.Series(other_data))
    df_equals(modin_series, pandas_series)


@pytest.mark.parametrize("sort", bool_arg_values, ids=bool_arg_keys)
@pytest.mark.parametrize("normalize", bool_arg_values, ids=bool_arg_keys)
@pytest.mark.parametrize("bins", [3, None])
@pytest.mark.parametrize(
    "dropna",
    [
        pytest.param(None),
        pytest.param(False),
        pytest.param(True),
    ],
)
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.exclude_in_sanity
def test_value_counts(sort, normalize, bins, dropna, ascending):
    """Run ``value_counts`` through ``eval_general`` for all parameter combinations."""

    def sort_sensitive_comparator(df1, df2):
        # We sort indices for Modin and pandas result because of issue #1650
        return (
            df_equals_with_non_stable_indices(df1, df2)
            if sort
            else df_equals(df1.sort_index(), df2.sort_index())
        )

    def count_values(df):
        return df.value_counts(
            sort=sort,
            bins=bins,
            normalize=normalize,
            dropna=dropna,
            ascending=ascending,
        )

    eval_general(
        *create_test_series(test_data_values[0]),
        count_values,
        comparator=sort_sensitive_comparator,
    )

    # from issue #2365
    arr = np.random.rand(2**6)
    arr[::10] = np.nan
    eval_general(
        *create_test_series(arr),
        count_values,
        comparator=sort_sensitive_comparator,
    )


def test_value_counts_categorical():
    """Run ``value_counts`` through ``eval_general`` on a shuffled categorical series."""
    # from issue #3571
    data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
    random_state = np.random.RandomState(seed=42)
    random_state.shuffle(data)

    eval_general(
        *create_test_series(data, dtype="category"),
        lambda df: df.value_counts(),
        comparator=df_equals,
    )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_values(data):
    """Assert that ``.values`` is equal between Modin and pandas (``np.testing.assert_equal``)."""
    modin_series, pandas_series = create_test_series(data)
    np.testing.assert_equal(modin_series.values, pandas_series.values)


def test_values_non_numeric():
    """Compare ``.values`` of category-typed string series."""
    data = ["str{0}".format(i) for i in range(0, 10**3)]
    modin_series, pandas_series = create_test_series(data)

    modin_series = modin_series.astype("category")
    pandas_series = pandas_series.astype("category")

    df_equals(modin_series.values, pandas_series.values)


def test_values_ea():
    """Compare ``.values`` and its dtype for series built from a ``SparseArray``."""
    data = pandas.arrays.SparseArray(np.arange(10, dtype="int64"))
    modin_series, pandas_series = create_test_series(data)
    modin_values = modin_series.values
    pandas_values = pandas_series.values

    assert modin_values.dtype == pandas_values.dtype
    df_equals(modin_values, pandas_values)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("skipna", [False, True])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_var(data, skipna, ddof):
    """Compare ``var`` results, or the raised exception type when pandas raises."""
    modin_series, pandas_series = create_test_series(data)

    try:
        pandas_result = pandas_series.var(skipna=skipna, ddof=ddof)
    except Exception as err:
        # pandas raised, so Modin must raise the same exception type.
        with pytest.raises(type(err)):
            modin_series.var(skipna=skipna, ddof=ddof)
    else:
        modin_result = modin_series.var(skipna=skipna, ddof=ddof)
        df_equals(modin_result, pandas_result)


def test_view():
    """Compare ``view(dtype=...)`` for several source/target dtype pairs."""
    cases = [
        ([-2, -1, 0, 1, 2], "int8", "uint8"),
        ([-20, -10, 0, 10, 20], "int32", "float32"),
        ([-200, -100, 0, 100, 200], "int64", "float64"),
    ]
    for values, source_dtype, view_dtype in cases:
        modin_series = pd.Series(values, dtype=source_dtype)
        pandas_series = pandas.Series(values, dtype=source_dtype)
        modin_result = modin_series.view(dtype=view_dtype)
        pandas_result = pandas_series.view(dtype=view_dtype)
        df_equals(modin_result, pandas_result)


def test_where():
    """Check ``where`` with a series replacement, an ``axis=0`` replacement and a scalar."""
    frame_data = random_state.randn(100)
    pandas_series = pandas.Series(frame_data)
    modin_series = pd.Series(frame_data)
    pandas_cond_series = pandas_series % 5 < 2
    modin_cond_series = modin_series % 5 < 2

    pandas_result = pandas_series.where(pandas_cond_series, -pandas_series)
    modin_result = modin_series.where(modin_cond_series, -modin_series)
    assert all((to_pandas(modin_result) == pandas_result))

    other_data = random_state.randn(100)
    modin_other, pandas_other = pd.Series(other_data), pandas.Series(other_data)
    pandas_result = pandas_series.where(pandas_cond_series, pandas_other, axis=0)
    modin_result = modin_series.where(modin_cond_series, modin_other, axis=0)
    assert all(to_pandas(modin_result) == pandas_result)

    pandas_result = pandas_series.where(pandas_series < 2, True)
    modin_result = modin_series.where(modin_series < 2, True)
    assert all(to_pandas(modin_result) == pandas_result)


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize(
    "key",
    # Floor division keeps the slice stop an integer; ``/`` produced a float.
    [0, slice(0, len(test_string_data_values) // 2)],
    ids=["single_key", "slice_key"],
)
def test_str___getitem__(data, key):
    """Compare ``Series.str[key]`` for an integer key and a slice key."""
    modin_series, pandas_series = create_test_series(data)
    modin_result = modin_series.str[key]
    pandas_result = pandas_series.str[key]
    df_equals(
        modin_result,
        pandas_result,
        # https://github.com/modin-project/modin/issues/5968
        check_dtypes=False,
    )


# Test str operations
@pytest.mark.parametrize(
    "others",
    [["abC|DeF,Hik", "gSaf,qWer|Gre", "asd3,4sad|", np.nan], None],
    ids=["list", "None"],
)
def test_str_cat(others):
    """Run ``str.cat(others=...)`` through ``eval_general``."""
    data = ["abC|DeF,Hik", "gSaf,qWer|Gre", "asd3,4sad|", np.nan]
    eval_general(*create_test_series(data), lambda s: s.str.cat(others=others))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("expand", [False, True])
def test_str_split(data, pat, n, expand):
    """Run ``str.split`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda series: series.str.split(pat, n=n, expand=expand),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("expand", [False, True])
def test_str_rsplit(data, pat, n, expand):
    """Run ``str.rsplit`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda series: series.str.rsplit(pat, n=n, expand=expand),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("i", int_arg_values, ids=int_arg_keys)
def test_str_get(data, i):
    """Run ``str.get(i)`` through ``eval_general``."""
    modin_series, pandas_series = create_test_series(data)
    eval_general(modin_series, pandas_series, lambda series: series.str.get(i))
@pytest.mark.parametrize(
    "data", test_string_list_data_values, ids=test_string_list_data_keys
)
@pytest.mark.parametrize("sep", string_sep_values, ids=string_sep_keys)
def test_str_join(data, sep):
    """Run ``str.join(sep)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.join(sep))


@pytest.mark.parametrize(
    "data", test_string_list_data_values, ids=test_string_list_data_keys
)
@pytest.mark.parametrize("sep", string_sep_values, ids=string_sep_keys)
def test_str_get_dummies(data, sep):
    """Call ``str.get_dummies(sep)`` under the defaulting-to-pandas warning check when ``sep`` is truthy."""
    modin_series = create_test_series(data)[0]

    if not sep:
        return

    expect_default = not df_or_series_using_native_execution(modin_series)
    with warns_that_defaulting_to_pandas_if(expect_default):
        # Only the default-to-pandas warning is verified here, so the result
        # of the call is not inspected.
        modin_series.str.get_dummies(sep)


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("case", bool_arg_values, ids=bool_arg_keys)
@pytest.mark.parametrize("na", string_na_rep_values, ids=string_na_rep_keys)
def test_str_contains(data, pat, case, na):
    """Run ``str.contains`` through ``eval_general``, literal first and then with a regex."""
    series_pair = create_test_series(data)

    # Literal (non-regex) pattern.
    eval_general(
        *series_pair,
        lambda s: s.str.contains(pat, case=case, na=na, regex=False),
        # https://github.com/modin-project/modin/issues/5969
        comparator_kwargs={"check_dtypes": False},
    )

    # Regex pattern.
    regex_pat = ",|b"
    eval_general(
        *series_pair,
        lambda s: s.str.contains(regex_pat, case=case, na=na, regex=True),
        # https://github.com/modin-project/modin/issues/5969
        comparator_kwargs={"check_dtypes": False},
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("repl", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("case", bool_arg_values, ids=bool_arg_keys)
def test_str_replace(data, pat, repl, n, case):
    """Run ``str.replace`` through ``eval_general``, literal first and then with a regex."""
    # Dtypes are only compared when ``pat`` is not None, see
    # https://github.com/modin-project/modin/issues/5970
    check_dtypes = pat is not None

    modin_series, pandas_series = create_test_series(data)
    eval_general(
        modin_series,
        pandas_series,
        lambda s: s.str.replace(pat, repl, n=n, case=case, regex=False),
        comparator_kwargs={"check_dtypes": check_dtypes},
    )

    # Regex pattern, on a freshly created pair of series.
    modin_series, pandas_series = create_test_series(data)
    eval_general(
        modin_series,
        pandas_series,
        lambda s: s.str.replace(pat=",|b", repl=repl, n=n, case=case, regex=True),
        comparator_kwargs={"check_dtypes": check_dtypes},
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("repeats", int_arg_values, ids=int_arg_keys)
def test_str_repeat(data, repeats):
    """Run ``str.repeat(repeats)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.repeat(repeats))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
def test_str_removeprefix(data):
    """Prepend a prefix to the series and run ``str.removeprefix`` through ``eval_general``."""
    prefix = "test_prefix"
    eval_general(
        *create_test_series(data),
        lambda s: (prefix + s).str.removeprefix(prefix),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
def test_str_removesuffix(data):
    """Append a suffix to the series and run ``str.removesuffix`` through ``eval_general``."""
    suffix = "test_suffix"
    eval_general(
        *create_test_series(data),
        lambda s: (s + suffix).str.removesuffix(suffix),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
@pytest.mark.parametrize(
    "side", ["left", "right", "both"], ids=["left", "right", "both"]
)
@pytest.mark.parametrize("fillchar", string_sep_values, ids=string_sep_keys)
def test_str_pad(data, width, side, fillchar):
    """Run ``str.pad`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.pad(width, side=side, fillchar=fillchar),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
@pytest.mark.parametrize("fillchar", string_sep_values, ids=string_sep_keys)
def test_str_center(data, width, fillchar):
    """Run ``str.center`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.center(width, fillchar=fillchar),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
@pytest.mark.parametrize("fillchar", string_sep_values, ids=string_sep_keys)
def test_str_ljust(data, width, fillchar):
    """Run ``str.ljust`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.ljust(width, fillchar=fillchar),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
@pytest.mark.parametrize("fillchar", string_sep_values, ids=string_sep_keys)
def test_str_rjust(data, width, fillchar):
    """Run ``str.rjust`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.rjust(width, fillchar=fillchar),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
def test_str_zfill(data, width):
    """Run ``str.zfill(width)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.zfill(width))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("width", [-1, 0, 5])
def test_str_wrap(data, width):
    """Run ``str.wrap(width)`` through ``eval_general``; widths other than 5 expect ``ValueError``."""
    expected_exception = (
        None if width == 5 else ValueError(f"invalid width {width} (must be > 0)")
    )
    eval_general(
        *create_test_series(data),
        lambda s: s.str.wrap(width),
        expected_exception=expected_exception,
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("start", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("stop", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("step", [-2, 1, 3])
def test_str_slice(data, start, stop, step):
    """Run ``str.slice`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.slice(start=start, stop=stop, step=step),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("start", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("stop", int_arg_values, ids=int_arg_keys)
@pytest.mark.parametrize("repl", string_sep_values, ids=string_sep_keys)
def test_str_slice_replace(data, start, stop, repl):
    """Run ``str.slice_replace`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.slice_replace(start=start, stop=stop, repl=repl),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
def test_str_count(data, pat):
    """Run ``str.count(pat)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.count(pat))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("na", string_na_rep_values, ids=string_na_rep_keys)
def test_str_startswith(data, pat, na):
    """Run ``str.startswith`` through ``eval_general`` without dtype checks."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.startswith(pat, na=na),
        # https://github.com/modin-project/modin/issues/5969
        comparator_kwargs={"check_dtypes": False},
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("na", string_na_rep_values, ids=string_na_rep_keys)
def test_str_endswith(data, pat, na):
    """Run ``str.endswith`` through ``eval_general`` without dtype checks."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.endswith(pat, na=na),
        # https://github.com/modin-project/modin/issues/5969
        comparator_kwargs={"check_dtypes": False},
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
def test_str_findall(data, pat):
    """Run ``str.findall(pat)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.findall(pat))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
def test_str_fullmatch(data, pat):
    """Run ``str.fullmatch(pat)`` through ``eval_general``."""
    eval_general(*create_test_series(data), lambda s: s.str.fullmatch(pat))


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys)
@pytest.mark.parametrize("case", bool_arg_values, ids=bool_arg_keys)
@pytest.mark.parametrize("na", string_na_rep_values, ids=string_na_rep_keys)
def test_str_match(data, pat, case, na):
    """Run ``str.match`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.match(pat, case=case, na=na),
    )


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
@pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("pat", [r"([ab])", r"([ab])(\d)"])
def test_str_extract(data, expand, pat):
    """Run ``str.extract`` through ``eval_general``."""
    eval_general(
        *create_test_series(data),
        lambda s: s.str.extract(pat, expand=expand),
    )
@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_extractall(data): modin_series, pandas_series = create_test_series(data) with warns_that_defaulting_to_pandas_if( not df_or_series_using_native_execution(modin_series) ): # We are only testing that this defaults to pandas, so we will just check for # the warning modin_series.str.extractall(r"([ab])(\d)") @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_len(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.len()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("to_strip", string_sep_values, ids=string_sep_keys) def test_str_strip(data, to_strip): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.strip(to_strip=to_strip) ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("to_strip", string_sep_values, ids=string_sep_keys) def test_str_rstrip(data, to_strip): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.rstrip(to_strip=to_strip) ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("to_strip", string_sep_values, ids=string_sep_keys) def test_str_lstrip(data, to_strip): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.lstrip(to_strip=to_strip) ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sep", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize("expand", [False, True]) def test_str_partition(data, sep, expand): modin_series, pandas_series = create_test_series(data) 
eval_general( modin_series, pandas_series, lambda series: series.str.partition(sep, expand=expand), # https://github.com/modin-project/modin/issues/5971 comparator_kwargs={"check_dtypes": sep is not None}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sep", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize("expand", [False, True]) def test_str_rpartition(data, sep, expand): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.rpartition(sep, expand=expand), # https://github.com/modin-project/modin/issues/5971 comparator_kwargs={"check_dtypes": sep is not None}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_lower(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.lower()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_upper(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.upper()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_title(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.title()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sub", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize("start", int_arg_values, ids=int_arg_keys) @pytest.mark.parametrize("end", int_arg_values, ids=int_arg_keys) def test_str_find(data, sub, start, end): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.find(sub, start=start, end=end), ) @pytest.mark.parametrize("data", 
test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sub", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize("start", int_arg_values, ids=int_arg_keys) @pytest.mark.parametrize("end", int_arg_values, ids=int_arg_keys) def test_str_rfind(data, sub, start, end): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.rfind(sub, start=start, end=end), ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sub", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize( "start, end", [(0, None), (1, -1), (1, 3)], ids=["default", "non_default_working", "exception"], ) def test_str_index(data, sub, start, end, request): modin_series, pandas_series = create_test_series(data) expected_exception = None if "exception-comma sep" in request.node.callspec.id: expected_exception = ValueError("substring not found") eval_general( modin_series, pandas_series, lambda series: series.str.index(sub, start=start, end=end), expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("sub", string_sep_values, ids=string_sep_keys) @pytest.mark.parametrize( "start, end", [(0, None), (1, -1), (1, 3)], ids=["default", "non_default_working", "exception"], ) def test_str_rindex(data, sub, start, end, request): modin_series, pandas_series = create_test_series(data) expected_exception = None if "exception-comma sep" in request.node.callspec.id: expected_exception = ValueError("substring not found") eval_general( modin_series, pandas_series, lambda series: series.str.rindex(sub, start=start, end=end), expected_exception=expected_exception, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_capitalize(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, 
pandas_series, lambda series: series.str.capitalize()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_swapcase(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.swapcase()) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize( "form", ["NFC", "NFKC", "NFD", "NFKD"], ids=["NFC", "NFKC", "NFD", "NFKD"] ) def test_str_normalize(data, form): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.normalize(form)) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) @pytest.mark.parametrize("pat", string_sep_values, ids=string_sep_keys) def test_str_translate(data, pat): modin_series, pandas_series = create_test_series(data) # Test none table eval_general( modin_series, pandas_series, lambda series: series.str.translate(None), # https://github.com/modin-project/modin/issues/5970 comparator_kwargs={"check_dtypes": False}, ) # Translation dictionary table = {pat: "DDD"} eval_general( modin_series, pandas_series, lambda series: series.str.translate(table) ) # Translation table with maketrans (python3 only) if pat is not None: table = str.maketrans(pat, "d" * len(pat)) eval_general( modin_series, pandas_series, lambda series: series.str.translate(table) ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isalnum(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isalnum(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isalpha(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, 
pandas_series, lambda series: series.str.isalpha(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isdigit(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isdigit(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isspace(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isspace(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_islower(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.islower(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isupper(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isupper(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_istitle(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.istitle(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isnumeric(data): 
modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isnumeric(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_str_isdecimal(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.str.isdecimal(), # https://github.com/modin-project/modin/issues/5969 comparator_kwargs={"check_dtypes": False}, ) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_casefold(data): modin_series, pandas_series = create_test_series(data) eval_general(modin_series, pandas_series, lambda series: series.str.casefold()) @pytest.fixture def str_encode_decode_test_data() -> list[str]: return [ "abC|DeF,Hik", "234,3245.67", "gSaf,qWer|Gre", "asd3,4sad|", np.nan, None, # add a string that we can't encode in ascii, and whose utf-8 encoding # we cannot decode in ascii "ക", ] @pytest.mark.parametrize("encoding", encoding_types) @pytest.mark.parametrize("errors", ["strict", "ignore", "replace"]) def test_str_encode(encoding, errors, str_encode_decode_test_data): expected_exception = None if errors == "strict" and encoding == "ascii": # quite safe to check only types expected_exception = False eval_general( *create_test_series(str_encode_decode_test_data), lambda s: s.str.encode(encoding, errors=errors), expected_exception=expected_exception, ) @pytest.mark.parametrize( "encoding", encoding_types, ) @pytest.mark.parametrize("errors", ["strict", "ignore", "replace"]) def test_str_decode(encoding, errors, str_encode_decode_test_data): expected_exception = None if errors == "strict": # it's quite safe here to check only types of exceptions expected_exception = False eval_general( *create_test_series( [ s.encode("utf-8") if isinstance(s, str) else s for s in 
str_encode_decode_test_data ] ), lambda s: s.str.decode(encoding, errors=errors), expected_exception=expected_exception, ) def test_list_general(): pa = pytest.importorskip("pyarrow") # Copied from pandas examples modin_series, pandas_series = create_test_series( [ [1, 2, 3], [3], ], dtype=pd.ArrowDtype(pa.list_(pa.int64())), ) eval_general(modin_series, pandas_series, lambda series: series.list.flatten()) eval_general(modin_series, pandas_series, lambda series: series.list.len()) eval_general(modin_series, pandas_series, lambda series: series.list[0]) def test_struct_general(): pa = pytest.importorskip("pyarrow") # Copied from pandas examples modin_series, pandas_series = create_test_series( [ {"version": 1, "project": "pandas"}, {"version": 2, "project": "pandas"}, {"version": 1, "project": "numpy"}, ], dtype=pd.ArrowDtype( pa.struct([("version", pa.int64()), ("project", pa.string())]) ), ) eval_general(modin_series, pandas_series, lambda series: series.struct.dtypes) eval_general( modin_series, pandas_series, lambda series: series.struct.field("project") ) eval_general(modin_series, pandas_series, lambda series: series.struct.explode()) # nested struct types version_type = pa.struct( [ ("major", pa.int64()), ("minor", pa.int64()), ] ) modin_series, pandas_series = create_test_series( [ {"version": {"major": 1, "minor": 5}, "project": "pandas"}, {"version": {"major": 2, "minor": 1}, "project": "pandas"}, {"version": {"major": 1, "minor": 26}, "project": "numpy"}, ], dtype=pd.ArrowDtype( pa.struct([("version", version_type), ("project", pa.string())]) ), ) eval_general( modin_series, pandas_series, lambda series: series.struct.field(["version", "minor"]), ) def _case_when_caselists(): def permutations(values): return [ p for r in range(1, len(values) + 1) for p in itertools.permutations(values, r) ] conditions = permutations( [ [True, False, False, False] * 10, pandas.Series([True, False, False, False] * 10), pandas.Series([True, False, False, False] * 10, 
index=range(78, -2, -2)), lambda df: df.gt(0), ] ) replacements = permutations([[0, 3, 4, 5] * 10, 0, lambda df: 1]) caselists = [] for c in conditions: for r in replacements: if len(c) == len(r): caselists.append(list(zip(c, r))) return caselists @pytest.mark.parametrize( "base", [ pandas.Series(range(40)), pandas.Series([0, 7, 8, 9] * 10, name="c", index=range(0, 80, 2)), ], ) @pytest.mark.parametrize( "caselist", _case_when_caselists(), ) @pytest.mark.skipif( Engine.get() == "Dask", reason="https://github.com/modin-project/modin/issues/7148", ) def test_case_when(base, caselist): pandas_result = base.case_when(caselist) modin_bases = [pd.Series(base)] # 'base' and serieses from 'caselist' must have equal lengths, however in this test we want # to verify that 'case_when' works correctly even if partitioning of 'base' and 'caselist' isn't equal. # BaseOnPython always uses a single partition, thus skipping this test for them. if not ( f"{StorageFormat.get()}On{Engine.get()}" == "BaseOnPython" or current_execution_is_native() ): # we can only import this function for partitioned execution modes. 
from modin.tests.core.storage_formats.pandas.test_internals import ( construct_modin_df_by_scheme, ) modin_base_repart = construct_modin_df_by_scheme( base.to_frame(), partitioning_scheme={"row_lengths": [14, 14, 12], "column_widths": [1]}, ).squeeze(axis=1) assert ( modin_bases[0]._query_compiler._modin_frame._partitions.shape != modin_base_repart._query_compiler._modin_frame._partitions.shape ) modin_base_repart.name = base.name modin_bases.append(modin_base_repart) for modin_base in modin_bases: df_equals(pandas_result, modin_base.case_when(caselist)) if any( isinstance(data, pandas.Series) for case_tuple in caselist for data in case_tuple ): caselist = [ tuple( pd.Series(data) if isinstance(data, pandas.Series) else data for data in case_tuple ) for case_tuple in caselist ] df_equals(pandas_result, modin_base.case_when(caselist)) @pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) def test_non_commutative_add_string_to_series(data): # This test checks that add and radd do different things when addition is # not commutative, e.g. for adding a string to a string. For context see # https://github.com/modin-project/modin/issues/4908 eval_general(*create_test_series(data), lambda s: "string" + s) eval_general(*create_test_series(data), lambda s: s + "string") def test_non_commutative_multiply_pandas(): # The non commutative integer class implementation is tricky. Check that # multiplying such an integer with a pandas series is really not # commutative. pandas_series = pandas.Series(1, dtype=int) integer = NonCommutativeMultiplyInteger(2) assert not (integer * pandas_series).equals(pandas_series * integer) def test_non_commutative_multiply(): # This test checks that mul and rmul do different things when # multiplication is not commutative, e.g. for adding a string to a string. 
# For context see https://github.com/modin-project/modin/issues/5238 modin_series, pandas_series = create_test_series(1, dtype=int) integer = NonCommutativeMultiplyInteger(2) eval_general(modin_series, pandas_series, lambda s: integer * s) eval_general(modin_series, pandas_series, lambda s: s * integer) @pytest.mark.parametrize( "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] ) def test_hasattr_sparse(is_sparse_data): modin_df, pandas_df = ( create_test_series( pandas.arrays.SparseArray(test_data["float_nan_data"].values()) ) if is_sparse_data else create_test_series(test_data["float_nan_data"]) ) eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_categories(data): modin_series, pandas_series = create_test_series(data.copy()) df_equals(modin_series.cat.categories, pandas_series.cat.categories) def set_categories(ser): ser.cat.categories = list("qwert") return ser # pandas 2.0.0: Removed setting Categorical.categories directly (GH47834) # Just check the exception expected_exception = AttributeError("can't set attribute") if sys.version_info >= (3, 10): # The exception message varies across different versions of Python expected_exception = False eval_general( modin_series, pandas_series, set_categories, expected_exception=expected_exception, ) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_ordered(data): modin_series, pandas_series = create_test_series(data.copy()) assert modin_series.cat.ordered == pandas_series.cat.ordered @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_codes(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.codes modin_result = modin_series.cat.codes df_equals(modin_result, pandas_result) @pytest.mark.parametrize( 
"set_min_row_partition_size", [1, 2], ids=["four_row_partitions", "two_row_partitions"], indirect=True, ) def test_cat_codes_issue5650(set_min_row_partition_size): data = {"name": ["abc", "def", "ghi", "jkl"]} pandas_df = pandas.DataFrame(data) pandas_df = pandas_df.astype("category") modin_df = pd.DataFrame(data) modin_df = modin_df.astype("category") eval_general( modin_df, pandas_df, lambda df: df["name"].cat.codes, comparator_kwargs={"check_dtypes": True}, ) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_rename_categories(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.rename_categories(list("qwert")) modin_result = modin_series.cat.rename_categories(list("qwert")) df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) @pytest.mark.parametrize("ordered", bool_arg_values, ids=bool_arg_keys) def test_cat_reorder_categories(data, ordered): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.reorder_categories(list("tades"), ordered=ordered) modin_result = modin_series.cat.reorder_categories(list("tades"), ordered=ordered) df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_add_categories(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.add_categories(list("qw")) modin_result = modin_series.cat.add_categories(list("qw")) df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_remove_categories(data): modin_series, pandas_series = create_test_series(data.copy()) 
pandas_result = pandas_series.cat.remove_categories(list("at")) modin_result = modin_series.cat.remove_categories(list("at")) df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_remove_unused_categories(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_series[1] = np.nan pandas_result = pandas_series.cat.remove_unused_categories() modin_series[1] = np.nan modin_result = modin_series.cat.remove_unused_categories() df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) @pytest.mark.parametrize("ordered", bool_arg_values, ids=bool_arg_keys) @pytest.mark.parametrize("rename", [True, False]) def test_cat_set_categories(data, ordered, rename): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.set_categories( list("qwert"), ordered=ordered, rename=rename ) modin_result = modin_series.cat.set_categories( list("qwert"), ordered=ordered, rename=rename ) df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_as_ordered(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.as_ordered() modin_result = modin_series.cat.as_ordered() df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) @pytest.mark.parametrize( "data", test_data_categorical_values, ids=test_data_categorical_keys ) def test_cat_as_unordered(data): modin_series, pandas_series = create_test_series(data.copy()) pandas_result = pandas_series.cat.as_unordered() modin_result = modin_series.cat.as_unordered() df_equals(modin_series, pandas_series) df_equals(modin_result, pandas_result) def 
test_peculiar_callback(): def func(val): if not isinstance(val, tuple): raise BaseException("Urgh...") return val pandas_df = pandas.DataFrame({"col": [(0, 1)]}) pandas_series = pandas_df["col"].apply(func) modin_df = pd.DataFrame({"col": [(0, 1)]}) modin_series = modin_df["col"].apply(func) df_equals(modin_series, pandas_series) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_apply_return_df(data): modin_series, pandas_series = create_test_series(data) eval_general( modin_series, pandas_series, lambda series: series.apply( lambda x: pandas.Series([x + i for i in range(100)]) ), ) @pytest.mark.parametrize( "apply_function", ( lambda series, function: function(series), lambda series, function: series.apply(function), lambda series, function: series.map(function), ), ) @pytest.mark.parametrize("function", UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS) def test_unary_numpy_universal_function_issue_6483_and_7645(function, apply_function): eval_general( *create_test_series(test_data["float_nan_data"]), lambda series: apply_function(series, function), ) def test_binary_numpy_universal_function_issue_6483(): eval_general( *create_test_series(test_data["float_nan_data"]), lambda series: np.arctan2(series, np.sin(series)), ) def test__reduce__(): # `Series.__reduce__` will be called implicitly when lambda expressions are # pre-processed for the distributed engine. 
series_data = ["Major League Baseball", "National Basketball Association"] abbr_md, abbr_pd = create_test_series(series_data, index=["MLB", "NBA"]) dataframe_data = { "name": ["Mariners", "Lakers"] * 500, "league_abbreviation": ["MLB", "NBA"] * 500, } teams_md, teams_pd = create_test_dfs(dataframe_data) result_md = ( teams_md.set_index("name") .league_abbreviation.apply(lambda abbr: abbr_md.loc[abbr]) .rename("league") ) result_pd = ( teams_pd.set_index("name") .league_abbreviation.apply(lambda abbr: abbr_pd.loc[abbr]) .rename("league") ) df_equals(result_md, result_pd) @pytest.mark.parametrize( "op", [ "add", "radd", "divmod", "eq", "floordiv", "ge", "gt", "le", "lt", "mod", "mul", "rmul", "ne", "pow", "rdivmod", "rfloordiv", "rmod", "rpow", "rsub", "rtruediv", "sub", "truediv", ], ) def test_binary_with_fill_value_issue_7381(op): # Ensures that series binary operations respect the fill_value flag series_md, series_pd = create_test_series([0, 1, 2, 3]) rhs_md, rhs_pd = create_test_series([0]) result_md = getattr(series_md, op)(rhs_md, fill_value=2) result_pd = getattr(series_pd, op)(rhs_pd, fill_value=2) df_equals(result_md, result_pd) @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) def test_logical_binary_with_list(op): series_md, series_pd = create_test_series([0, 1, 2]) rhs = [2, 1, 0] result_md = getattr(series_md, op)(rhs) result_pd = getattr(series_pd, op)(rhs) df_equals(result_md, result_pd) @pytest.mark.parametrize("op", ["argmax", "argmin"]) def test_argmax_argmin_7413(op): # Ensures that argmin/argmax use positional index, not the actual index value series_md, series_pd = create_test_series([1, 2, 3], index=["b", "a", "c"]) result_md = getattr(series_md, op)() result_pd = getattr(series_pd, op)() assert result_md == result_pd def test_rename_axis(): series_md, series_pd = create_test_series([0, 1, 2]) eval_general(series_md, series_pd, lambda ser: ser.rename_axis("name")) eval_general( series_md, series_pd, lambda ser: 
ser.rename_axis("new_name", inplace=True), __inplace__=True, ) # axis=1 is invalid for series eval_general( series_md, series_pd, lambda ser: ser.rename_axis("newer_name", axis=1), expected_exception=ValueError("No axis named 1 for object type Series"), ) ================================================ FILE: modin/tests/pandas/utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
from __future__ import annotations

import csv
import functools
import itertools
import math
import os
import re
from contextlib import contextmanager
from io import BytesIO
from pathlib import Path
from string import ascii_letters
from typing import Union

import numpy as np
import pandas
import psutil
import pytest
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_list_like,
    is_numeric_dtype,
    is_object_dtype,
    is_string_dtype,
    is_timedelta64_dtype,
)

import modin.pandas as pd
from modin import set_execution
from modin.config import (
    Backend,
    Engine,
    MinColumnPartitionSize,
    MinRowPartitionSize,
    NativePandasDeepCopy,
    NPartitions,
    RangePartitioning,
    StorageFormat,
    TestDatasetSize,
    TrackFileLeaks,
)
from modin.pandas.io import to_pandas
from modin.pandas.testing import (
    assert_extension_array_equal,
    assert_frame_equal,
    assert_index_equal,
    assert_series_equal,
)
from modin.utils import try_cast_to_pandas

# Single seeded generator shared by the datasets below. Every draw advances its
# state, so the order of the definitions in this module determines the values.
random_state = np.random.RandomState(seed=42)

# Maps a ``TestDatasetSize`` value to a ``(NCOLS, NROWS)`` pair.
DATASET_SIZE_DICT = {
    "Small": (2**6, 2**6),
    "Normal": (2**6, 2**8),
    "Big": (2**7, 2**12),
}

# Size of test dataframes; an unknown ``TestDatasetSize`` falls back to "Normal".
NCOLS, NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Normal"])
NGROUPS = 10

# Range for values for test data
RAND_LOW = 0
RAND_HIGH = 100

# Input data and functions for the tests
# The test data that we will test our code against.
# Column names are "col1".."col<NCOLS>", rotated so that i == NCOLS / 2 maps to "col1".
test_data = {
    # "empty_data": {},
    # "columns_only": {"col1": [], "col2": [], "col3": [], "col4": [], "col5": []},
    "int_data": {
        "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): random_state.randint(
            RAND_LOW, RAND_HIGH, size=(NROWS)
        )
        for i in range(NCOLS)
    },
    # Uniform floats with NaN holes: for i <= NCOLS // 2 only the element at j == i
    # is NaN; for the remaining columns only every 4th row keeps its value.
    "float_nan_data": {
        "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
            (
                x
                if (j % 4 == 0 and i > NCOLS // 2) or (j != i and i <= NCOLS // 2)
                else np.nan
            )
            for j, x in enumerate(
                random_state.uniform(RAND_LOW, RAND_HIGH, size=(NROWS))
            )
        ]
        for i in range(NCOLS)
    },
    # "int_float_object_data": {
    #     "col3": [1, 2, 3, 4],
    #     "col4": [4, 5, 6, 7],
    #     "col1": [8.0, 9.4, 10.1, 11.3],
    #     "col2": ["a", "b", "c", "d"],
    # },
    # "datetime_timedelta_data": {
    #     "col3": [
    #         np.datetime64("2010"),
    #         np.datetime64("2011"),
    #         np.datetime64("2011-06-15T00:00"),
    #         np.datetime64("2009-01-01"),
    #     ],
    #     "col4": [
    #         np.datetime64("2010"),
    #         np.datetime64("2011"),
    #         np.datetime64("2011-06-15T00:00"),
    #         np.datetime64("2009-01-01"),
    #     ],
    #     "col1": [
    #         np.timedelta64(1, "M"),
    #         np.timedelta64(2, "D"),
    #         np.timedelta64(3, "Y"),
    #         np.timedelta64(20, "D"),
    #     ],
    #     "col2": [
    #         np.timedelta64(1, "M"),
    #         np.timedelta64(2, "D"),
    #         np.timedelta64(3, "Y"),
    #         np.timedelta64(20, "D"),
    #     ],
    # },
    # "all_data": {
    #     "col3": 1.0,
    #     "col4": np.datetime64("2011-06-15T00:00"),
    #     "col5": np.array([3] * 4, dtype="int32"),
    #     "col1": "foo",
    #     "col2": True,
    # },
}

# The parse_dates param can take several different types and combinations of
# types. Use the following values to test date parsing on a CSV created for
# that purpose at `time_parsing_csv_path`
parse_dates_values_by_id = {
    "bool": False,
    "list_of_single_int": [0],
    "list_of_single_string": ["timestamp"],
    "list_of_list_of_strings": [["year", "month", "date"]],
    "list_of_string_and_list_of_strings": ["timestamp", ["year", "month", "date"]],
    "list_of_list_of_ints": [[1, 2, 3]],
    "list_of_list_of_strings_and_ints": [["year", 2, "date"]],
    "empty_list": [],
    "dict": {"year_and_month": [1, 2], "day": ["date"]},
    "nonexistent_string_column": ["z"],
    "nonexistent_int_column": [99],
}

# Rename one integer column to "index" (the popped entry is re-inserted last).
# See details in #1403
test_data["int_data"]["index"] = test_data["int_data"].pop(
    "col{}".format(int(NCOLS / 2))
)

# Additionally blank out every 16th row in the first half of each float column.
# Note: `col` and `row` stay defined as module-level names after this loop.
for col in test_data["float_nan_data"]:
    for row in range(NROWS // 2):
        if row % 16 == 0:
            test_data["float_nan_data"][col][row] = np.nan

test_data_values = list(test_data.values())
test_data_keys = list(test_data.keys())

# Random booleans, using the same rotated column naming as `test_data`.
test_bool_data = {
    "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): random_state.choice(
        [True, False], size=(NROWS)
    )
    for i in range(NCOLS)
}

# NOTE(review): this builds NROWS columns of length NCOLS, i.e. the roles of
# NCOLS and NROWS look swapped compared to the datasets above - confirm whether
# that is intended before changing it.
test_groupby_data = {f"col{i}": np.arange(NCOLS) % NGROUPS for i in range(NROWS)}
# Ten random integer columns on an hourly DatetimeIndex, for resample tests.
test_data_resample = dict(
    data={
        f"col{i}": random_state.randint(RAND_LOW, RAND_HIGH, size=NROWS)
        for i in range(10)
    },
    index=pandas.date_range("31/12/2000", periods=NROWS, freq="h"),
)

# Datasets with varying amounts of duplicated rows. The generated column names
# follow the same rotated "col<N>" scheme as `test_data`.
test_data_with_duplicates = {
    # every row is distinct
    "no_duplicates": {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": range(NROWS) for i in range(NCOLS)
    },
    # each column holds a single repeated value
    "all_duplicates": {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [float(i)] * NROWS
        for i in range(NCOLS)
    },
    # every 7th row carries the column number, the others their row number
    "some_duplicates": {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [
            i if j % 7 == 0 else j for j in range(NROWS)
        ]
        for i in range(NCOLS)
    },
    "has_name_column": {
        "name": ["one", "two", "two", "three"],
        "col1": [1, 2, 2, 3],
        "col3": [10, 20, 20, 3],
        "col7": [100, 201, 200, 300],
    },
    # string columns cycling through "s0".."s4"
    "str_columns": {
        f"col_str{int((i - NCOLS / 2) % NCOLS + 1)}": [
            f"s{x % 5}" for x in range(NROWS)
        ]
        for i in range(NCOLS)
    },
    # shares the very same dict object as `test_data["float_nan_data"]`
    "float_nan": test_data["float_nan_data"],
}

test_data_small = {
    "small": {
        "col0": [1, 2, 3, 4],
        "col1": [8.0, 9.4, 10.1, 11.3],
        "col2": [4, 5, 6, 7],
    }
}

# One small column per dtype family, including missing values.
test_data_diff_dtype = {
    "int_col": [-5, 2, 7, 16],
    "float_col": [np.nan, -9.4, 10.1, np.nan],
    "str_col": ["a", np.nan, "c", "d"],
    "bool_col": [False, True, True, False],
}

test_data_small_keys = list(test_data_small)
test_data_small_values = list(test_data_small.values())

test_data_with_duplicates_keys = list(test_data_with_duplicates)
test_data_with_duplicates_values = list(test_data_with_duplicates.values())

# The same letters as an ordered and as an unordered categorical.
test_data_categorical = {
    label: pandas.Categorical(list("testdata"), ordered=is_ordered)
    for label, is_ordered in (("ordered", True), ("unordered", False))
}

test_data_categorical_keys = list(test_data_categorical)
test_data_categorical_values = list(test_data_categorical.values())

# Fully fill all of the partitions used in tests.
# Categorical data sized from the partitioning config: NPartitions * MinRowPartitionSize
# rows and (for the frame) NPartitions * MinColumnPartitionSize columns.
test_data_large_categorical_dataframe = {
    i: pandas.Categorical(np.arange(NPartitions.get() * MinRowPartitionSize.get()))
    for i in range(NPartitions.get() * MinColumnPartitionSize.get())
}
test_data_large_categorical_series_values = [
    pandas.Categorical(np.arange(NPartitions.get() * MinRowPartitionSize.get()))
]
test_data_large_categorical_series_keys = ["categorical_series"]

numeric_dfs = [
    "empty_data",
    "columns_only",
    "int_data",
    "float_nan_data",
    "with_index_column",
]

no_numeric_dfs = ["datetime_timedelta_data"]

# String test data
test_string_data = {
    "separator data": [
        "abC|DeF,Hik",
        "234,3245.67",
        "gSaf,qWer|Gre",
        "asd3,4sad|",
        np.nan,
    ]
}

test_string_data_values = list(test_string_data.values())
test_string_data_keys = list(test_string_data.keys())

# List of strings test data
test_string_list_data = {"simple string": [["a"], ["CdE"], ["jDf"], ["werB"]]}

test_string_list_data_values = list(test_string_list_data.values())
test_string_list_data_keys = list(test_string_list_data.keys())

string_seperators = {"comma sep": ","}

string_sep_values = list(string_seperators.values())
string_sep_keys = list(string_seperators.keys())

string_na_rep = {"None na_rep": None, "- na_rep": "-", "nan na_rep": np.nan}

string_na_rep_values = list(string_na_rep.values())
string_na_rep_keys = list(string_na_rep.keys())

join_type = {"left": "left", "right": "right", "inner": "inner", "outer": "outer"}

join_type_keys = list(join_type.keys())
join_type_values = list(join_type.values())

# Unary NumPy ufuncs applicable to float data.
# Each function is listed once: the tuple is used directly as a pytest
# parametrization, so a repeated entry only produces a redundant test case
# (`np.sin` and `np.fabs` used to appear twice). Alias spellings such as
# `np.abs`/`np.absolute` and `np.conj`/`np.conjugate` are kept on purpose.
UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS = (
    np.negative,
    np.abs,
    np.sin,
    np.positive,
    np.absolute,
    np.fabs,
    np.rint,
    np.sign,
    np.conj,
    np.conjugate,
    np.exp,
    np.exp2,
    np.log,
    np.log2,
    np.log10,
    np.expm1,
    np.log1p,
    np.sqrt,
    np.square,
    np.cbrt,
    np.reciprocal,
    np.cos,
    np.tan,
    np.arcsin,
    np.arccos,
    np.arctan,
    np.sinh,
    np.cosh,
    np.tanh,
    np.arcsinh,
    np.arccosh,
    np.arctanh,
    np.degrees,
    np.radians,
    np.deg2rad,
    np.rad2deg,
    np.logical_not,
    np.isfinite,
    np.isinf,
    np.isnan,
    np.signbit,
    np.spacing,
    np.floor,
    np.ceil,
    np.trunc,
)

# Test functions for applymap.
# The ufuncs are keyed by `__name__`, so entries sharing a name collapse into one.
test_func = {
    "plus one": lambda x: x + 1,
    "convert to string": str,
    "square": lambda x: x * x,
    "identity": lambda x: x,
    "return false": lambda x: False,
    **{func.__name__: func for func in UNIVERSAL_UNARY_NUMPY_FUNCTIONS_FOR_FLOATS},
}
test_func_keys = list(test_func.keys())
test_func_values = list(test_func.values())

numeric_test_funcs = ["plus one", "square"]

# Test functions for query
query_func = {
    "col1 < col2": "col1 < col2",
    "col3 > col4": "col3 > col4",
    "col1 == col2": "col1 == col2",
    "(col2 > col1) and (col1 < col3)": "(col2 > col1) and (col1 < col3)",
    # this is how to query for values of an unnamed index per
    # https://pandas.pydata.org/docs/user_guide/indexing.html#multiindex-query-syntax
    "ilevel_0 % 2 == 1": "ilevel_0 % 2 == 1",
}
query_func_keys = list(query_func.keys())
query_func_values = list(query_func.values())

# Test agg functions for apply, agg, and aggregate
agg_func = {
    "sum": "sum",
    "df sum": lambda df: df.sum(),
    "str": str,
    "sum mean": ["sum", "mean"],
    "sum df sum": ["sum", lambda df: df.sum()],
    # The case verifies that returning a scalar that is based on a frame's data doesn't cause a problem
    "sum of certain elements": lambda axis: (
        axis.iloc[0] + axis.iloc[-1] if isinstance(axis, pandas.Series) else axis + axis
    ),
    "should raise AssertionError": 1,
}
agg_func_keys = list(agg_func.keys())
agg_func_values = list(agg_func.values())

# For this sort of parameters pandas throws an exception.
# See details in pandas issue 36036.
agg_func_except = {
    "sum sum": ["sum", "sum"],
}
agg_func_except_keys = list(agg_func_except.keys())
agg_func_except_values = list(agg_func_except.values())

numeric_agg_funcs = ["sum mean", "sum sum", "sum df sum"]

# User-defined functions; each accepts (and ignores) extra positional/keyword arguments.
udf_func = {
    "return self": lambda x, *args, **kwargs: type(x)(x.values),
    "change index": lambda x, *args, **kwargs: pandas.Series(
        x.values, index=np.arange(-1, len(x.index) - 1)
    ),
    "return none": lambda x, *args, **kwargs: None,
    "return empty": lambda x, *args, **kwargs: pandas.Series(),
    "access self": lambda x, other, *args, **kwargs: pandas.Series(
        x.values, index=other.index
    ),
}
udf_func_keys = list(udf_func.keys())
udf_func_values = list(udf_func.values())

# Test q values for quantiles
quantiles = {
    "0.25": 0.25,
    "0.5": 0.5,
    "0.75": 0.75,
    "0.66": 0.66,
    "0.01": 0.01,
    "list": [0.25, 0.5, 0.75, 0.66, 0.01],
}
quantiles_keys = list(quantiles.keys())
quantiles_values = list(quantiles.values())

# Test indices for get, set_index, __contains__, insert
indices = {
    "col1": "col1",
    "col2": "col2",
    "A": "A",
    "B": "B",
    "does not exist": "does not exist",
}
indices_keys = list(indices.keys())
indices_values = list(indices.values())

# Test functions for groupby apply
groupby_apply_func = {"sum": lambda df: df.sum(), "negate": lambda df: -df}
groupby_apply_func_keys = list(groupby_apply_func.keys())
groupby_apply_func_values = list(groupby_apply_func.values())

# Test functions for groupby agg
groupby_agg_func = {"min": "min", "max": "max"}
groupby_agg_func_keys = list(groupby_agg_func.keys())
groupby_agg_func_values = list(groupby_agg_func.values())

# Test functions for groupby transform
groupby_transform_func = {
    "add 4": lambda df: df + 4,
    "negatie and minus 10": lambda df: -df - 10,
}
groupby_transform_func_keys = list(groupby_transform_func.keys())
groupby_transform_func_values = list(groupby_transform_func.values())

# Test functions for groupby pipe
groupby_pipe_func = {"sum": lambda df: df.sum()}
groupby_pipe_func_keys = list(groupby_pipe_func.keys())
groupby_pipe_func_values = list(groupby_pipe_func.values())
# END Test input data and functions

# Parametrizations of common kwargs
axis = {
    "over_rows_int": 0,
    "over_rows_str": "rows",
    "over_columns_int": 1,
    "over_columns_str": "columns",
}
axis_keys = list(axis.keys())
axis_values = list(axis.values())

bool_arg = {"True": True, "False": False, "None": None}
bool_arg_keys = list(bool_arg.keys())
bool_arg_values = list(bool_arg.values())

int_arg = {"-5": -5, "-1": -1, "0": 0, "1": 1, "5": 5}
int_arg_keys = list(int_arg.keys())
int_arg_values = list(int_arg.values())
# END parametrizations of common kwargs

json_short_string = """[{"project": "modin"}]"""
json_long_string = """{ "quiz": { "sport": { "q1": { "question": "Which one is correct team name in NBA?", "options": [ "New York Bulls", "Los Angeles Kings", "Golden State Warriros", "Huston Rocket" ], "answer": "Huston Rocket" } }, "maths": { "q1": { "question": "5 + 7 = ?", "options": [ "10", "11", "12", "13" ], "answer": "12" }, "q2": { "question": "12 - 8 = ?", "options": [ "1", "2", "3", "4" ], "answer": "4" } } } }"""
json_long_bytes = BytesIO(json_long_string.encode(encoding="UTF-8"))
json_short_bytes = BytesIO(json_short_string.encode(encoding="UTF-8"))

# Text encoding types
encoding_types = [
    "ascii",
    "utf_32",
    "utf_32_be",
    "utf_32_le",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
    "utf_8_sig",
]

default_to_pandas_ignore_string = "default:.*defaulting to pandas.*:UserWarning"

# Files compression to extension mapping
COMP_TO_EXT = {"gzip": "gz", "bz2": "bz2", "xz": "xz", "zip": "zip"}

time_parsing_csv_path = "modin/tests/pandas/data/test_time_parsing.csv"


class CustomIntegerForAddition:
    """Minimal wrapper whose ``+`` and reflected ``+`` delegate to the wrapped value."""

    def __init__(self, value: int):
        self.value = value

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value


class NonCommutativeMultiplyInteger:
    """int-like class with non-commutative multiply operation.

    We need to test that rmul and mul do different things even when
    multiplication is not commutative, but almost all multiplication is
    commutative. This class' fake multiplication overloads are not commutative
    when you multiply an instance of this class with pandas.series, which
    does not know how to __mul__ with this class. e.g.

    NonCommutativeMultiplyInteger(2) * pd.Series(1, dtype=int) == pd.Series(2, dtype=int)
    pd.Series(1, dtype=int) * NonCommutativeMultiplyInteger(2) == pd.Series(3, dtype=int)

    Raises
    ------
    TypeError
        If the instance is initialized with anything but an ``int``.
    """

    def __init__(self, value: int):
        if not isinstance(value, int):
            raise TypeError(
                f"must initialize with integer, but got {value} of type {type(value)}"
            )
        self.value = value

    def __mul__(self, other):
        # Note that we need to check other is an int, otherwise when we (left) mul
        # this with a series, we'll just multiply self.value by the series, whereas
        # we want to make the series do an rmul instead.
        if not isinstance(other, int):
            return NotImplemented
        return self.value * other

    def __rmul__(self, other):
        # Deliberately off by one so that the reflected product differs.
        return self.value * other + 1


def categories_equals(left, right):
    """
    Assert that `left` and `right` agree on orderedness and compare equal.

    Orderedness is compared by truthiness, so ``None`` and ``False`` match.
    The objects themselves are compared with ``assert_extension_array_equal``.
    """
    assert bool(left.ordered) == bool(right.ordered)
    assert_extension_array_equal(left, right)


def df_categories_equals(df1, df2):
    """
    Assert that the categorical parts of `df1` and `df2` are equal.

    Objects without ``select_dtypes`` are compared as a whole when `df1` is a
    ``CategoricalDtype`` or when both have a categorical ``dtype``; otherwise
    nothing is checked for them. For frames, the categorical columns must have
    equal labels and equal values.

    Returns
    -------
    bool or None
        ``True`` when `df1` has no ``select_dtypes``, ``None`` otherwise.
    """
    if not hasattr(df1, "select_dtypes"):
        if isinstance(df1, pandas.CategoricalDtype):
            categories_equals(df1, df2)
        elif isinstance(df1.dtype, pandas.CategoricalDtype) and isinstance(
            df2.dtype, pandas.CategoricalDtype
        ):
            categories_equals(df1.dtype, df2.dtype)
        return True

    df1_categorical = df1.select_dtypes(include="category")
    df2_categorical = df2.select_dtypes(include="category")
    assert df1_categorical.columns.equals(df2_categorical.columns)
    # Use an index instead of a column name to iterate through columns. There
    # may be duplicate column names. e.g. if two columns are named col1,
    # selecting df1_categorical["col1"] gives a dataframe of width 2 instead of a series.
    for i in range(len(df1_categorical.columns)):
        assert_extension_array_equal(
            df1_categorical.iloc[:, i].values,
            df2_categorical.iloc[:, i].values,
            check_dtype=False,
        )


def assert_empty_frame_equal(df1, df2):
    """
    Assert that `df1` and `df2` agree on emptiness and, if empty, on type.

    Parameters
    ----------
    df1 : pandas.DataFrame or pandas.Series
    df2 : pandas.DataFrame or pandas.Series

    Raises
    ------
    AssertionError
        If exactly one object is empty, or both are empty but of different types.
    """
    # Raise explicitly instead of `assert False`, which is stripped under `python -O`.
    if bool(df1.empty) != bool(df2.empty):
        raise AssertionError("One of the passed frames is empty, when other isn't")
    if df1.empty and type(df1) is not type(df2):
        raise AssertionError(
            f"Empty frames have different types: {type(df1)} != {type(df2)}"
        )


def assert_all_act_same(condition, *objs):
    """
    Assert that all of the objs give the same boolean result for the passed condition (either all True or all False).

    Parameters
    ----------
    condition : callable(obj) -> bool
        Condition to run on the passed objects.
    *objs :
        Objects to pass to the condition.

    Returns
    -------
    bool
        Result of the condition (``None`` when no objects were passed).
    """
    results = [condition(obj) for obj in objs]
    if len(results) < 2:
        return results[0] if len(results) else None

    assert all(results[0] == res for res in results[1:])
    return results[0]


def assert_dtypes_equal(df1, df2):
    """
    Assert that the two passed DataFrame/Series objects have equal dtypes.

    The function doesn't require that the dtypes are identical, it has the following reliefs:
        1. The dtypes are not required to be in the same order
           (e.g. {"col1": int, "col2": float} == {"col2": float, "col1": int})
        2. The dtypes are only required to be in the same class
           (e.g. both numerical, both categorical, etc...)

    Objects that are not a pandas/Modin DataFrame or Series are silently skipped.

    Parameters
    ----------
    df1 : DataFrame or Series
    df2 : DataFrame or Series

    Raises
    ------
    AssertionError
        If the column labels differ or a pair of dtypes falls into different classes.
    """
    frame_like = (pandas.Series, pd.Series, pandas.DataFrame, pd.DataFrame)
    if not isinstance(df1, frame_like) or not isinstance(df2, frame_like):
        return

    if isinstance(df1.dtypes, (pandas.Series, pd.Series)):
        dtypes1 = df1.dtypes
        dtypes2 = df2.dtypes
    else:
        # Case when `dtypes` is a scalar
        dtypes1 = pandas.Series({"col": df1.dtypes})
        dtypes2 = pandas.Series({"col": df2.dtypes})

    # Don't require for dtypes to be in the same order
    assert len(dtypes1.index.difference(dtypes2.index)) == 0
    assert len(dtypes1) == len(dtypes2)

    # The comparison below is positional, so bring `dtypes2` into the label order
    # of `dtypes1` first; otherwise reordered columns would be compared against
    # the wrong counterpart. Duplicated labels cannot be aligned unambiguously,
    # so they keep the positional comparison.
    if dtypes1.index.is_unique and dtypes2.index.is_unique:
        dtypes2 = dtypes2.reindex(dtypes1.index)

    dtype_comparators = (
        is_numeric_dtype,
        lambda obj: is_object_dtype(obj) or is_string_dtype(obj),
        is_bool_dtype,
        lambda obj: isinstance(obj, pandas.CategoricalDtype),
        is_datetime64_any_dtype,
        is_timedelta64_dtype,
        lambda obj: isinstance(obj, pandas.PeriodDtype),
    )

    for idx in range(len(dtypes1)):
        for comparator in dtype_comparators:
            if assert_all_act_same(comparator, dtypes1.iloc[idx], dtypes2.iloc[idx]):
                # We met a dtype that both types satisfy, so we can stop iterating
                # over comparators and compare next dtypes
                break


def assert_set_of_rows_identical(df1, df2):
    """
    Assert that the set of rows for the passed dataframes is identical.

    Works much slower than ``df1.equals(df2)``, so it's recommended to use this function
    only in exceptional cases.
    """
    # replacing NaN with None to pass the comparison: 'NaN == NaN -> false; None == None -> True'
    df1, df2 = map(
        lambda df: (df.to_frame() if df.ndim == 1 else df).replace({np.nan: None}),
        (df1, df2),
    )
    # Each row becomes a hashable tuple of (index label, *values).
    rows1 = set((idx, *row.tolist()) for idx, row in df1.iterrows())
    rows2 = set((idx, *row.tolist()) for idx, row in df2.iterrows())
    assert rows1 == rows2


def sort_data(data):
    """
    Sort the passed sequence.

    DataFrames are sorted by all of their columns with the index reset, Series
    are sorted by value, and anything else goes through ``np.sort``.
    """
    if isinstance(data, (pandas.DataFrame, pd.DataFrame)):
        return data.sort_values(data.columns.to_list(), ignore_index=True)
    elif isinstance(data, (pandas.Series, pd.Series)):
        return data.sort_values()
    else:
        return np.sort(data)


def sort_if_range_partitioning(df1, df2, comparator=None, force=False):
    """
    Sort the passed objects if 'RangePartitioning' is enabled and compare the sorted results.

    Parameters
    ----------
    df1, df2 : object
        Objects to compare.
    comparator : callable(df1, df2), optional
        Comparison function, ``df_equals`` by default.
    force : bool, default: False
        Sort regardless of the 'RangePartitioning' setting.
    """
    if comparator is None:
        comparator = df_equals

    if force or RangePartitioning.get():
        df1, df2 = sort_data(df1), sort_data(df2)

    comparator(df1, df2)


def df_equals(df1, df2, check_dtypes=True):
    """Tests if df1 and df2 are equal.

    Args:
        df1: (pandas or modin DataFrame or series) dataframe to test if equal.
        df2: (pandas or modin DataFrame or series) dataframe to test if equal.

    Returns:
        True if df1 is equal to df2.
    """
    # Gets AttributError if modin's groupby object is not import like this
    from modin.pandas.groupby import DataFrameGroupBy

    groupby_types = (pandas.core.groupby.DataFrameGroupBy, DataFrameGroupBy)

    # The typing behavior of how pandas treats its index is not consistent when the
    # length of the DataFrame or Series is 0, so we just verify that the contents are
    # the same.
if ( hasattr(df1, "index") and hasattr(df2, "index") and len(df1) == 0 and len(df2) == 0 ): if type(df1).__name__ == type(df2).__name__: if hasattr(df1, "name") and hasattr(df2, "name") and df1.name == df2.name: return if ( hasattr(df1, "columns") and hasattr(df2, "columns") and df1.columns.equals(df2.columns) ): return assert False if isinstance(df1, (list, tuple)) and all( isinstance(d, (pd.DataFrame, pd.Series, pandas.DataFrame, pandas.Series)) for d in df1 ): assert isinstance(df2, type(df1)), "Different type of collection" assert len(df1) == len(df2), "Different length result" return (df_equals(d1, d2) for d1, d2 in zip(df1, df2)) if check_dtypes: assert_dtypes_equal(df1, df2) # Convert to pandas if isinstance(df1, (pd.DataFrame, pd.Series)): df1 = to_pandas(df1) if isinstance(df2, (pd.DataFrame, pd.Series)): df2 = to_pandas(df2) if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame): assert_empty_frame_equal(df1, df2) if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame): assert_frame_equal( df1, df2, check_dtype=False, check_datetimelike_compat=True, check_index_type=False, check_column_type=False, check_categorical=False, ) df_categories_equals(df1, df2) elif isinstance(df1, pandas.Index) and isinstance(df2, pandas.Index): assert_index_equal(df1, df2) elif isinstance(df1, pandas.Series) and isinstance(df2, pandas.Series): assert_series_equal(df1, df2, check_dtype=False, check_series_type=False) elif ( hasattr(df1, "dtype") and hasattr(df2, "dtype") and isinstance(df1.dtype, pandas.core.dtypes.dtypes.ExtensionDtype) and isinstance(df2.dtype, pandas.core.dtypes.dtypes.ExtensionDtype) ): assert_extension_array_equal(df1, df2) elif isinstance(df1, groupby_types) and isinstance(df2, groupby_types): for g1, g2 in zip(df1, df2): assert g1[0] == g2[0] df_equals(g1[1], g2[1]) elif ( isinstance(df1, pandas.Series) and isinstance(df2, pandas.Series) and df1.empty and df2.empty ): assert all(df1.index == df2.index) assert 
df1.dtypes == df2.dtypes elif isinstance(df1, pandas.core.arrays.NumpyExtensionArray): assert isinstance(df2, pandas.core.arrays.NumpyExtensionArray) assert df1 == df2 elif isinstance(df1, np.recarray) and isinstance(df2, np.recarray): np.testing.assert_array_equal(df1, df2) else: res = df1 != df2 if res.any() if isinstance(res, np.ndarray) else res: np.testing.assert_almost_equal(df1, df2) def modin_df_almost_equals_pandas(modin_df, pandas_df, max_diff=0.0001): df_categories_equals(modin_df._to_pandas(), pandas_df) modin_df = to_pandas(modin_df) if hasattr(modin_df, "select_dtypes"): modin_df = modin_df.select_dtypes(exclude=["category"]) if hasattr(pandas_df, "select_dtypes"): pandas_df = pandas_df.select_dtypes(exclude=["category"]) if modin_df.equals(pandas_df): return isna = modin_df.isna().all() if isinstance(isna, bool): if isna: assert pandas_df.isna().all() return elif isna.all(): assert pandas_df.isna().all().all() return diff = (modin_df - pandas_df).abs() diff /= pandas_df.abs() diff_max = diff.max() if isinstance(diff, pandas.Series) else diff.max().max() assert diff_max < max_diff, f"{diff_max} >= {max_diff}" def try_modin_df_almost_equals_compare(df1, df2): """Compare two dataframes as nearly equal if possible, otherwise compare as completely equal.""" # `modin_df_almost_equals_pandas` is numeric-only comparator dtypes1, dtypes2 = [ dtype if is_list_like(dtype := df.dtypes) else [dtype] for df in (df1, df2) ] if all(map(is_numeric_dtype, dtypes1)) and all(map(is_numeric_dtype, dtypes2)): modin_df_almost_equals_pandas(df1, df2) else: df_equals(df1, df2) def df_is_empty(df): """Tests if df is empty. Args: df: (pandas or modin DataFrame) dataframe to test if empty. Returns: True if df is empty. """ assert df.size == 0 and df.empty assert df.shape[0] == 0 or df.shape[1] == 0 def arg_keys(arg_name, keys): """Appends arg_name to the front of all values in keys. Args: arg_name: (string) String containing argument name. 
def check_df_columns_have_nans(df, cols):
    """Check whether the specified columns of a dataframe contain NaN values.

    :param df: Dataframe to check.
    :param cols: One column name or a list-like of column names and/or
        Series objects whose ``_parent`` is ``df``.
    :return: True if any of the specified columns of the dataframe contains NaNs.
    """
    if not pandas.api.types.is_list_like(cols):
        # Single column label: it must exist in the frame to be inspected.
        return cols in df.columns and df[cols].hasnans

    named_column_has_nans = any(
        isinstance(x, str) and x in df.columns and df[x].hasnans for x in cols
    )
    if named_column_has_nans:
        return True
    return any(
        isinstance(x, pd.Series) and x._parent is df and x.hasnans for x in cols
    )
except Exception as md_e: assert isinstance( md_e, type(pd_e) ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format( type(md_e), type(pd_e) ) if expected_exception: if Engine.get() == "Ray": from ray.exceptions import RayTaskError # unwrap ray exceptions from remote worker if isinstance(md_e, RayTaskError): md_e = md_e.args[0] assert ( type(md_e) is type(expected_exception) and md_e.args == expected_exception.args ), f"not acceptable Modin's exception: [{repr(md_e)}]" assert ( pd_e.args == expected_exception.args ), f"not acceptable Pandas' exception: [{repr(pd_e)}]" elif expected_exception is False: # The only way to disable exception message checking. pass else: # It’s not enough that Modin and pandas have the same types of exceptions; # we need to explicitly specify the instance of an exception # (using `expected_exception`) in tests so that we can check exception messages. # This allows us to eliminate situations where exceptions are thrown # that we don't expect, which could hide different bugs. 
def eval_io(
    fn_name,
    comparator=df_equals,
    cast_to_str=False,
    expected_exception=None,
    check_kwargs_callable=True,
    modin_warning=None,
    modin_warning_str_match=None,
    md_extra_kwargs=None,
    *args,
    **kwargs,
):
    """Run the same I/O function through Modin and pandas and compare the outputs.

    Parameters
    ----------
    fn_name : str
        I/O operation name ("read_csv" for example); looked up on both the
        Modin and the pandas module.
    comparator : callable
        Function used to compare the two results.
    cast_to_str : bool
        There could be some mismatches in dtypes, so the whole result is cast
        to `str` before comparison when this is set. See issue #1931 for details.
    expected_exception : Exception
        Forwarded to ``eval_general``.
    check_kwargs_callable : bool
        Forwarded to ``eval_general``.
    modin_warning : obj
        Warning that should be raised while the comparison runs.
    modin_warning_str_match : str
        If `modin_warning` is set, checks that the raised warning matches this string.
    md_extra_kwargs : dict
        Modin operation specific kwargs.
    *args, **kwargs :
        Forwarded to ``eval_general``.
    """

    def _call_io_function(module, *fn_args, **fn_kwargs):
        io_function = getattr(module, fn_name)
        outcome = io_function(*fn_args, **fn_kwargs)
        if cast_to_str:
            outcome = outcome.astype(str)
        if isinstance(outcome, (pd.DataFrame, pd.Series)):
            # Input methods that return a dataframe, e.g. read_csv, should
            # return a dataframe with engine and storage_format that match
            # the default Engine and StorageFormat, respectively.
            assert outcome._query_compiler.engine == Engine.get()
            assert outcome._query_compiler.storage_format == StorageFormat.get()
        return outcome

    def _run_comparison():
        eval_general(
            pd,
            pandas,
            _call_io_function,
            comparator=comparator,
            expected_exception=expected_exception,
            check_kwargs_callable=check_kwargs_callable,
            md_extra_kwargs=md_extra_kwargs,
            *args,
            **kwargs,
        )

    if not modin_warning:
        _run_comparison()
        return

    # The match string only matters when a warning is actually expected.
    with pytest.warns(modin_warning, match=modin_warning_str_match):
        _run_comparison()
""" with open(unique_filename, "w") as f: f.write(csv_str) eval_io( filepath_or_buffer=unique_filename, fn_name="read_csv", **kwargs, ) def create_test_dfs( *args, post_fn=None, backend=None, **kwargs ) -> tuple[pd.DataFrame, pandas.DataFrame]: if post_fn is None: post_fn = lambda df: ( # noqa: E731 df.convert_dtypes(dtype_backend=backend) if backend is not None else df ) elif backend is not None: post_fn = lambda df: post_fn(df).convert_dtypes( # noqa: E731 dtype_backend=backend ) return tuple( map(post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]) ) def create_test_series( vals, sort=False, backend=None, **kwargs ) -> tuple[pd.Series, pandas.Series]: if isinstance(vals, dict): modin_series = pd.Series(vals[next(iter(vals.keys()))], **kwargs) pandas_series = pandas.Series(vals[next(iter(vals.keys()))], **kwargs) else: modin_series = pd.Series(vals, **kwargs) pandas_series = pandas.Series(vals, **kwargs) if sort: modin_series = modin_series.sort_values().reset_index(drop=True) pandas_series = pandas_series.sort_values().reset_index(drop=True) if backend is not None: modin_series = modin_series.convert_dtypes(dtype_backend=backend) pandas_series = pandas_series.convert_dtypes(dtype_backend=backend) return modin_series, pandas_series def generate_dfs(): df = pandas.DataFrame( { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 10, 11], "col4": [12, 13, 14, 15], "col5": [0, 0, 0, 0], } ) df2 = pandas.DataFrame( { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 10, 11], "col6": [12, 13, 14, 15], "col7": [0, 0, 0, 0], } ) return df, df2 def generate_multiindex_dfs(axis=1): def generate_multiindex(index): return pandas.MultiIndex.from_tuples( [("a", x) for x in index.values], names=["name1", "name2"] ) df1, df2 = generate_dfs() df1.axes[axis], df2.axes[axis] = map( generate_multiindex, [df1.axes[axis], df2.axes[axis]] ) return df1, df2 def generate_multiindex(elements_number, nlevels=2, is_tree_like=False): def 
def get_unique_filename(
    test_name: str = "test",
    kwargs: dict = {},
    extension: str = "csv",
    data_dir: Union[str, Path] = "",
    suffix: str = "",
    debug_mode=False,
):
    """Return a unique file name built from the specified parameters.

    Parameters
    ----------
    test_name: str
        Name of the test for which the unique file name is needed.
        Only used when `debug_mode` is set.
    kwargs: dict
        Test parameters whose values become part of the name when
        `debug_mode` is set.
    extension: str, default: "csv"
        Extension of the file, without the leading ".".
    data_dir: Union[str, Path]
        Data directory where test files will be created.
    suffix: str
        String to append to the resulting name.
    debug_mode: bool, default: False
        Build a readable name from `test_name` and the `kwargs` values.
        Otherwise a UUID-based name is generated.

    Returns
    -------
    Unique file name.
    """
    name_tail = (f"_{suffix}" if suffix else "") + (
        f".{extension}" if extension else ""
    )

    if not debug_mode:
        import uuid

        return os.path.join(data_dir, uuid.uuid1().hex + name_tail)

    # shortcut if kwargs parameter are not provided
    if not len(kwargs) and extension == "csv" and suffix == "":
        return os.path.join(data_dir, test_name + name_tail)

    assert "." not in extension, "please provide pure extension name without '.'"

    prohibited_chars = ['"', "\n"]
    placeholder_prefix = "np_char"
    placeholder_count = 0
    printable_values = []
    for value in dict(kwargs).values():
        # A value is replaced once per prohibited character it matches, and
        # the counter advances each time, so the last placeholder number wins.
        hits = sum(
            1
            for char in prohibited_chars
            if isinstance(value, str) and char in value or callable(value)
        )
        if hits:
            placeholder_count += hits
            value = placeholder_prefix + str(placeholder_count - 1)
        printable_values.append(value)

    rendered = []
    for value in printable_values:
        if isinstance(value, (list, tuple)):
            rendered.append("_".join(map(str, value)))
        else:
            rendered.append(str(value))

    return os.path.join(data_dir, test_name + "_".join(rendered) + name_tail)
def check_file_leaks(func):
    """
    Decorate ``func`` so that it fails if it leaves *newly* opened file handles behind.

    When ``TrackFileLeaks`` is disabled, ``func`` is returned unchanged.
    """
    if not TrackFileLeaks.get():
        return func

    @functools.wraps(func)
    def check(*a, **kw):
        handles_before = _get_open_files()
        try:
            return func(*a, **kw)
        finally:
            leaked_paths = []
            for handle in _get_open_files():
                if handle in handles_before:
                    # Each pre-existing handle accounts for exactly one entry.
                    handles_before.remove(handle)
                    continue
                path = handle[0]
                # Ignore files in /proc/, as they have nothing to do with
                # modin reading any data (and this is what we care about).
                if path.startswith("/proc/"):
                    continue
                # Ignore files in /tmp/ray/session_*/logs (ray session logs)
                # because Ray intends to keep these logs open even after
                # work has been done.
                if re.search(r"/tmp/ray/session_.*/logs", path):
                    continue
                leaked_paths.append(path)
            assert (
                not leaked_paths
            ), f"Unexpected open handles left for: {', '.join(leaked_paths)}"

    return check
def sort_index_for_equal_values(df, ascending=True):
    """
    Sort the index labels of ``df`` within each group of equal rows.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to process. Its index is converted to strings in place when
        the index dtype is float64.
    ascending : bool, default: True
        Sort direction passed to ``sort_index`` for every group.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The grouped-and-sorted result carrying the original index names.
    """
    if df.index.dtype == np.float64:
        # HACK: workaround for pandas bug:
        # https://github.com/pandas-dev/pandas/issues/34455
        df.index = df.index.astype("str")

    def _sort_group_index(group):
        return group.sort_index(ascending=ascending)

    grouping_key = df if df.ndim == 1 else df.columns
    grouped = df.groupby(by=grouping_key, sort=False)
    res = grouped.apply(_sort_group_index)

    has_extra_level = res.index.nlevels > df.index.nlevels
    if has_extra_level:
        # Sometimes GroupBy adds an extra level with 'by' to the result index.
        # GroupBy is very inconsistent about when it's doing this, so the
        # level count is compared instead of relying on a fixed rule.
        res.index = res.index.droplevel(0)
    # GroupBy overwrites original index names with 'by', so restore them here.
    res.index.names = df.index.names
    return res
""" if force or not os.path.exists(filename): df = pandas.DataFrame( {f"col{x + 1}": np.arange(nrows) for x in range(ncols)} ) getattr(df, func)(filename, **func_kw if func_kw else {}) file_type_to_extension = { "excel": "xlsx", "fwf": "txt", "pickle": "pkl", } extension = file_type_to_extension.get(file_type, file_type) def _make_default_file(nrows=NROWS, ncols=2, force=True, **kwargs): filename = get_unique_filename(extension=extension, data_dir=data_dir) if file_type == "json": lines = kwargs.get("lines") func_kw = {"lines": lines, "orient": "records"} if lines else {} _create_file(filename, force, nrows, ncols, "to_json", func_kw) elif file_type in ("html", "excel", "feather", "stata", "pickle"): _create_file(filename, force, nrows, ncols, f"to_{file_type}") elif file_type == "hdf": func_kw = {"key": "df", "format": kwargs.get("format")} _create_file(filename, force, nrows, ncols, "to_hdf", func_kw) elif file_type == "fwf": if force or not os.path.exists(filename): fwf_data = kwargs.get("fwf_data") if fwf_data is None: with open("modin/tests/pandas/data/test_data.fwf", "r") as fwf_file: fwf_data = fwf_file.read() with open(filename, "w") as f: f.write(fwf_data) else: raise ValueError(f"Unsupported file type: {file_type}") return filename return _make_default_file def value_equals(obj1, obj2): """Check wherher two scalar or list-like values are equal and raise an ``AssertionError`` if they aren't.""" if is_list_like(obj1): np.testing.assert_array_equal(obj1, obj2) else: assert (obj1 == obj2) or (np.isnan(obj1) and np.isnan(obj2)) def dict_equals(dict1, dict2): """Check whether two dictionaries are equal and raise an ``AssertionError`` if they aren't.""" for key1, key2 in itertools.zip_longest(sorted(dict1), sorted(dict2)): value_equals(key1, key2) value_equals(dict1[key1], dict2[key2]) @contextmanager def switch_execution(engine: str, storage_format: str): old_engine = Engine.get() old_storage = StorageFormat.get() try: set_execution(engine, storage_format) 
def is_native_shallow_copy() -> bool:
    """Tell whether the current configuration is the "Pandas" backend with shallow copies.

    Returns
    -------
    bool
        True only when ``Backend`` is "Pandas", ``NativePandasDeepCopy`` is
        disabled and the pandas option ``mode.copy_on_write`` is off.
    """
    if Backend.get() != "Pandas":
        return False
    if NativePandasDeepCopy.get():
        return False
    return not pandas.get_option("mode.copy_on_write")
def test_dataframe_api_standard() -> None:
    """
    Smoke-test the dataframe consortium standard entry points.

    Full testing is done at https://github.com/data-apis/dataframe-api-compat,
    this only checks that the entry points work as expected. The test is
    skipped when ``dataframe_api_compat`` cannot be imported.
    """
    pytest.importorskip("dataframe_api_compat")

    modin_frame = modin.pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    standard_frame = modin_frame.__dataframe_consortium_standard__()
    assert standard_frame.get_column_names() == ["a", "b"]

    modin_series = modin.pandas.Series([1, 2, 3])
    standard_column = modin_series.__column_consortium_standard__()
    assert standard_column.get_value(1) == 2
You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import importlib
import pkgutil
from concurrent.futures import ThreadPoolExecutor
from urllib.error import HTTPError
from urllib.request import urlopen

import pytest

import modin.pandas
from modin.utils import PANDAS_API_URL_TEMPLATE


@pytest.fixture
def doc_urls(get_generated_doc_urls):
    """Import every module under ``modin.pandas`` and return the sorted generated doc URLs."""
    # Import _everything_ under 'modin.pandas' so that all docstrings get generated.
    for module_info in pkgutil.walk_packages(modin.pandas.__path__, "modin.pandas."):
        try:
            importlib.import_module(module_info.name)
        except ModuleNotFoundError:
            # An optional 3rd-party dependency is missing; skip this module.
            continue
    return sorted(get_generated_doc_urls())


def test_all_urls_exist(doc_urls):
    """Open every generated documentation URL and fail if any answers with an HTTP error."""
    # TODO: remove the hack after pandas fixes it
    known_broken_methods = (
        "pandas.DataFrame.flags",
        "pandas.Series.info",
        "pandas.DataFrame.isetitem",
        "pandas.Series.swapaxes",
        "pandas.DataFrame.to_numpy",
        "pandas.Series.axes",
        "pandas.Series.divmod",
        "pandas.Series.rdivmod",
    )
    for method_name in known_broken_methods:
        doc_urls.remove(PANDAS_API_URL_TEMPLATE.format(method_name))

    unreachable = []

    def _probe(url):
        try:
            with urlopen(url):
                pass
        except HTTPError:
            unreachable.append(url)

    # NOTE: the iterator returned by ``pool.map`` is never consumed, so only
    # HTTPError is recorded; any other exception raised in a worker is not
    # re-raised here.
    with ThreadPoolExecutor(32) as pool:
        pool.map(_probe, doc_urls)

    assert not unreachable, "Invalid URLs detected"


================================================
FILE: modin/tests/test_envvar_catcher.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.
The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os

import pytest


@pytest.fixture
def nameset():
    """Set a throw-away, non-Modin environment variable for the test and delete it afterwards."""
    variable = "hey_i_am_an_env_var"
    os.environ[variable] = "i am a value"
    yield variable
    del os.environ[variable]


def test_envvar_catcher(nameset):
    """Check which ``os.environ`` accesses are expected to trip an assertion."""
    # Reading or probing a Modin-prefixed variable (in any letter case) is
    # expected to raise AssertionError.
    # NOTE(review): the hook raising it is not defined in this module;
    # presumably it is installed by a conftest - confirm.
    with pytest.raises(AssertionError):
        os.environ.get("Modin_FOO", "bar")
    with pytest.raises(AssertionError):
        "modin_qux" not in os.environ

    # Variables without the Modin prefix stay freely accessible.
    assert "yay_random_name" not in os.environ
    assert os.environ[nameset]


================================================
FILE: modin/tests/test_envvar_npartitions.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import numpy as np
import pytest

import modin.pandas as pd
from modin.config import NPartitions


@pytest.fixture(autouse=True)
def _restore_npartitions():
    """
    Put ``NPartitions`` back to its pre-test value after every test in this module.

    The tests below overwrite the global ``NPartitions`` setting; without this
    fixture the last value written would leak into whatever test runs next.
    """
    previous = NPartitions.get()
    yield
    NPartitions.put(previous)


def _assert_partition_grid(df, num_partitions):
    """
    Assert that `df` is split into the partition grid expected for `num_partitions`.

    Parameters
    ----------
    df : modin.pandas.DataFrame
        Frame whose internal partition array is inspected.
    num_partitions : int
        Value of ``NPartitions`` that was active when `df` was created.
    """
    part_shape = df._query_compiler._modin_frame._partitions.shape
    # The test data has 2**8 columns and the tests expect at most 8 column partitions.
    assert part_shape[0] == num_partitions and part_shape[1] == min(num_partitions, 8)


@pytest.mark.parametrize("num_partitions", [2, 4, 6, 8, 10])
def test_set_npartitions(num_partitions):
    """Frames created after ``NPartitions.put`` use the requested partition count."""
    NPartitions.put(num_partitions)
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)
    _assert_partition_grid(df, num_partitions)


@pytest.mark.parametrize("left_num_partitions", [2, 4, 6, 8, 10])
@pytest.mark.parametrize("right_num_partitions", [2, 4, 6, 8, 10])
def test_runtime_change_npartitions(left_num_partitions, right_num_partitions):
    """Changing ``NPartitions`` between two frame creations affects only the later frame."""
    NPartitions.put(left_num_partitions)
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    left_df = pd.DataFrame(data)
    _assert_partition_grid(left_df, left_num_partitions)

    NPartitions.put(right_num_partitions)
    right_df = pd.DataFrame(data)
    _assert_partition_grid(right_df, right_num_partitions)


================================================
FILE: modin/tests/test_executions_api.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied.
See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.core.storage_formats import BaseQueryCompiler, PandasQueryCompiler

BASE_EXECUTION = BaseQueryCompiler
EXECUTIONS = [PandasQueryCompiler]


def test_base_abstract_methods():
    """Only an explicit allow-list of methods may remain abstract on the base query compiler."""
    allowed_abstract_methods = [
        "__init__",
        "free",
        "finalize",
        "execute",
        "to_pandas",
        "from_pandas",
        "from_arrow",
        "default_to_pandas",
        "from_interchange_dataframe",
        "to_interchange_dataframe",
        "engine",
        "storage_format",
    ]

    # Sorted so the failure message is stable and easy to read.
    not_implemented_methods = sorted(
        BASE_EXECUTION.__abstractmethods__.difference(allowed_abstract_methods)
    )

    assert (
        not not_implemented_methods
    ), f"{BASE_EXECUTION} has not implemented abstract methods: {not_implemented_methods}"


@pytest.mark.parametrize("execution", EXECUTIONS)
def test_api_consistent(execution):
    """A concrete query compiler must not expose public names missing from the base class."""
    base_methods = set(BASE_EXECUTION.__dict__)
    custom_methods = {key for key in execution.__dict__ if not key.startswith("_")}
    extra_methods = custom_methods - base_methods
    # Custom executions must not implement extra API methods.
    assert (
        not extra_methods
    ), f"{execution} implement these extra methods: {extra_methods}"


================================================
FILE: modin/tests/test_headers.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
from os.path import abspath, dirname

# This is the python file root directory (modin/modin)
rootdir = dirname(dirname(abspath(__file__)))
exclude_files = ["_version.py"]


def test_headers():
    """Every ``.py`` file under ``rootdir`` must start with the lines of ``LICENSE_HEADER``."""
    with open("{}{}".format(dirname(rootdir), "/LICENSE_HEADER"), "r") as header_file:
        # Kept as a list of lines so each line is compared individually.
        header_lines = header_file.readlines()

    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            if not file.endswith(".py") or file in exclude_files:
                continue
            filepath = os.path.join(subdir, file)
            with open(filepath, "r", encoding="utf8") as py_file:
                py_file_lines = py_file.readlines()
            # NOTE: ``zip`` stops at the shorter input, so a file with fewer
            # lines than the header is only checked for the lines it has.
            for left, right in zip(header_lines, py_file_lines[: len(header_lines)]):
                assert left == right, "Unexpected license header in {}".format(
                    filepath
                )


def test_line_endings():
    """No file under ``rootdir`` (``.parquet`` files excepted) may contain CRLF line endings."""
    # Same directory as the module-level ``rootdir``.
    rootdir = dirname(dirname(abspath(__file__)))
    for subdir, _dirs, files in os.walk(rootdir):
        if any(i in subdir for i in [".git", ".idea", "__pycache__"]):
            continue
        for file in files:
            if file.endswith(".parquet"):
                continue
            filepath = os.path.join(subdir, file)
            # Read-only binary mode: the check never writes, and "rb+" would
            # fail on files that are not writable.
            with open(filepath, "rb") as binary_file:
                file_contents = binary_file.read()
            new_contents = file_contents.replace(b"\r\n", b"\n")
            assert new_contents == file_contents, "File has CRLF: {}".format(filepath)


================================================
FILE: modin/tests/test_logging.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.
The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import collections
import logging

import pytest

import modin.logging
from modin.config import LogMode


class _FakeLogger:
    """In-memory stand-in for a logger that records formatted messages per level."""

    # Registry of fake loggers, keyed by namespace.
    _loggers = {}

    def __init__(self, namespace):
        self.messages = collections.defaultdict(list)
        self.namespace = namespace

    def log(self, log_level, message, *args, **kw):
        """Record `message`, formatted with the given arguments, under `log_level`."""
        rendered = message.format(*args, **kw)
        self.messages[log_level].append(rendered)

    def exception(self, message, *args, **kw):
        """Record `message`, formatted with the given arguments, under the "exception" key."""
        rendered = message.format(*args, **kw)
        self.messages["exception"].append(rendered)

    @classmethod
    def make(cls, namespace):
        """Return the fake logger registered for `namespace`, creating it on first use."""
        if namespace not in cls._loggers:
            cls._loggers[namespace] = cls(namespace)
        return cls._loggers[namespace]

    @classmethod
    def get(cls, namespace="modin.logger.default"):
        """Return the recorded messages of the logger registered for `namespace`."""
        registered = cls._loggers[namespace]
        return registered.messages

    @classmethod
    def clear(cls):
        """Forget every registered fake logger."""
        cls._loggers = {}


def _get_logger(namespace="modin.logger.default"):
    """Replacement for ``logging.getLogger`` that hands out fake loggers."""
    return _FakeLogger.make(namespace)


def mock_get_logger(ctx):
    """Patch ``logging.getLogger`` inside the monkeypatch context `ctx`."""
    ctx.setattr(logging, "getLogger", _get_logger)


@pytest.fixture
def get_log_messages():
    """Enable logging for the test and yield the accessor for recorded messages."""
    previous_mode = LogMode.get()
    LogMode.enable()
    modin.logging.get_logger()  # initialize the logging prior to mocking getLogger()
    yield _FakeLogger.get
    _FakeLogger.clear()
    LogMode.put(previous_mode)


def test_function_decorator(monkeypatch, get_log_messages):
    @modin.logging.enable_logging
    def func(do_raise):
        if do_raise:
            raise ValueError()

    with monkeypatch.context() as ctx:
        # NOTE: we cannot patch in the fixture, as mocking logging.getLogger()
        # without monkeypatch.context() breaks pytest
        mock_get_logger(ctx)
        func(do_raise=False)
        with pytest.raises(ValueError):
            func(do_raise=True)

    first_info_message = get_log_messages()[logging.INFO][0]
    assert "func" in first_info_message
    assert "START" in first_info_message
    assert get_log_messages("modin.logger.errors")["exception"] == [
        "STOP::PANDAS-API::func"
    ]


def test_function_decorator_on_outer_function_6237(monkeypatch, get_log_messages):
    @modin.logging.enable_logging
    def inner_func():
        raise ValueError()

    @modin.logging.enable_logging
    def outer_func():
        inner_func()

    with monkeypatch.context() as ctx:
        # NOTE: we cannot patch in the fixture, as mocking logging.getLogger()
        # without monkeypatch.context() breaks pytest
        mock_get_logger(ctx)
        with pytest.raises(ValueError):
            outer_func()

    # Exactly one exception entry is expected, and it names the inner function.
    assert get_log_messages("modin.logger.errors")["exception"] == [
        "STOP::PANDAS-API::inner_func"
    ]


def test_class_decorator(monkeypatch, get_log_messages):
    @modin.logging.enable_logging("CUSTOM")
    class Foo:
        def method1(self):
            pass

        @classmethod
        def method2(cls):
            pass

        @staticmethod
        def method3():
            pass

    class Bar(Foo):
        def method4(self):
            pass

    with monkeypatch.context() as ctx:
        mock_get_logger(ctx)
        Foo().method1()
        Foo.method2()
        Foo.method3()

        Bar().method1()
        Bar().method4()

    # ``Bar.method4`` produces no entries; the inherited ``method1`` is
    # reported under ``Foo``.
    assert get_log_messages()[logging.INFO] == [
        "START::CUSTOM::Foo.method1",
        "STOP::CUSTOM::Foo.method1",
        "START::CUSTOM::Foo.method2",
        "STOP::CUSTOM::Foo.method2",
        "START::CUSTOM::Foo.method3",
        "STOP::CUSTOM::Foo.method3",
        "START::CUSTOM::Foo.method1",
        "STOP::CUSTOM::Foo.method1",
    ]


def test_class_inheritance(monkeypatch, get_log_messages):
    class Foo(modin.logging.ClassLogger, modin_layer="CUSTOM"):
        def method1(self):
            pass

    class Bar(Foo):
        def method2(self):
            pass

    with monkeypatch.context() as ctx:
        mock_get_logger(ctx)
        Foo().method1()
        Bar().method1()
        Bar().method2()

    # Here the subclass's own ``method2`` is logged too, under ``Bar``.
    assert get_log_messages()[logging.INFO] == [
        "START::CUSTOM::Foo.method1",
        "STOP::CUSTOM::Foo.method1",
        "START::CUSTOM::Foo.method1",
        "STOP::CUSTOM::Foo.method1",
        "START::CUSTOM::Bar.method2",
        "STOP::CUSTOM::Bar.method2",
    ]


================================================
FILE:
modin/tests/test_metrics.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from typing import Union

import pytest

import modin.logging
import modin.pandas as pd
from modin.config import MetricsMode
from modin.logging.metrics import (
    _metric_handlers,
    add_metric_handler,
    clear_metric_handler,
    emit_metric,
)


class FakeTelemetryClient:
    """Minimal metrics consumer offering one working and one failing handler."""

    def __init__(self):
        # Last value seen per metric name (filled by ``metric_handler_pass``).
        self._metrics = {}
        # Handler currently registered by the test; cleared by the fixture teardown.
        self._metric_handler = None

    def metric_handler_fail(self, name: str, value: Union[int, float]):
        """Handler that always raises, to simulate a faulty consumer."""
        raise KeyError("Poorly implemented metric handler")

    def metric_handler_pass(self, name: str, value: Union[int, float]):
        """Handler that stores `value` under `name`."""
        self._metrics[name] = value


@modin.logging.enable_logging
def func(do_raise):
    """Logged helper that raises ``ValueError`` when `do_raise` is truthy."""
    if do_raise:
        raise ValueError()


@pytest.fixture()
def metric_client():
    """Enable metrics, yield a fresh fake client, then unregister its handler and disable metrics."""
    MetricsMode.enable()
    client = FakeTelemetryClient()
    yield client
    clear_metric_handler(client._metric_handler)
    MetricsMode.disable()


def test_metrics_api_timings(metric_client):
    """A registered handler receives one positive timing for a logged function call."""
    assert len(_metric_handlers) == 0
    metric_client._metric_handler = metric_client.metric_handler_pass
    add_metric_handler(metric_client._metric_handler)
    assert len(_metric_handlers) == 1
    assert _metric_handlers[0] == metric_client._metric_handler
    func(do_raise=False)
    assert len(metric_client._metrics) == 1
    assert metric_client._metrics["modin.pandas-api.func"] is not None
    assert metric_client._metrics["modin.pandas-api.func"] > 0.0


def test_df_metrics(metric_client):
    """DataFrame operations emit metrics, including one for ``DataFrame.sum``."""
    metric_client._metric_handler = metric_client.metric_handler_pass
    add_metric_handler(metric_client._metric_handler)
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.sum()
    # NOTE(review): the exact count is tied to how many logged calls this
    # snippet triggers internally - expect it to need updating when those change.
    assert len(metric_client._metrics) == 54
    assert metric_client._metrics["modin.pandas-api.dataframe.sum"] is not None
    assert metric_client._metrics["modin.pandas-api.dataframe.sum"] > 0.0


def test_metrics_handler_fails(metric_client):
    """A handler that raises is expected to be gone from the registry after the call."""
    assert len(metric_client._metrics) == 0
    metric_client._metric_handler = metric_client.metric_handler_fail
    add_metric_handler(metric_client._metric_handler)
    assert len(_metric_handlers) == 1
    func(do_raise=False)
    assert len(_metric_handlers) == 0
    assert len(metric_client._metrics) == 0


def test_emit_name_enforced():
    """Emitting a metric with an invalid name raises ``KeyError`` while metrics are enabled."""
    # This test does not use the ``metric_client`` fixture, so it has to
    # restore the metrics mode itself instead of leaking "enabled" to later tests.
    previous_mode = MetricsMode.get()
    MetricsMode.enable()
    try:
        with pytest.raises(KeyError):
            emit_metric("Not::A::Valid::Metric::Name", 1.0)
    finally:
        MetricsMode.put(previous_mode)


def test_metrics_can_be_opt_out(metric_client):
    """Disabling metrics after a handler was added stops emission to that handler."""
    MetricsMode.enable()
    assert len(metric_client._metrics) == 0
    metric_client._metric_handler = metric_client.metric_handler_pass
    add_metric_handler(metric_client._metric_handler)
    # If Metrics are disabled after the addition of a handler
    # no metrics are emitted
    MetricsMode.disable()
    assert len(_metric_handlers) == 1
    func(do_raise=False)
    assert len(metric_client._metrics) == 0


================================================
FILE: modin/tests/test_partition_api.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import numpy as np import pandas import pytest import modin.pandas as pd from modin.config import Engine, NPartitions from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher from modin.distributed.dataframe.pandas import from_partitions, unwrap_partitions from modin.pandas.indexing import compute_sliced_len from modin.tests.pandas.utils import df_equals, test_data PartitionClass = ( FactoryDispatcher.get_factory().io_cls.frame_cls._partition_mgr_cls._partition_class ) if Engine.get() == "Ray": from modin.core.execution.ray.common import RayWrapper from modin.core.execution.ray.common.utils import ObjectIDType put_func = RayWrapper.put get_func = RayWrapper.materialize is_future = lambda obj: isinstance(obj, ObjectIDType) # noqa: E731 elif Engine.get() == "Dask": from distributed import Future from modin.core.execution.dask.common import DaskWrapper # Looks like there is a key collision; # https://github.com/dask/distributed/issues/3703#issuecomment-619446739 # recommends to use `hash=False`. Perhaps this should be the default value of `put`. 
put_func = lambda obj: DaskWrapper.put(obj, hash=False) # noqa: E731 get_func = DaskWrapper.materialize is_future = lambda obj: isinstance(obj, Future) # noqa: E731 elif Engine.get() == "Unidist": from unidist import is_object_ref from modin.core.execution.unidist.common import UnidistWrapper put_func = UnidistWrapper.put get_func = UnidistWrapper.materialize is_future = is_object_ref elif Engine.get() == "Python": put_func = lambda x: x # noqa: E731 get_func = lambda x: x # noqa: E731 is_future = lambda obj: isinstance(obj, object) # noqa: E731 else: raise NotImplementedError( f"'{Engine.get()}' engine is not supported by these test suites" ) NPartitions.put(4) # HACK: implicit engine initialization (Modin issue #2989) pd.DataFrame([]) @pytest.mark.parametrize("axis", [None, 0, 1]) @pytest.mark.parametrize("reverse_index", [True, False]) @pytest.mark.parametrize("reverse_columns", [True, False]) def test_unwrap_partitions(axis, reverse_index, reverse_columns): data = test_data["int_data"] def get_df(lib, data): df = lib.DataFrame(data) if reverse_index: df.index = df.index[::-1] if reverse_columns: df.columns = df.columns[::-1] return df df = get_df(pd, data) # `df` should not have propagated the index and column updates to its # partitions yet. The partitions of `expected_df` should have the updated # metadata because we construct `expected_df` directly from the updated # pandas dataframe. 
expected_df = pd.DataFrame(get_df(pandas, data)) expected_partitions = expected_df._query_compiler._modin_frame._partitions if axis is None: actual_partitions = np.array(unwrap_partitions(df, axis=axis)) assert expected_partitions.shape == actual_partitions.shape for row_idx in range(expected_partitions.shape[0]): for col_idx in range(expected_partitions.shape[1]): df_equals( get_func(expected_partitions[row_idx][col_idx].list_of_blocks[0]), get_func(actual_partitions[row_idx][col_idx]), ) else: expected_axis_partitions = ( expected_df._query_compiler._modin_frame._partition_mgr_cls.axis_partition( expected_partitions, axis ^ 1 ) ) expected_axis_partitions = [ axis_partition.force_materialization().unwrap(squeeze=True) for axis_partition in expected_axis_partitions ] actual_axis_partitions = unwrap_partitions(df, axis=axis) assert len(expected_axis_partitions) == len(actual_axis_partitions) for item_idx in range(len(expected_axis_partitions)): if Engine.get() in ["Ray", "Dask", "Unidist"]: df_equals( get_func(expected_axis_partitions[item_idx]), get_func(actual_axis_partitions[item_idx]), ) def test_unwrap_virtual_partitions(): # see #5164 for details data = test_data["int_data"] df = pd.DataFrame(data) virtual_partitioned_df = pd.concat([df] * 10) actual_partitions = np.array(unwrap_partitions(virtual_partitioned_df, axis=None)) expected_df = pd.concat([pd.DataFrame(data)] * 10) expected_partitions = expected_df._query_compiler._modin_frame._partitions assert expected_partitions.shape == actual_partitions.shape for row_idx in range(expected_partitions.shape[0]): for col_idx in range(expected_partitions.shape[1]): df_equals( get_func( expected_partitions[row_idx][col_idx] .force_materialization() .list_of_blocks[0] ), get_func(actual_partitions[row_idx][col_idx]), ) @pytest.mark.parametrize("column_widths", [None, "column_widths"]) @pytest.mark.parametrize("row_lengths", [None, "row_lengths"]) @pytest.mark.parametrize("columns", [None, "columns"]) 
@pytest.mark.parametrize("index", [None, "index"]) @pytest.mark.parametrize("axis", [None, 0, 1]) def test_from_partitions(axis, index, columns, row_lengths, column_widths): data = test_data["int_data"] df1, df2 = pandas.DataFrame(data), pandas.DataFrame(data) num_rows, num_cols = df1.shape expected_df = pandas.concat([df1, df2], axis=1 if axis is None else axis) index = expected_df.index if index == "index" else None columns = expected_df.columns if columns == "columns" else None row_lengths = ( None if row_lengths is None else [num_rows, num_rows] if axis == 0 else [num_rows] ) column_widths = ( None if column_widths is None else [num_cols] if axis == 0 else [num_cols, num_cols] ) futures = [] if axis is None: futures = [[put_func(df1), put_func(df2)]] else: futures = [put_func(df1), put_func(df2)] actual_df = from_partitions( futures, axis, index=index, columns=columns, row_lengths=row_lengths, column_widths=column_widths, ) df_equals(expected_df, actual_df) @pytest.mark.parametrize("columns", ["original_col", "new_col"]) @pytest.mark.parametrize("index", ["original_idx", "new_idx"]) @pytest.mark.parametrize("axis", [None, 0, 1]) def test_from_partitions_mismatched_labels(axis, index, columns): expected_df = pd.DataFrame(test_data["int_data"]) partitions = unwrap_partitions(expected_df, axis=axis) index = ( expected_df.index if index == "original_idx" else [f"row{i}" for i in expected_df.index] ) columns = ( expected_df.columns if columns == "original_col" else [f"col{i}" for i in expected_df.columns] ) expected_df.index = index expected_df.columns = columns actual_df = from_partitions(partitions, axis=axis, index=index, columns=columns) df_equals(expected_df, actual_df) @pytest.mark.parametrize("row_labels", [[0, 2], slice(None)]) @pytest.mark.parametrize("col_labels", [[0, 2], slice(None)]) @pytest.mark.parametrize("is_length_future", [False, True]) @pytest.mark.parametrize("is_width_future", [False, True]) def test_mask_preserve_cache(row_labels, col_labels, 
is_length_future, is_width_future): def deserialize(obj): if is_future(obj): return get_func(obj) return obj def compute_length(indices, length): if not isinstance(indices, slice): return len(indices) return compute_sliced_len(indices, length) df = pandas.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [9, 10, 11, 12]}) obj_id = put_func(df) partition_shape = [ put_func(len(df)) if is_length_future else len(df), put_func(len(df.columns)) if is_width_future else len(df.columns), ] source_partition = PartitionClass(obj_id, *partition_shape) masked_partition = source_partition.mask( row_labels=row_labels, col_labels=col_labels ) expected_length = compute_length(row_labels, len(df)) expected_width = compute_length(col_labels, len(df.columns)) # Check that the cache is preserved assert expected_length == deserialize(masked_partition._length_cache) assert expected_width == deserialize(masked_partition._width_cache) # Check that the cache is interpreted properly assert expected_length == masked_partition.length() assert expected_width == masked_partition.width() # Recompute shape explicitly to check that the cached data was correct expected_length, expected_width = [ masked_partition._length_cache, masked_partition._width_cache, ] masked_partition._length_cache = None masked_partition._width_cache = None assert expected_length == masked_partition.length() assert expected_width == masked_partition.width() ================================================ FILE: modin/tests/test_utils.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import contextlib import json from textwrap import dedent, indent from typing import Optional, Union from unittest.mock import Mock, patch import numpy as np import pandas import pytest import modin.pandas as pd import modin.utils from modin.config import Engine, StorageFormat from modin.error_message import ErrorMessage from modin.tests.pandas.utils import create_test_dfs # Note: classes below are used for purely testing purposes - they # simulate real-world use cases for _inherit_docstring class BaseParent: def method(self): """ordinary method (base)""" def base_method(self): """ordinary method in base only""" @property def prop(self): """property""" @staticmethod def static(): """static method""" @classmethod def clsmtd(cls): """class method""" class BaseChild(BaseParent): """this is class docstring""" def method(self): """ordinary method (child)""" def own_method(self): """own method""" def no_overwrite(self): """another own method""" F = property(method) @pytest.fixture(scope="module") def wrapped_cls(): @modin.utils._inherit_docstrings(BaseChild) class Wrapped: def method(self): pass def base_method(self): pass def own_method(self): pass def no_overwrite(self): """not overwritten doc""" @property def prop(self): return None @staticmethod def static(): pass @classmethod def clsmtd(cls): pass F = property(method) return Wrapped def _check_doc(wrapped, orig): assert wrapped.__doc__ == orig.__doc__ if isinstance(wrapped, property): assert wrapped.fget.__doc_inherited__ else: assert wrapped.__doc_inherited__ def test_doc_inherit_clslevel(wrapped_cls): 
_check_doc(wrapped_cls, BaseChild) def test_doc_inherit_methods(wrapped_cls): _check_doc(wrapped_cls.method, BaseChild.method) _check_doc(wrapped_cls.base_method, BaseParent.base_method) _check_doc(wrapped_cls.own_method, BaseChild.own_method) assert wrapped_cls.no_overwrite.__doc__ != BaseChild.no_overwrite.__doc__ assert not getattr(wrapped_cls.no_overwrite, "__doc_inherited__", False) def test_doc_inherit_special(wrapped_cls): _check_doc(wrapped_cls.static, BaseChild.static) _check_doc(wrapped_cls.clsmtd, BaseChild.clsmtd) def test_doc_inherit_props(wrapped_cls): assert type(wrapped_cls.method) == type(BaseChild.method) # noqa: E721 _check_doc(wrapped_cls.prop, BaseChild.prop) _check_doc(wrapped_cls.F, BaseChild.F) def test_doc_inherit_prop_builder(): def builder(name): return property(lambda self: name) class Parent: prop = builder("Parent") @modin.utils._inherit_docstrings(Parent) class Child(Parent): prop = builder("Child") assert Parent().prop == "Parent" assert Child().prop == "Child" @pytest.mark.parametrize( "source_doc,to_append,expected", [ ( "One-line doc.", "One-line message.", "One-line doc.One-line message.", ), ( """ Regular doc-string With the setted indent style. """, """ Doc-string having different indents in comparison with the regular one. """, """ Regular doc-string With the setted indent style. Doc-string having different indents in comparison with the regular one. """, ), ], ) def test_append_to_docstring(source_doc, to_append, expected): def source_fn(): pass source_fn.__doc__ = source_doc result_fn = modin.utils.append_to_docstring(to_append)(source_fn) answer = dedent(result_fn.__doc__) expected = dedent(expected) assert answer == expected def test_align_indents(): source = """ Source string that sets the indent pattern.""" target = indent(source, " " * 5) result = modin.utils.align_indents(source, target) assert source == result def test_format_string(): template = """ Source template string that has some {inline_placeholder}s. 
Placeholder1: {new_line_placeholder1} Placeholder2: {new_line_placeholder2} Placeholder3: {new_line_placeholder3} Placeholder4: {new_line_placeholder4}Text text: Placeholder5: {new_line_placeholder5} """ singleline_value = "Single-line value" multiline_value = """ Some string Having different indentation From the source one.""" multiline_value_new_line_at_the_end = multiline_value + "\n" multiline_value_new_line_at_the_begin = "\n" + multiline_value expected = """ Source template string that has some Single-line values. Placeholder1: Some string Having different indentation From the source one. Placeholder2: Single-line value Placeholder3: Some string Having different indentation From the source one. Placeholder4: Some string Having different indentation From the source one. Text text: Placeholder5: Some string Having different indentation From the source one. """ # noqa: W293 answer = modin.utils.format_string( template, inline_placeholder=singleline_value, new_line_placeholder1=multiline_value, new_line_placeholder2=singleline_value, new_line_placeholder3=multiline_value_new_line_at_the_begin, new_line_placeholder4=multiline_value_new_line_at_the_end, new_line_placeholder5=multiline_value, ) assert answer == expected def warns_that_defaulting_to_pandas_if( condition: bool, prefix: Optional[str] = None, suffix: Optional[str] = None ): """ Get a context manager that checks for a default to pandas warning if `condition` is True. Parameters ---------- condition : bool Whether to check for the default to pandas warning. prefix : Optional[str] If specified, checks that the start of the warning message matches this argument before "[Dd]efaulting to pandas". suffix : Optional[str] If specified, checks that the end of the warning message matches this argument after "[Dd]efaulting to pandas". 
def warns_that_defaulting_to_pandas(prefix=None, suffix=None):
    """
    Assert that code warns that it's defaulting to pandas.

    Parameters
    ----------
    prefix : Optional[str]
        If specified, checks that the start of the warning message matches this argument
        before "[Dd]efaulting to pandas". Interpreted as a regular expression.
    suffix : Optional[str]
        If specified, checks that the end of the warning message matches this argument
        after "[Dd]efaulting to pandas". Interpreted as a regular expression.

    Returns
    -------
    pytest.recwarn.WarningsChecker
        Context manager checking for a ``UserWarning`` whose message matches
        the pattern built from `prefix`, "[Dd]efaulting to pandas" and `suffix`.
    """
    # Parts of the message may be separated by newlines, which "." alone doesn't match.
    separator = "(.|\\n)+"
    match = "[Dd]efaulting to pandas"
    if prefix:
        # The prefix has to precede the "defaulting to pandas" part of the message.
        # Previously the value of `prefix` was ignored and only a wildcard was
        # appended *after* the phrase, so the prefix was never actually checked.
        match = prefix + separator + match
    if suffix:
        match += separator + suffix
    return pytest.warns(UserWarning, match=match)
def test_assert_dtypes_equal():
    """Verify that `assert_dtypes_equal` from test utils works correctly (raises an error when it has to)."""
    from modin.tests.pandas.utils import assert_dtypes_equal

    # Series with equal dtypes
    sr1, sr2 = pd.Series([1.0]), pandas.Series([1.0])
    assert sr1.dtype == sr2.dtype == "float"
    assert_dtypes_equal(sr1, sr2)  # shouldn't raise an error since dtypes are equal

    # Series with different dtypes belonging to the same class
    sr1 = sr1.astype("int")
    assert sr1.dtype != sr2.dtype and sr1.dtype == "int"
    assert_dtypes_equal(sr1, sr2)  # shouldn't raise an error since both are numeric

    # Series with different dtypes not belonging to the same class
    sr2 = sr2.astype("str")
    assert sr1.dtype != sr2.dtype and sr2.dtype == "object"
    with pytest.raises(AssertionError):
        assert_dtypes_equal(sr1, sr2)

    # Dfs with equal dtypes
    df1, df2 = create_test_dfs({"a": [1], "b": [1.0]})
    assert_dtypes_equal(df1, df2)  # shouldn't raise an error since dtypes are equal

    # Dfs with different dtypes belonging to the same class
    df1 = df1.astype({"a": "float"})
    assert df1.dtypes["a"] != df2.dtypes["a"]
    assert_dtypes_equal(df1, df2)  # shouldn't raise an error since both are numeric

    # Dfs with different dtypes
    df2 = df2.astype("str")
    with pytest.raises(AssertionError):
        # This case has to compare the dataframes; it used to re-check
        # `sr1, sr2`, leaving the dataframe path untested.
        assert_dtypes_equal(df1, df2)

    # Dfs with categorical dtypes
    df1 = df1.astype("category")
    df2 = df2.astype("category")
    assert_dtypes_equal(df1, df2)  # shouldn't raise an error since both are categorical

    # Dfs with different dtypes (categorical and str)
    df1 = df1.astype({"a": "str"})
    with pytest.raises(AssertionError):
        assert_dtypes_equal(df1, df2)
def current_execution_is_native() -> bool:
    """
    Tell whether the current global execution mode is native.

    Returns
    -------
    bool
        True only when both the ``StorageFormat`` and the ``Engine`` config
        values are equal to ``"Native"``.
    """
    if StorageFormat.get() != "Native":
        return False
    return Engine.get() == "Native"
See the License for the specific language # governing permissions and limitations under the License. """Collection of general utility functions, mostly for internal use.""" import codecs import functools import importlib import inspect import json import os import re import sys import types import warnings from pathlib import Path from textwrap import dedent, indent from typing import ( Any, Callable, Iterable, List, Mapping, Optional, Protocol, TypeVar, Union, runtime_checkable, ) import numpy as np import pandas from packaging import version from pandas._typing import JSONSerializable from pandas.util._decorators import Appender # type: ignore from pandas.util._print_versions import ( # type: ignore[attr-defined] _get_dependency_info, _get_sys_info, ) from modin._version import get_versions from modin.config import DocModule, Engine, StorageFormat # Similar to pandas, sentinel value to use as kwarg in place of None when None has # special meaning and needs to be distinguished from a user explicitly passing None. 
def _make_api_url(token: str) -> str:
    """
    Build the URL of a pandas API documentation page.

    Parameters
    ----------
    token : str
        Value substituted into ``PANDAS_API_URL_TEMPLATE``.

    Returns
    -------
    str
        ``PANDAS_API_URL_TEMPLATE`` with `token` filled in.

    Notes
    -----
    Kept as a separate function for better testability.
    """
    url = str.format(PANDAS_API_URL_TEMPLATE, token)
    return url
def format_string(template: str, **kwargs: str) -> str:
    """
    Insert passed values at the corresponding placeholders of the specified template.

    In contrast with the regular ``str.format()`` this function computes proper
    indents for the placeholder values.

    Parameters
    ----------
    template : str
        Template to substitute values in.
    **kwargs : dict
        Dictionary that maps placeholder names with values.

    Returns
    -------
    str
        Formatted string.

    Notes
    -----
    Only values whose placeholder is the first non-space token of a template line
    are re-indented; such values are dedented first and lose one leading newline.
    A value that is empty after that (e.g. ``"\\n"`` or spaces only) is substituted
    as an empty string.
    """
    # We want to change indentation only for those values which placeholders are located
    # at the start of the line, in that case the placeholder sets an indentation
    # that the filling value has to obey.
    # RegExp determining placeholders located at the beginning of the line.
    regex = r"^( *)\{(\w+)\}"
    for line in template.splitlines():
        if line.strip() == "":
            continue
        match = re.search(regex, line)
        if match is None:
            continue
        nspaces = len(match.group(1))
        key = match.group(2)

        value = kwargs.get(key)
        if not value:
            continue
        value = dedent(value)
        # Since placeholder is located at the beginning of a new line,
        # it already has '\n' before it, so to avoid double new lines
        # we want to discard the first leading '\n' at the value line,
        # the others leading '\n' are considered as being put on purpose
        if value.startswith("\n"):
            value = value[1:]
        if not value:
            # Nothing is left to indent (the value was whitespace-only);
            # indexing into the empty string below would raise IndexError.
            kwargs[key] = value
            continue
        # `.splitlines()` doesn't preserve last empty line,
        # so we have to restore it further
        value_lines = value.splitlines()
        padding = " " * nspaces
        # We're not indenting the first line of the value, since it's already indented
        # properly because of the placeholder indentation.
        # `indent()` leaves whitespace-only lines untouched on its own.
        indented_lines = [indent(value_line, padding) for value_line in value_lines[1:]]
        # If necessary, restoring the last line dropped by `.splitlines()`
        if value.endswith("\n"):
            indented_lines.append(padding)

        kwargs[key] = "\n".join([value_lines[0], *indented_lines])

    return template.format(**kwargs)
Returns ------- callable """ def decorator(func: Fn) -> Fn: to_append = align_indents(func.__doc__ or "", message) return Appender(to_append)(func) return decorator def _replace_doc( source_obj: object, target_obj: object, overwrite: bool, apilink: Optional[Union[str, List[str]]], parent_cls: Optional[Fn] = None, attr_name: Optional[str] = None, ) -> None: """ Replace docstring in `target_obj`, possibly taking from `source_obj` and augmenting. Can append the link to pandas API online documentation. Parameters ---------- source_obj : object Any object from which to take docstring from. target_obj : object The object which docstring to replace. overwrite : bool Forces replacing the docstring with the one from `source_obj` even if `target_obj` has its own non-empty docstring. apilink : str | List[str], optional If non-empty, insert the link(s) to pandas API documentation. Should be the prefix part in the URL template, e.g. "pandas.DataFrame". parent_cls : class, optional If `target_obj` is an attribute of a class, `parent_cls` should be that class. This is used for generating the API URL as well as for handling special cases like `target_obj` being a property or a cached_property. attr_name : str, optional Gives the name to `target_obj` if it's an attribute of `parent_cls`. Needed to handle some special cases and in most cases could be determined automatically. 
""" if isinstance(target_obj, (staticmethod, classmethod)): # we cannot replace docs on decorated objects, we must replace them # on original functions instead target_obj = target_obj.__func__ source_doc = source_obj.__doc__ or "" target_doc = target_obj.__doc__ or "" overwrite = overwrite or not target_doc doc = source_doc if overwrite else target_doc if doc == "": # Empty docstrings do not need to be inherited return if parent_cls and not attr_name: if isinstance(target_obj, property): attr_name = target_obj.fget.__name__ # type: ignore[union-attr] elif isinstance(target_obj, functools.cached_property): attr_name = target_obj.func.__name__ elif isinstance(target_obj, (staticmethod, classmethod)): attr_name = target_obj.__func__.__name__ else: attr_name = target_obj.__name__ # type: ignore[attr-defined] if ( source_doc.strip() and apilink and "pandas API documentation for " not in target_doc and (not (attr_name or "").startswith("_")) ): apilink_l = [apilink] if not isinstance(apilink, list) and apilink else apilink links = [] for link in apilink_l: if attr_name: token = f"{link}.{attr_name}" else: token = link url = _make_api_url(token) links.append(f"`{token} <{url}>`_") indent_line = " " * _get_indent(doc) notes_section = f"\n{indent_line}Notes\n{indent_line}-----\n" url_line = f"{indent_line}See pandas API documentation for {', '.join(links)} for more.\n" notes_section_with_url = notes_section + url_line if notes_section in doc: doc = doc.replace(notes_section, notes_section_with_url) else: doc += notes_section_with_url if parent_cls and isinstance(target_obj, property): if overwrite: target_obj.fget.__doc_inherited__ = True # type: ignore[union-attr] assert attr_name is not None setattr( parent_cls, attr_name, property(target_obj.fget, target_obj.fset, target_obj.fdel, doc), ) elif parent_cls and isinstance(target_obj, functools.cached_property): if overwrite: target_obj.func.__doc_inherited__ = True # type: ignore[attr-defined] assert attr_name is not None 
target_obj.func.__doc__ = doc setattr( parent_cls, attr_name, functools.cached_property(target_obj.func), ) # otherwise: `TypeError: Cannot use cached_property instance without calling __set_name__ on it.` getattr(parent_cls, attr_name).__set_name__(parent_cls, attr_name) else: if overwrite: target_obj.__doc_inherited__ = True # type: ignore[attr-defined] target_obj.__doc__ = doc # This is a map from objects whose docstrings we are overriding to functions that # take a DocModule string and override the docstring according to the # DocModule. When we update DocModule, we can use this map to update all # inherited docstrings. _docstring_inheritance_calls: list[Callable[[str], None]] = [] # This is a set of (class, attribute_name) pairs whose docstrings we have # already replaced since we last updated DocModule. Note that we don't store # the attributes themselves since we replace property attributes instead of # modifying them in place: # https://github.com/modin-project/modin/blob/e9dbcc127913db77473a83936e8b6bb94ef84f0d/modin/utils.py#L353 _attributes_with_docstrings_replaced: set[tuple[type, str]] = set() def _documentable_obj(obj: object) -> bool: """ Check whether we can replace the docstring of `obj`. Parameters ---------- obj : object Object whose docstring we want to replace. Returns ------- bool Whether we can replace the docstring. """ return bool( callable(obj) and not inspect.isclass(obj) or (isinstance(obj, property) and obj.fget) or (isinstance(obj, functools.cached_property)) or (isinstance(obj, (staticmethod, classmethod)) and obj.__func__) ) def _update_inherited_docstrings(doc_module: DocModule) -> None: """ Update all inherited docstrings. Parameters ---------- doc_module : DocModule The current DocModule. 
def _inherit_docstrings_in_place(
    cls_or_func: Fn,
    doc_module: str,
    parent: object,
    excluded: List[object],
    overwrite_existing: bool = False,
    apilink: Optional[Union[str, List[str]]] = None,
) -> None:
    """
    Replace `cls_or_func` docstrings with `parent` docstrings in place.

    Parameters
    ----------
    cls_or_func : Fn
        The class or function whose docstrings we need to update.
    doc_module : str
        The docs module.
    parent : object
        Parent object from which the decorated object inherits __doc__.
    excluded : list
        List of parent objects from which the class does not inherit docstrings.
    overwrite_existing : bool, default: False
        Allow overwriting docstrings that already exist in
        the decorated class.
    apilink : str | List[str], optional
        If non-empty, insert the link(s) to pandas API documentation.
        Should be the prefix part in the URL template, e.g. "pandas.DataFrame".

    Notes
    -----
    When `doc_module` provides a replacement for `parent`, the replacement is used
    as the documentation source, the API link is dropped and existing docstrings
    are always overwritten.
    If `cls_or_func` is not a plain function, attributes of every class in its MRO
    (except ``object``) are processed as well, and each processed ``(class, attribute)``
    pair is recorded in the module-level ``_attributes_with_docstrings_replaced`` set.
    """
    # Import the docs module and get the class (e.g. `DataFrame`).
    imported_doc_module = importlib.import_module(doc_module)
    # Set the default parent so we can use it in case some docs are missing from
    # parent module.
    default_parent = parent
    # Try to get the parent object from the doc module, and if it isn't there,
    # get it from parent instead. We only do this if we are overriding pandas
    # documentation. We don't touch other docs.
    if doc_module != DocModule.default and "pandas" in str(
        getattr(parent, "__module__", "")
    ):
        parent_name = (
            # DocModule should use the class BasePandasDataset to override the
            # docstrings of BasePandasDataset, even if BasePandasDataset
            # normally inherits docstrings from a different `parent`.
            "BasePandasDataset"
            if getattr(cls_or_func, "__name__", "") == "BasePandasDataset"
            # For other classes, override docstrings with the class that has the
            # same name as the `parent` class, e.g. DataFrame inherits
            # docstrings from doc_module.DataFrame.
            else getattr(parent, "__name__", "")
        )
        # Falls back to the original `parent` if the docs module has no such name.
        parent = getattr(imported_doc_module, parent_name, parent)
    if parent != default_parent:
        # Reset API link in case the docs are overridden.
        apilink = None
        overwrite_existing = True

    # Top-level docstring of the class or function itself.
    if parent not in excluded:
        _replace_doc(parent, cls_or_func, overwrite_existing, apilink)

    # Plain functions have no attributes to process; everything else is
    # expected to provide `__mro__`.
    if not isinstance(cls_or_func, types.FunctionType):
        # Attribute names already visited earlier in the MRO walk, so that the
        # most derived definition of an attribute is the only one considered.
        seen = set()
        for base in cls_or_func.__mro__:  # type: ignore[attr-defined]
            if base is object:
                continue
            for attr, obj in base.__dict__.items():
                # only replace docstrings once to prevent https://github.com/modin-project/modin/issues/7113
                if attr in seen or (base, attr) in _attributes_with_docstrings_replaced:
                    continue
                seen.add(attr)
                if hasattr(obj, "_wrapped_superclass_method"):
                    # If this method originally comes from a superclass, we get
                    # docstrings directly from the wrapped superclass method
                    # rather than inheriting docstrings from the usual parent.
                    # For example, for BasePandasDataset and Series, the behavior is:
                    # - If Series inherits a method from BasePandasDataset, then
                    #   it gets the docstring from that method in BasePandasDataset.
                    # - If Series overrides a method or defines its own method
                    #   that's not present in BasePandasDataset, it follows the usual
                    #   inheritance hierarchy of `parent` and `default_parent`.
                    parent_obj = obj._wrapped_superclass_method
                else:
                    # Try to get the attribute from the docs class first, then
                    # from the default parent (pandas), and if it's not in either,
                    # set `parent_obj` to `None`.
                    parent_obj = getattr(
                        parent, attr, getattr(default_parent, attr, None)
                    )
                # Skip excluded sources and anything whose docstring cannot be
                # replaced on either side (this also covers `parent_obj` being `None`).
                if (
                    parent_obj in excluded
                    or not _documentable_obj(parent_obj)
                    or not _documentable_obj(obj)
                ):
                    continue

                _replace_doc(
                    parent_obj,
                    obj,
                    overwrite_existing,
                    apilink,
                    parent_cls=base,
                    attr_name=attr,
                )

                # Remember the pair so that later calls do not process it again.
                _attributes_with_docstrings_replaced.add((base, attr))
def expanduser_path_arg(argname: str) -> Callable[[Fn], Fn]:
    """
    Build a decorator that expands "~" in the path argument of the decorated function.

    Parameters
    ----------
    argname : str
        Name of the argument holding the path to be expanded.

    Returns
    -------
    callable
        Decorator which performs the replacement.

    Notes
    -----
    Only strings starting with "~" and ``pathlib.Path`` objects are expanded;
    any other value (including a missing or falsy one) is passed through as is.
    """

    def decorator(func: Fn) -> Fn:
        func_signature = inspect.signature(func)
        declared = func_signature.parameters.get(argname)
        assert (
            getattr(declared, "name", None) == argname
        ), f"Function {func} does not take '{argname}' as argument"

        @functools.wraps(func)
        def wrapped(*args: tuple, **kw: dict) -> Any:
            bound = func_signature.bind(*args, **kw)
            path_value = bound.arguments.get(argname, None)
            if not path_value:
                # The argument was omitted or is falsy: forward the call untouched.
                return func(*args, **kw)
            if isinstance(path_value, str) and path_value.startswith("~"):
                bound.arguments[argname] = os.path.expanduser(path_value)
            elif isinstance(path_value, Path):
                bound.arguments[argname] = path_value.expanduser()
            return func(*bound.args, **bound.kwargs)

        return wrapped  # type: ignore[return-value]

    return decorator
def hashable(obj: object) -> bool:
    """
    Return whether the `obj` is hashable.

    Parameters
    ----------
    obj : object
        The object to check.

    Returns
    -------
    bool
        True if ``hash(obj)`` succeeds, False if it raises ``TypeError``.
    """
    # Fast path: unhashable types (``list``, ``dict``, ...) have ``__hash__`` set
    # to ``None`` on the class. ``hasattr(obj, "__hash__")`` can't be used for this,
    # because the attribute exists even when its value is ``None``.
    if getattr(type(obj), "__hash__", None) is None:
        return False
    # Otherwise, we may still need to check for type errors, as in the case of `hash(([],))`.
    # (e.g. an unhashable object inside a tuple)
    try:
        hash(obj)
    except TypeError:
        return False
    return True
def execute(*objs: Iterable[Any]) -> None:
    """
    Run ``_query_compiler.execute()`` for every passed object that has a query compiler.

    Objects without a ``_query_compiler`` attribute are silently skipped.

    Parameters
    ----------
    *objs : Iterable[Any]
        A collection of objects to trigger lazy computations.
    """
    for candidate in objs:
        if hasattr(candidate, "_query_compiler"):
            candidate._query_compiler.execute()
def import_optional_dependency(name: str, message: str) -> types.ModuleType:
    """
    Import an optional dependency.

    Parameters
    ----------
    name : str
        The module name.
    message : str
        Additional text to include in the ImportError message.

    Returns
    -------
    module : ModuleType
        The imported module.

    Raises
    ------
    ImportError
        If the module cannot be imported. The original exception context
        is suppressed.
    """
    try:
        module = importlib.import_module(name)
    except ImportError:
        install_hint = f"Use pip or conda to install {name}."
        raise ImportError(
            f"Missing optional dependency '{name}'. {message} " + install_hint
        ) from None
    return module
def show_versions(as_json: Union[str, bool] = False) -> None:
    """
    Provide useful information, important for bug reports.

    It comprises info about hosting operating system, pandas version,
    and versions of other installed relative packages.

    Parameters
    ----------
    as_json : str or bool, default: False
        * If False, outputs info in a human readable form to the console.
        * If str, it will be considered as a path to a file.
          Info will be written to that file in JSON format.
        * If True, outputs info in JSON format to the console.

    Notes
    -----
    This is mostly a copy of pandas.show_versions() but adds separate listing
    of Modin-specific dependencies.
    """
    sys_info = _get_sys_info()
    sys_info["commit"] = get_versions()["full-revisionid"]
    modin_deps = _get_modin_deps_info()
    deps = _get_dependency_info()

    if as_json:
        report = {
            "system": sys_info,
            "modin dependencies": modin_deps,
            "dependencies": deps,
        }
        if as_json is True:
            # JSON to the console
            sys.stdout.writelines(json.dumps(report, indent=2))
            return
        # Otherwise `as_json` is a path of the file to write JSON to
        assert isinstance(as_json, str)  # needed for mypy
        with codecs.open(as_json, "wb", encoding="utf8") as json_file:
            json.dump(report, json_file, indent=2)
        return

    # Human readable report
    assert isinstance(sys_info["LOCALE"], dict)  # needed for mypy
    locale_info = sys_info["LOCALE"]
    language_code = locale_info["language-code"]
    encoding = locale_info["encoding"]
    sys_info["LOCALE"] = f"{language_code}.{encoding}"

    # Width of the name column: the longest dependency name of both listings
    maxlen = max(max(map(len, section)) for section in (deps, modin_deps))
    print("\nINSTALLED VERSIONS\n------------------")  # noqa: T201
    for key, value in sys_info.items():
        print(f"{key:<{maxlen}}: {value}")  # noqa: T201
    for name, section in (("Modin", modin_deps), ("pandas", deps)):
        print(f"\n{name} dependencies\n{'-' * (len(name) + 13)}")  # noqa: T201
        for key, value in section.items():
            print(f"{key:<{maxlen}}: {value}")  # noqa: T201
def reload_modin() -> None:
    """
    Reload all previously imported Modin modules.

    The call to this function is required
    if an execution engine has been shut down and
    is going to be started up once again.

    Notes
    -----
    Only the ``modin`` package and its submodules (``modin.*``) are reloaded.
    Other packages whose names merely start with "modin" are left untouched.
    """
    # Iterate over a snapshot: reloading a module may import others and
    # therefore mutate `sys.modules` while we are looping.
    for name, module in sys.modules.copy().items():
        if name == "modin" or name.startswith("modin."):
            importlib.reload(module)
environment-dev.yml. # We include the ray and dask dependencies here because we want to test # switching dataframe backends to ray or dask. - ray-core>=2.10.0,<3 # workaround for https://github.com/conda/conda/issues/11744 - grpcio!=1.45.* - grpcio!=1.46.* - dask>=2.22.0 - pyarrow>=10.0.1 - xarray>=2022.12.0 - jinja2>=3.1.2 - scipy>=1.10.0 - s3fs>=2022.11.0 - lxml>=4.9.2 - openpyxl>=3.1.0 - xlrd>=2.0.1 - matplotlib>=3.6.3 - sqlalchemy>=2.0.0 - pandas-gbq>=0.19.0 - pytables>=3.8.0 # pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429 - pymssql>=2.1.5,!=2.2.8 - psycopg2>=2.9.6 - fastparquet>=2022.12.0 - tqdm>=4.60.0 - numexpr>=2.8.4 # dependencies for making release - pygithub>=v1.58.0 - pygit2>=1.9.2 # test dependencies - coverage>=7.1.0 - moto>=4.1.0 - pytest>=7.3.2 - pytest-cov>=4.0.0 - pytest-xdist>=3.2.0 - typing_extensions # code linters - black>=24.1.0 - flake8>=6.0.0 - flake8-no-implicit-concat>=0.3.4 - flake8-print>=5.0.0 - mypy>=1.0.0 - pandas-stubs>=2.0.0 - pip: # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - connectorx>=0.2.6a4 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.6.0 ================================================ FILE: requirements/env_unidist_win.yml ================================================ name: modin_on_unidist channels: - conda-forge dependencies: - pip # required dependencies - pandas>=2.2,<2.4 - numpy>=1.22.4 - unidist-mpi>=0.2.1 - msmpi - fsspec>=2022.11.0 - packaging>=21.0 - psutil>=5.8.0 # optional dependencies # NOTE Keep the ray and dask dependencies in sync with the Linux Unidist # environment and the general environment-dev.yml. # We include the ray and dask dependencies here because we want to test # switching dataframe backends to ray or dask. 
- ray-core>=2.10.0,<3 # workaround for https://github.com/conda/conda/issues/11744 - grpcio!=1.45.* - grpcio!=1.46.* - dask>=2.22.0 - pyarrow>=10.0.1 - xarray>=2022.12.0 - jinja2>=3.1.2 - scipy>=1.10.0 - s3fs>=2022.11.0 - lxml>=4.9.2 - openpyxl>=3.1.0 - xlrd>=2.0.1 - matplotlib>=3.6.3 - sqlalchemy>=2.0.0 - pandas-gbq>=0.19.0 - pytables>=3.8.0 # pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429 - pymssql>=2.1.5,!=2.2.8 - psycopg2>=2.9.6 - fastparquet>=2022.12.0 - tqdm>=4.60.0 - numexpr>=2.8.4 # dependencies for making release - pygithub>=v1.58.0 - pygit2>=1.9.2 # test dependencies - coverage>=7.1.0 - moto>=4.1.0 - pytest>=7.3.2 - pytest-cov>=4.0.0 - pytest-xdist>=3.2.0 - typing_extensions # code linters - black>=24.1.0 - flake8>=6.0.0 - flake8-no-implicit-concat>=0.3.4 - flake8-print>=5.0.0 - mypy>=1.0.0 - pandas-stubs>=2.0.0 - pip: - dataframe-api-compat>=0.2.7 # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - connectorx>=0.2.6a4 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. 
- numpydoc==1.6.0 ================================================ FILE: requirements/requirements-no-engine.yml ================================================ channels: - conda-forge dependencies: - pip # required dependencies - pandas>=2.2,<2.4 - numpy>=1.22.4 - fsspec>=2022.11.0 - packaging>=21.0 - psutil>=5.8.0 # optional dependencies - pyarrow>=10.0.1 - xarray>=2022.12.0 - jinja2>=3.1.2 - scipy>=1.10.0 - s3fs>=2022.11.0 - lxml>=4.9.2 - openpyxl>=3.1.0 - xlrd>=2.0.1 - matplotlib>=3.6.3 - sqlalchemy>=2.0.0 - pandas-gbq>=0.19.0 - pytables>=3.8.0 - tqdm>=4.60.0 - numexpr>=2.8.4 # dependencies for making release - pygithub>=v1.58.0 - pygit2>=1.9.2 # test dependencies - coverage>=7.1.0 - moto>=4.1.0 - pytest>=7.3.2 - pytest-cov>=4.0.0 - pytest-xdist>=3.2.0 - typing_extensions # code linters - black>=24.1.0 - flake8>=6.0.0 - flake8-no-implicit-concat>=0.3.4 - flake8-print>=5.0.0 - pip: - dataframe-api-compat>=0.2.7 - asv==0.5.1 # no conda package for windows - connectorx>=0.2.6a4 # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. 
- numpydoc==1.6.0 ================================================ FILE: requirements-dev.txt ================================================ ## required dependencies pandas>=2.2,<2.4 numpy>=1.22.4 fsspec>=2022.11.0 packaging>=21.0 psutil>=5.8.0 ## optional dependencies ray>=2.10.0,<3 pyarrow>=10.0.1 dask[complete]>=2.22.0 distributed>=2.22.0 xarray>=2022.12.0 Jinja2>=3.1.2 scipy>=1.10.0 s3fs>=2022.11.0 lxml>=4.9.2 openpyxl>=3.1.0 xlrd>=2.0.1 matplotlib>=3.6.3 sqlalchemy>=2.0.0 pandas-gbq>=0.19.0 tables>=3.7.0 # pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429 pymssql>=2.1.5,!=2.2.8 # psycopg devs recommend the other way of installation for production # but this is ok for testing and development psycopg2-binary>=2.9.3 connectorx>=0.2.6a4 fastparquet>=2022.12.0 flask-cors tqdm>=4.60.0 numexpr>=2.8.4 # Latest modin-spreadsheet with widget fix git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 dataframe-api-compat>=0.2.7 ## dependencies for making release PyGithub>=1.58.0 pygit2>=1.9.2 ## test dependencies asv==0.5.1 coverage>=7.1.0 fuzzydata>=0.0.11 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. numpydoc==1.1.0 moto>=4.1.0 pytest>=7.3.2 pytest-benchmark>=4.0.0 pytest-cov>=4.0.0 pytest-xdist>=3.2.0 typing_extensions ## code linters black>=24.1.0 flake8>=6.0.0 flake8-no-implicit-concat>=0.3.4 flake8-print>=5.0.0 mypy>=1.0.0 pandas-stubs>=2.0.0 isort>=5.12 ================================================ FILE: scripts/__init__.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. ================================================ FILE: scripts/doc_checker.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. """ Validate docstrings using pydocstyle and numpydoc. Example usage: python scripts/doc_checker.py asv_bench/benchmarks/utils.py modin/pandas """ import argparse import ast import functools import inspect import logging import os import pathlib import re import shutil import subprocess import sys from typing import List from numpydoc.docscrape import NumpyDocString, get_doc_object from numpydoc.validate import Validator # Let the other modules to know that the doc checker is running. 
def get_optional_args(doc: Validator) -> dict:
    """
    Collect the parameters of the checked object that have default values.

    Parameters
    ----------
    doc : numpydoc.validate.Validator
        Validator handler.

    Returns
    -------
    dict
        Mapping of parameter name to its default value; empty for classes
        and for objects that are not callable.
    """
    target = doc.obj
    # Classes are callable too, but their signature is not inspected here.
    if inspect.isclass(target) or not callable(target):
        return {}

    defaults = {}
    for name, parameter in inspect.signature(target).parameters.items():
        if parameter.default is inspect.Parameter.empty:
            continue
        defaults[name] = parameter.default
    return defaults
def check_spelling_words(doc: Validator) -> list:
    """
    Check spelling of chosen words in doc.

    Parameters
    ----------
    doc : numpydoc.validate.Validator
        Validator handler.

    Returns
    -------
    list
        List of tuples with Modin error code and its description.

    Notes
    -----
    Any special words enclosed in apostrophes(") are treated as python string
    constants and are not checked for spelling.
    """
    if not doc.raw_doc:
        return []
    components = {
        "Modin",
        "pandas",
        "NumPy",
        "Ray",
        "Dask",
        "PyArrow",
        "XGBoost",
        "Plasma",
    }
    # lower-cased spelling -> reference spelling
    references = {x.lower(): x for x in components}
    check_words = "|".join(references)

    # comments work only with re.VERBOSE
    pattern = r"""
        (?:                     # non-capturing group
            [^-\\\w\/]          # any symbol except: '-', '\', '/' and any from [a-zA-Z0-9_]
            | ^                 # or line start
        )
        ({check_words})         # words to check, example - "modin|pandas|numpy"
        (?:                     # non-capturing group
            [^-"\.\/\w\\]       # any symbol except: '-', '"', '.', '\', '/' and any from [a-zA-Z0-9_]
            | \.\s              # or '.' and any whitespace
            | \.$               # or '.' and line end
            | $                 # or line end
        )
    """.format(
        check_words=check_words
    )
    # For every docstring line: the matched words that are NOT spelled as the reference.
    results = [
        set(re.findall(pattern, line, re.I | re.VERBOSE)) - components
        for line in doc.raw_doc.splitlines()
    ]

    # Locate the line where the docstring opens. If no triple-quote line is found,
    # fall back to the definition line instead of failing later on `None + int`.
    docstring_start_line = doc.source_file_def_line
    for idx, line in enumerate(inspect.getsourcelines(doc.code_obj)[0]):
        if '"""' in line or "'''" in line:
            docstring_start_line = doc.source_file_def_line + idx
            break

    errors = []
    for line_idx, words_in_line in enumerate(results):
        for word in words_in_line:
            errors.append(
                (
                    "MD02",
                    MODIN_ERROR_CODES["MD02"].format(
                        line=docstring_start_line + line_idx,
                        word=word,
                        reference=references[word.lower()],
                    ),
                )
            )
    return errors
def get_noqa_checks(doc: Validator) -> list:
    """
    Get codes after `# noqa`.

    Parameters
    ----------
    doc : numpydoc.validate.Validator
        Validator handler.

    Returns
    -------
    list
        List with codes.

    Notes
    -----
    If noqa doesn't have any codes - returns ["all"].
    """
    source = doc.method_source
    if not source:
        return []

    noqa_str = ""
    if not inspect.ismodule(doc.obj):
        # find last line of obj definition
        for line in source.split("\n"):
            if ")" in line and ":" in line.split(")", 1)[1]:
                noqa_str = line
                break
    else:
        # noqa string is defined as the first line before the docstring
        if not doc.raw_doc:
            # noqa string is meaningless if there is no docstring in module
            return []
        lines = source.split("\n")
        for idx, line in enumerate(lines):
            if '"""' in line or "'''" in line:
                # When the docstring opens on the very first line there is no
                # preceding line; `lines[-1]` would wrongly pick the LAST line
                # of the module, so treat this case as "no noqa".
                if idx > 0:
                    noqa_str = lines[idx - 1]
                break

    if "# noqa:" in noqa_str:
        noqa_checks = noqa_str.split("# noqa:", 1)[1].split(",")
    elif "# noqa" in noqa_str:
        noqa_checks = ["all"]
    else:
        noqa_checks = []
    return [check.strip() for check in noqa_checks]
""" from numpydoc.validate import validate errors = [] doc = construct_validator(import_path) if ( getattr(doc.obj, "__doc_inherited__", False) or ( isinstance(doc.obj, property) and getattr(doc.obj.fget, "__doc_inherited__", False) ) or ( isinstance(doc.obj, functools.cached_property) and getattr(doc.obj.func, "__doc_inherited__", False) ) ): # do not check inherited docstrings return errors results = validate(import_path) results = validate_modin_error(doc, results) noqa_checks = get_noqa_checks(doc) for err_code, err_desc in results["errors"]: if ( err_code not in NUMPYDOC_BASE_ERROR_CODES and err_code not in MODIN_ERROR_CODES ) or skip_check_if_noqa(doc, err_code, noqa_checks): continue errors.append( ":".join([import_path, str(results["file_line"]), err_code, err_desc]) ) return errors def numpydoc_validate(path: pathlib.Path) -> bool: """ Perform numpydoc checks. Parameters ---------- path : pathlib.Path Filename or directory path for check. Returns ------- is_successfull : bool Return True if all checks are successful. 
def pydocstyle_validate(
    path: pathlib.Path, add_ignore: List[str], use_numpydoc: bool
) -> bool:
    """
    Perform pydocstyle checks.

    Parameters
    ----------
    path : pathlib.Path
        Filename or directory path for check.
    add_ignore : List[str]
        `pydocstyle` error codes which are not verified.
        The passed list is not modified.
    use_numpydoc : bool
        Disable duplicate `pydocstyle` checks if `numpydoc` is in use.

    Returns
    -------
    bool
        Return True if all pydocstyle checks are successful.

    Raises
    ------
    ValueError
        If the `pydocstyle` executable is not found in PATH.
    """
    pydocstyle = "pydocstyle"
    if not shutil.which(pydocstyle):
        raise ValueError(f"{pydocstyle} not found in PATH")

    # Work on a copy: this function is called once per path with the same
    # list object, so extending it in place would leak codes into the caller
    # and accumulate duplicates on every call.
    ignored = list(add_ignore)
    # These check can be done with numpydoc tool, so disable them for pydocstyle.
    if use_numpydoc:
        ignored.extend(["D100", "D101", "D102", "D103", "D104", "D105"])

    result = subprocess.run(
        [
            pydocstyle,
            "--convention",
            "numpy",
            "--add-ignore",
            ",".join(ignored),
            str(path),
        ],
        text=True,
        capture_output=True,
    )
    if result.returncode:
        logging.info(f"PYDOCSTYLE OUTPUT FOR {path}")
        logging.error(result.stdout)
        logging.error(result.stderr)
    return result.returncode == 0
def check_args(args: argparse.Namespace) -> None:
    """
    Check the obtained values for correctness.

    Parameters
    ----------
    args : argparse.Namespace
        Parser arguments.

    Raises
    ------
    ValueError
        Occurs in case of non-existent files or directories,
        or when a path is located outside of this repository.
    """
    # `os.path.join(path, "")` appends a trailing separator (if missing), so a
    # sibling directory that merely shares the prefix (e.g. "<repo>-old") is
    # not mistaken for a path inside the repository.
    repo_prefix = os.path.join(MODIN_PATH, "")
    for path in args.paths:
        if not path.exists():
            raise ValueError(f"{path} does not exist")
        abs_path = os.path.abspath(path)
        if abs_path != MODIN_PATH and not abs_path.startswith(repo_prefix):
            raise ValueError(
                "it is unsupported to use this script on files from another "
                + f"repository; script' repo '{MODIN_PATH}', "
                + f"input path '{abs_path}'"
            )
if __name__ == "__main__":
    args = get_args()
    # Patch optional/heavy dependencies before any Modin module gets imported
    # by the validators.
    monkeypatching()
    if not validate(args.paths, args.add_ignore, not args.disable_numpydoc):
        logging.error("INVALID DOCUMENTATION FOUND")
        # `sys.exit` rather than the builtin `exit`: the latter is injected by the
        # `site` module for interactive use and is missing when Python runs with `-S`.
        sys.exit(1)
    logging.info("SUCCESSFUL CHECK")
class GitWrapper:
    """Thin helper around the `pygit2` repository that contains this script."""

    # Prefix under which git stores tag references.
    _TAG_PREFIX = "refs/tags/"

    def __init__(self):
        self.repo = pygit2.Repository(Path(__file__).parent)

    def is_on_main(self):
        """Return whether HEAD is the same reference as the local `main` branch."""
        return self.repo.references["refs/heads/main"] == self.repo.head

    @staticmethod
    def __get_tag_version(entry):
        """
        Parse the version encoded in a tag reference name.

        Returns a `packaging.version.Version` on success, or a plain string
        when the tag name is not a valid version.
        """
        # Cut the prefix explicitly. `str.lstrip("refs/tags/")` strips any leading
        # characters from that *set*, so it could also eat the beginning of the
        # tag name itself (e.g. a tag starting with "r", "s" or "t").
        prefix = GitWrapper._TAG_PREFIX
        tag_name = entry[len(prefix) :] if entry.startswith(prefix) else entry
        try:
            return version.parse(tag_name)
        except version.InvalidVersion as ex:
            # NOTE(review): the placeholder string is empty in this view of the file;
            # callers only test `isinstance(..., version.Version)`, so any str works.
            return f''

    def get_previous_release(self, rel_type):
        """
        Find the latest released tag preceding the release being prepared.

        `rel_type` is either "minor" (only tags with a zero micro part are
        considered) or "patch". Returns a tuple of the tag reference name,
        the object the tag peels to, and the parsed version.

        Raises `ValueError` if no suitable release tag exists.
        """
        tags = [
            (entry, self.__get_tag_version(entry))
            for entry in self.repo.references
            if entry.startswith(self._TAG_PREFIX)
        ]
        # filter away legacy versions (which aren't following the proper naming schema);
        # also skip pre-releases
        tags = [
            (entry, ver)
            for entry, ver in tags
            if isinstance(ver, version.Version) and not ver.pre
        ]
        if rel_type == "minor":
            # leave only minor releases
            tags = [(entry, ver) for entry, ver in tags if ver.micro == 0]
        else:
            assert rel_type == "patch"
        if not tags:
            # `max()` on an empty list raises an opaque ValueError; be explicit instead.
            raise ValueError(
                f"No previous {rel_type} release tag found; did you forget to pull tags?"
            )
        prev_ref, prev_ver = max(tags, key=lambda pair: pair[1])
        return prev_ref, self.repo.references[prev_ref].peel(), prev_ver

    def get_commits_upto(self, stop_commit):
        """
        Collect commits walked from HEAD until `stop_commit` (exclusive).

        Raises `ValueError` if `stop_commit` is never reached during the walk.
        """
        history = []
        for obj in self.repo.walk(self.repo.head.target):
            if obj.id == stop_commit.id:
                break
            history.append(obj)
        else:
            raise ValueError("Current HEAD is not derived from previous release")
        return history

    def ensure_title_link(self, obj: pygit2.Commit):
        """Return the commit title, appending the short id unless it ends with "(#<number>)"."""
        title = obj.message.splitlines()[0]
        if not re.match(r".*\(#(\d+)\)$", title):
            title += f" ({obj.short_id})"
        return title
) return 1 if not history: sys.stderr.write(f"No commits since {prev_ver} found, nothing to generate!\n") return 1 titles = collections.defaultdict(list) people = set() email2commit, email2pr = {}, {} for obj in history: title = obj.message.splitlines()[0] titles[title.split("-")[0]].append(obj) new_people = set( re.findall( r"(?:(?:Signed-off-by|Co-authored-by):\s*)([\w\s,]+?)\s*<([^>]+)>", obj.message, ) ) for _, email in new_people: email2pr[email] = obj.id people |= new_people email2commit[obj.author.email] = obj.id sys.stderr.write(f"Found {len(history)} commit(s) since {prev_ref}\n") sys.stderr.write("Resolving contributors...\n") user_resolver = GithubUserResolver(email2commit, args.token) logins, unknowns = user_resolver.resolve(people) new_logins, unknowns = user_resolver.resolve_by_reviews(unknowns, email2pr) logins |= new_logins sys.stderr.write(f"Found {len(logins)} GitHub usernames.\n") if unknowns: sys.stderr.write( f"Warning! Failed to resolve {len(unknowns)} usernames, please resolve them manually!\n" ) sections = [ ("Stability and Bugfixes", "FIX"), ("Performance enhancements", "PERF"), ("Refactor Codebase", "REFACTOR"), ("Update testing suite", "TEST"), ("Documentation improvements", "DOCS"), ("New Features", "FEAT"), ] notes = rf"""Modin {next_ver} Key Features and Updates Since {prev_ver} -------------------------------{'-' * len(str(prev_ver))} """ def _add_section(section, prs): nonlocal notes if prs: notes += f"* {section}\n" notes += "\n".join( [ f" * {wrapper.ensure_title_link(obj)}" for obj in sorted(prs, key=lambda obj: obj.message) ] ) notes += "\n" for section, key in sections: _add_section(section, titles.pop(key, None)) uncategorized = sum(titles.values(), []) _add_section("Uncategorized improvements", uncategorized) notes += r""" Contributors ------------ """ notes += "\n".join(f"@{login}" for login in sorted(logins)) + "\n" notes += ( "\n".join( f" {name} <{email}>" for name, email in sorted(unknowns) ) + "\n" ) 
def main():
    """Parse the command line, run the selected sub-command and exit with its result."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--token",
        type=str,
        default="",
        help="GitHub token for queries (optional, bumps up rate limit)",
    )
    # Without a sub-command `func` stays at this default, which prints the usage.
    parser.set_defaults(func=lambda _: parser.print_usage())

    commands = parser.add_subparsers()
    notes_parser = commands.add_parser("notes", help="Generate release notes")
    notes_parser.set_defaults(func=make_notes)

    parsed = parser.parse_args()
    exit_status = parsed.func(parsed)
    sys.exit(exit_status)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. # noqa: MD02 """Function examples for docstring testing.""" class weakdict(dict): # noqa: GL08 __slots__ = ("__weakref__",) def optional_square(number: int = 5) -> int: # noqa """ Square `number`. The function from Modin. Parameters ---------- number : int Some number. Notes ----- The `optional_square` Modin function from modin/scripts/examples.py. """ return number**2 def optional_square_empty_parameters(number: int = 5) -> int: """ Parameters ---------- """ return number**2 def square_summary(number: int) -> int: # noqa: PR01, GL08 """ Square `number`. See https://github.com/ray-project/ray. Examples -------- The function that will never be used in modin.pandas.DataFrame same as in pandas or NumPy. """ return number**2 ================================================ FILE: scripts/test/test_doc_checker.py ================================================ # Licensed to Modin Development Team under one or more contributor license agreements. # See the NOTICE file distributed with this work for additional information regarding # copyright ownership. The Modin Development Team licenses this file to you under the # Apache License, Version 2.0 (the "License"); you may not use this file except in # compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. 
@pytest.mark.parametrize(
    "import_path, result",
    [
        ("scripts.test.examples.optional_square", {"number": 5}),
        ("scripts.test.examples.optional_square_empty_parameters", {"number": 5}),
        ("scripts.test.examples.square_summary", {}),
        ("scripts.test.examples.weakdict", {}),
        ("scripts.test.examples", {}),
    ],
)
def test_get_optional_args(import_path, result):
    """Check that `get_optional_args` returns the expected mapping for each example."""
    validator = construct_validator(import_path)
    assert get_optional_args(validator) == result


@pytest.mark.parametrize(
    "import_path, result",
    [
        (
            "scripts.test.examples.optional_square",
            [
                (
                    "MD01",
                    MODIN_ERROR_CODES["MD01"].format(parameter="number", found="int"),
                )
            ],
        ),
        ("scripts.test.examples.optional_square_empty_parameters", []),
        ("scripts.test.examples.square_summary", []),
        ("scripts.test.examples.weakdict", []),
        ("scripts.test.examples", []),
    ],
)
def test_check_optional_args(import_path, result):
    """Check that `check_optional_args` reports exactly the expected errors."""
    validator = construct_validator(import_path)
    assert check_optional_args(validator) == result


@pytest.mark.parametrize(
    "import_path, result",
    [
        ("scripts.test.examples.optional_square", []),
        (
            "scripts.test.examples.square_summary",
            [
                ("MD02", 57, "Pandas", "pandas"),
                ("MD02", 57, "Numpy", "NumPy"),
            ],
        ),
        ("scripts.test.examples.optional_square_empty_parameters", []),
        ("scripts.test.examples.weakdict", []),
        ("scripts.test.examples", []),
    ],
)
def test_check_spelling_words(import_path, result):
    """
    Check that every error reported by `check_spelling_words` is an expected one.

    Notes
    -----
    The assertion is one-directional: each reported error must be among the
    expected ones, but expected errors that are not reported do not fail the test.
    """
    expected_errors = [
        (
            code,
            MODIN_ERROR_CODES[code].format(line=line, word=word, reference=reference),
        )
        for code, line, word, reference in result
    ]
    reported_errors = check_spelling_words(construct_validator(import_path))
    # the order of incorrect words found on the same line is not guaranteed
    for reported in reported_errors:
        assert reported in expected_errors
@pytest.mark.parametrize(
    "import_path, result",
    [
        ("scripts.test.examples.optional_square", ["all"]),
        ("scripts.test.examples.optional_square_empty_parameters", []),
        ("scripts.test.examples.square_summary", ["PR01", "GL08"]),
        ("scripts.test.examples.weakdict", ["GL08"]),
        ("scripts.test.examples", ["MD02"]),
    ],
)
def test_get_noqa_checks(import_path, result):
    """Check the list of ``noqa`` checks that `get_noqa_checks` returns for each example."""
    validator = construct_validator(import_path)
    assert get_noqa_checks(validator) == result
from setuptools import find_packages, setup

import versioneer

# The README doubles as the long description of the distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Optional dependency groups, exposed below through ``extras_require``.
dask_deps = ["dask>=2.22.0", "distributed>=2.22.0"]
ray_deps = ["ray>=2.10.0,<3", "pyarrow>=10.0.1"]
mpi_deps = ["unidist[mpi]>=0.2.1"]
consortium_standard_deps = ["dataframe-api-compat>=0.2.7"]
spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
# Currently, Modin does not include `mpi` option in `all`.
# Otherwise, installation of modin[all] would fail because
# users need to have a working MPI implementation and
# certain software installed beforehand.
all_deps = [
    *dask_deps,
    *ray_deps,
    *spreadsheet_deps,
    *consortium_standard_deps,
]

# Distribute 'modin-autoimport-pandas.pth' along with binary and source distributions.
# This file provides the "import pandas before Ray init" feature if specific
# environment variable is set (see https://github.com/modin-project/modin/issues/4564).
cmdclass = versioneer.get_cmdclass()
extra_files = ["modin-autoimport-pandas.pth"]


class AddPthFileBuild(cmdclass["build_py"]):
    """``build_py`` command that also ships the files listed in `extra_files`."""

    def _get_data_files(self):
        inherited = super()._get_data_files() or []
        # NOTE(review): the tuple presumably follows build_py's
        # (package, src_dir, build_dir, filenames) layout -- confirm against setuptools.
        pth_entry = (".", ".", self.build_lib, extra_files)
        return inherited + [pth_entry]


class AddPthFileSDist(cmdclass["sdist"]):
    """``sdist`` command that adds `extra_files` to the list of distributed files."""

    def make_distribution(self):
        self.filelist.extend(extra_files)
        return super().make_distribution()


cmdclass["build_py"] = AddPthFileBuild
cmdclass["sdist"] = AddPthFileSDist

install_requires = [
    "pandas>=2.2,<2.4",
    "packaging>=21.0",
    "numpy>=1.22.4",
    "fsspec>=2022.11.0",
    "psutil>=5.8.0",
    "typing-extensions",
]

extras_require = {
    # can be installed by pip install modin[dask]
    "dask": dask_deps,
    "ray": ray_deps,
    "mpi": mpi_deps,
    "consortium-standard": consortium_standard_deps,
    "spreadsheet": spreadsheet_deps,
    "all": all_deps,
}

setup(
    name="modin",
    version=versioneer.get_version(),
    cmdclass=cmdclass,
    description="Modin: Make your pandas code run faster by changing one line of code.",
    packages=find_packages(exclude=["scripts", "scripts.*"]),
    include_package_data=True,
    license="Apache 2",
    url="https://github.com/modin-project/modin",
    long_description=long_description,
    long_description_content_type="text/markdown",
    install_requires=install_requires,
    extras_require=extras_require,
    python_requires=">=3.9",
)
# NOTE: bare expressions such as ``data.head()`` are kept on purpose: this is a
# stress-test script and each of them exercises a DataFrame operation.
warnings.filterwarnings("ignore")
data = pd.read_csv("column_2C_weka.csv")
print(plt.style.available)  # look at available plot styles
plt.style.use("ggplot")
data.head()
data.info()
data.describe()

# --- Exploratory plots -------------------------------------------------------
color_list = ["red" if i == "Abnormal" else "green" for i in data.loc[:, "class"]]
pd.plotting.scatter_matrix(
    data.loc[:, data.columns != "class"],
    c=color_list,
    figsize=[15, 15],
    diagonal="hist",
    alpha=0.5,
    s=200,
    marker="*",
    edgecolor="black",
)
plt.show()
sns.countplot(x="class", data=data)
data.loc[:, "class"].value_counts()

# --- K-nearest neighbours ----------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
x, y = data.loc[:, data.columns != "class"], data.loc[:, "class"]
knn.fit(x, y)
prediction = knn.predict(x)
print("Prediction: {}".format(prediction))
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
knn = KNeighborsClassifier(n_neighbors=3)
x, y = data.loc[:, data.columns != "class"], data.loc[:, "class"]
knn.fit(x_train, y_train)
prediction = knn.predict(x_test)
print("With KNN (K=3) accuracy is: ", knn.score(x_test, y_test))  # accuracy

# Train/test accuracy for every neighbour count in ``neig``.
neig = np.arange(1, 25)
train_accuracy = []
test_accuracy = []
for k in neig:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    train_accuracy.append(knn.score(x_train, y_train))
    test_accuracy.append(knn.score(x_test, y_test))
plt.figure(figsize=[13, 8])
plt.plot(neig, test_accuracy, label="Testing Accuracy")
plt.plot(neig, train_accuracy, label="Training Accuracy")
plt.legend()
plt.title("-value VS Accuracy")
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.xticks(neig)
plt.savefig("graph.png")
plt.show()
print(
    "Best accuracy is {} with K = {}".format(
        np.max(test_accuracy), 1 + test_accuracy.index(np.max(test_accuracy))
    )
)

# --- Linear regression on the "Abnormal" rows --------------------------------
# The class labels are "Abnormal"/"Normal" (see ``color_list`` above), so the
# filter has to use the full label; comparing against "A" selects no rows.
data1 = data[data["class"] == "Abnormal"]
x = np.array(data1.loc[:, "pelvic_incidence"]).reshape(-1, 1)
y = np.array(data1.loc[:, "sacral_slope"]).reshape(-1, 1)
plt.figure(figsize=[10, 10])
plt.scatter(x=x, y=y)
plt.xlabel("pelvic_incidence")
plt.ylabel("sacral_slope")
plt.show()
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
predict_space = np.linspace(min(x), max(x)).reshape(-1, 1)
reg.fit(x, y)
predicted = reg.predict(predict_space)
print("R^2 score: ", reg.score(x, y))
plt.plot(predict_space, predicted, color="black", linewidth=3)
plt.scatter(x=x, y=y)
plt.xlabel("pelvic_incidence")
plt.ylabel("sacral_slope")
plt.show()
from sklearn.model_selection import cross_val_score

reg = LinearRegression()
k = 5
cv_result = cross_val_score(reg, x, y, cv=k)  # uses R^2 as score
print("CV Scores: ", cv_result)
print("CV scores average: ", np.sum(cv_result) / k)

# --- Regularised regression --------------------------------------------------
from sklearn.linear_model import Ridge

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.3)
# NOTE(review): ``normalize`` may not be accepted by recent scikit-learn
# releases -- confirm against the version used for the stress tests.
ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(x_train, y_train)
ridge_predict = ridge.predict(x_test)
print("Ridge score: ", ridge.score(x_test, y_test))
from sklearn.linear_model import Lasso

x = np.array(
    data1.loc[
        :,
        [
            "pelvic_incidence",
            "pelvic_tilt numeric",
            "lumbar_lordosis_angle",
            "pelvic_radius",
        ],
    ]
)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=3, test_size=0.3)
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(x_train, y_train)
ridge_predict = lasso.predict(x_test)
print("Lasso score: ", lasso.score(x_test, y_test))
print("Lasso coefficients: ", lasso.coef_)

# --- Random forest and confusion matrix --------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

x, y = data.loc[:, data.columns != "class"], data.loc[:, "class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
rf = RandomForestClassifier(random_state=4)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix: \n", cm)
print("Classification report: \n", classification_report(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# --- Logistic regression and ROC curve ---------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

data["class_binary"] = [1 if i == "Abnormal" else 0 for i in data.loc[:, "class"]]
x, y = (
    data.loc[:, (data.columns != "class") & (data.columns != "class_binary")],
    data.loc[:, "class_binary"],
)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred_prob = logreg.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], "k--")
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.show()

# --- Hyper-parameter search --------------------------------------------------
from sklearn.model_selection import GridSearchCV

grid = {"n_neighbors": np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, grid, cv=3)  # GridSearchCV
knn_cv.fit(x, y)  # Fit
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_))
print("Best score: {}".format(knn_cv.best_score_))
param_grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=12
)
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid, cv=3)
logreg_cv.fit(x_train, y_train)
print("Tuned hyperparameters : {}".format(logreg_cv.best_params_))
print("Best Accuracy: {}".format(logreg_cv.best_score_))

# --- One-hot encoding and an SVM pipeline ------------------------------------
data = pd.read_csv("column_2C_weka.csv")
df = pd.get_dummies(data)
df.head(10)
df.drop("class_Normal", axis=1, inplace=True)
df.head(10)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

steps = [("scalar", StandardScaler()), ("SVM", SVC())]
pipeline = Pipeline(steps)
parameters = {"SVM__C": [1, 10, 100], "SVM__gamma": [0.1, 0.01]}
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
cv = GridSearchCV(pipeline, param_grid=parameters, cv=3)
cv.fit(x_train, y_train)
y_pred = cv.predict(x_test)
print("Accuracy: {}".format(cv.score(x_test, y_test)))
print("Tuned Model Parameters: {}".format(cv.best_params_))

# --- K-means clustering ------------------------------------------------------
data = pd.read_csv("column_2C_weka.csv")
plt.scatter(data["pelvic_radius"], data["degree_spondylolisthesis"])
plt.xlabel("pelvic_radius")
plt.ylabel("degree_spondylolisthesis")
plt.show()
data2 = data.loc[:, ["degree_spondylolisthesis", "pelvic_radius"]]
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2)
kmeans.fit(data2)
labels = kmeans.predict(data2)
plt.scatter(data["pelvic_radius"], data["degree_spondylolisthesis"], c=labels)
plt.xlabel("pelvic_radius")
# The second label belongs to the y axis (it used to overwrite the x label).
plt.ylabel("degree_spondylolisthesis")
plt.show()
df = pd.DataFrame({"labels": labels, "class": data["class"]})
ct = pd.crosstab(df["labels"], df["class"])
print(ct)

# Inertia for 1..7 clusters. Only the cluster counts that are actually fitted
# are stored and plotted, so no uninitialised ``np.empty`` slot reaches the plot.
cluster_counts = range(1, 8)
inertia_list = np.empty(len(cluster_counts))
for position, n_clusters in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(data2)
    inertia_list[position] = kmeans.inertia_
plt.plot(cluster_counts, inertia_list, "-o")
plt.xlabel("Number of cluster")
plt.ylabel("Inertia")
plt.show()

# --- Scaled K-means, hierarchical clustering and t-SNE -----------------------
data = pd.read_csv("column_2C_weka.csv")
data3 = data.drop("class", axis=1)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
kmeans = KMeans(n_clusters=2)
pipe = make_pipeline(scalar, kmeans)
pipe.fit(data3)
labels = pipe.predict(data3)
df = pd.DataFrame({"labels": labels, "class": data["class"]})
ct = pd.crosstab(df["labels"], df["class"])
print(ct)
from scipy.cluster.hierarchy import dendrogram, linkage

merg = linkage(data3.iloc[200:220, :], method="single")
dendrogram(merg, leaf_rotation=90, leaf_font_size=6)
plt.show()
from sklearn.manifold import TSNE

model = TSNE(learning_rate=100)
transformed = model.fit_transform(data2)
x = transformed[:, 0]
y = transformed[:, 1]
plt.scatter(x, y, c=color_list)
plt.xlabel("pelvic_radius")
# The second label belongs to the y axis (it used to overwrite the x label).
plt.ylabel("degree_spondylolisthesis")
plt.show()
def detect_outliers(df, n, features):
    """
    Find rows that are outliers in more than `n` of the given columns.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of its column.

    Parameters
    ----------
    df : DataFrame
        Frame to inspect.
    n : int
        A row is reported only when it is an outlier in strictly more than
        `n` columns.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first occurrence.
    """
    outlier_counts = Counter()
    for col in features:
        values = df[col]
        # Quartiles must ignore missing values: with plain ``np.percentile`` a
        # single NaN turns both quartiles into NaN, every comparison below
        # becomes False and the column silently contributes no outliers.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)
    return [label for label, hits in outlier_counts.items() if hits > n]
# NOTE: bare expressions such as ``dataset.isnull().sum()`` are kept on purpose:
# this is a stress-test script and each of them exercises a DataFrame operation.

# --- Outlier removal and joining of the train/test sets ----------------------
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
train.loc[Outliers_to_drop]  # Show the outliers rows
train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
train_len = len(train)
# ``concat`` takes the frames through its ``objs`` parameter.
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()
train.info()
train.isnull().sum()
train.head()
train.dtypes
train.describe()

# --- Numerical features ------------------------------------------------------
g = sns.heatmap(
    train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)
g = sns.factorplot(
    x="SibSp", y="Survived", data=train, kind="bar", size=6, palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("survival probability")
g = sns.factorplot(
    x="Parch", y="Survived", data=train, kind="bar", size=6, palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("survival probability")
dataset["Fare"].isnull().sum()
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())
g = sns.distplot(
    dataset["Fare"], color="m", label="Skewness : %.2f" % (dataset["Fare"].skew())
)
g = g.legend(loc="best")
# Log-transform the fare; non-positive fares are mapped to 0.
dataset["Fare"] = dataset["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
g = sns.distplot(
    dataset["Fare"], color="b", label="Skewness : %.2f" % (dataset["Fare"].skew())
)
g = g.legend(loc="best")

# --- Categorical features ----------------------------------------------------
g = sns.barplot(x="Sex", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")
train[["Sex", "Survived"]].groupby("Sex").mean()
g = sns.factorplot(
    x="Pclass", y="Survived", data=train, kind="bar", size=6, palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("survival probability")
g = sns.factorplot(
    x="Pclass", y="Survived", hue="Sex", data=train, size=6, kind="bar", palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("survival probability")
dataset["Embarked"].isnull().sum()
dataset["Embarked"] = dataset["Embarked"].fillna("S")
g = sns.factorplot(
    x="Embarked", y="Survived", data=train, size=6, kind="bar", palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("survival probability")
g = sns.factorplot(
    "Pclass", col="Embarked", data=train, size=6, kind="count", palette="muted"
)
g.despine(left=True)
g = g.set_ylabels("Count")

# --- Filling missing ages ----------------------------------------------------
g = sns.factorplot(y="Age", x="Sex", data=dataset, kind="box")
g = sns.factorplot(y="Age", x="Sex", hue="Pclass", data=dataset, kind="box")
g = sns.factorplot(y="Age", x="Parch", data=dataset, kind="box")
g = sns.factorplot(y="Age", x="SibSp", data=dataset, kind="box")
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female": 1})
g = sns.heatmap(
    dataset[["Age", "Sex", "SibSp", "Parch", "Pclass"]].corr(), cmap="BrBG", annot=True
)
index_NaN_age = list(dataset["Age"][dataset["Age"].isnull()].index)
for i in index_NaN_age:
    # The overall median is recomputed on every iteration because the column
    # is being modified inside this loop.
    age_med = dataset["Age"].median()
    # Median age of the rows sharing SibSp, Parch and Pclass with row ``i``.
    age_pred = dataset["Age"][
        (
            (dataset["SibSp"] == dataset.iloc[i]["SibSp"])
            & (dataset["Parch"] == dataset.iloc[i]["Parch"])
            & (dataset["Pclass"] == dataset.iloc[i]["Pclass"])
        )
    ].median()
    # NOTE(review): chained assignment -- confirm it writes through to
    # ``dataset`` with the pandas/Modin versions in use.
    if not np.isnan(age_pred):
        dataset["Age"].iloc[i] = age_pred
    else:
        dataset["Age"].iloc[i] = age_med
g = sns.factorplot(x="Survived", y="Age", data=train, kind="box")
g = sns.factorplot(x="Survived", y="Age", data=train, kind="violin")

# --- Feature engineering: title ----------------------------------------------
dataset["Name"].head()
# The title is the text between the first comma and the following period.
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()
g = sns.countplot(x="Title", data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)
dataset["Title"] = dataset["Title"].replace(
    [
        "Lady",
        "the Countess",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
dataset["Title"] = dataset["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)
dataset["Title"] = dataset["Title"].astype(int)
g = sns.countplot(dataset["Title"])
g = g.set_xticklabels(["Master", "Miss/Ms/Mme/Mlle/Mrs", "Mr", "Rare"])
g = sns.factorplot(x="Title", y="Survived", data=dataset, kind="bar")
g = g.set_xticklabels(["Master", "Miss-Mrs", "Mr", "Rare"])
g = g.set_ylabels("survival probability")
dataset.drop(labels=["Name"], axis=1, inplace=True)

# --- Feature engineering: family size ----------------------------------------
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1
g = sns.factorplot(x="Fsize", y="Survived", data=dataset)
g = g.set_ylabels("Survival Probability")
dataset["Single"] = dataset["Fsize"].map(lambda s: 1 if s == 1 else 0)
dataset["SmallF"] = dataset["Fsize"].map(lambda s: 1 if s == 2 else 0)
dataset["MedF"] = dataset["Fsize"].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset["LargeF"] = dataset["Fsize"].map(lambda s: 1 if s >= 5 else 0)
g = sns.factorplot(x="Single", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="SmallF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="MedF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="LargeF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")
dataset.head()

# --- Feature engineering: cabin ----------------------------------------------
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()
dataset["Cabin"][dataset["Cabin"].notnull()].head()
# Keep only the first character of the cabin; missing cabins become "X".
dataset["Cabin"] = pd.Series(
    [i[0] if not pd.isnull(i) else "X" for i in dataset["Cabin"]]
)
g = sns.countplot(dataset["Cabin"], order=["A", "B", "C", "D", "E", "F", "G", "T", "X"])
g = sns.factorplot(
    y="Survived",
    x="Cabin",
    data=dataset,
    kind="bar",
    order=["A", "B", "C", "D", "E", "F", "G", "T", "X"],
)
g = g.set_ylabels("Survival Probability")
dataset = pd.get_dummies(dataset, columns=["Cabin"], prefix="Cabin")

# --- Feature engineering: ticket ---------------------------------------------
dataset["Ticket"].head()
# Take the ticket prefix (dots and slashes removed); purely numeric tickets
# are replaced with "X".
Ticket = [
    i.replace(".", "").replace("/", "").strip().split(" ")[0]
    if not i.isdigit()
    else "X"
    for i in dataset.Ticket
]
dataset["Ticket"] = Ticket
dataset["Ticket"].head()
dataset = pd.get_dummies(dataset, columns=["Ticket"], prefix="T")
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns=["Pclass"], prefix="Pc")
dataset.drop(labels=["PassengerId"], axis=1, inplace=True)
dataset.head()

# --- Split back into train and test ------------------------------------------
train = dataset[:train_len]
test = dataset[train_len:]
test.drop(labels=["Survived"], axis=1, inplace=True)
train["Survived"] = train["Survived"].astype(int)
Y_train = train["Survived"]
X_train = train.drop(labels=["Survived"], axis=1)

# --- Cross-validated comparison of classifiers -------------------------------
kfold = StratifiedKFold(n_splits=10)
random_state = 2
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(
        DecisionTreeClassifier(random_state=random_state),
        random_state=random_state,
        learning_rate=0.1,
    ),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state=random_state),
    LinearDiscriminantAnalysis(),
]
cv_results = [
    cross_val_score(
        classifier, X_train, y=Y_train, scoring="accuracy", cv=kfold, n_jobs=4
    )
    for classifier in classifiers
]
cv_means = [cv_result.mean() for cv_result in cv_results]
cv_std = [cv_result.std() for cv_result in cv_results]
# The names below follow the order of ``classifiers``.
cv_res = pd.DataFrame(
    {
        "CrossValMeans": cv_means,
        "CrossValerrors": cv_std,
        "Algorithm": [
            "SVC",
            "DecisionTree",
            "AdaBoost",
            "RandomForest",
            "ExtraTrees",
            "GradientBoosting",
            "MultipleLayerPerceptron",
            "KNeighboors",
            "LogisticRegression",
            "LinearDiscriminantAnalysis",
        ],
    }
)
g = sns.barplot(
    "CrossValMeans",
    "Algorithm",
    data=cv_res,
    palette="Set3",
    orient="h",
    **{"xerr": cv_std}
)
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")

# --- Hyper-parameter tuning: AdaBoost ----------------------------------------
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {
    "base_estimator__criterion": ["gini", "entropy"],
    "base_estimator__splitter": ["best", "random"],
    "algorithm": ["SAMME", "SAMME.R"],
    "n_estimators": [1, 2],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5],
}
gsadaDTC = GridSearchCV(
    adaDTC, param_grid=ada_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1
)
gsadaDTC.fit(X_train, Y_train)
ada_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_

# --- Hyper-parameter tuning: ExtraTrees --------------------------------------
ExtC = ExtraTreesClassifier()
ex_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}
gsExtC = GridSearchCV(
    ExtC, param_grid=ex_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1
)
gsExtC.fit(X_train, Y_train)
ExtC_best = gsExtC.best_estimator_
gsExtC.best_score_

# --- Hyper-parameter tuning: RandomForest ------------------------------------
RFC = RandomForestClassifier()
rf_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}
gsRFC = GridSearchCV(
    RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1
)
gsRFC.fit(X_train, Y_train)
RFC_best = gsRFC.best_estimator_
gsRFC.best_score_

# --- Hyper-parameter tuning: GradientBoosting --------------------------------
GBC = GradientBoostingClassifier()
gb_param_grid = {
    "loss": ["deviance"],
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": [4, 8],
    "min_samples_leaf": [100, 150],
    "max_features": [0.3, 0.1],
}
gsGBC = GridSearchCV(
    GBC, param_grid=gb_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1
)
gsGBC.fit(X_train, Y_train)
GBC_best = gsGBC.best_estimator_
gsGBC.best_score_

# --- Hyper-parameter tuning: SVC ---------------------------------------------
SVMC = SVC(probability=True)
svc_param_grid = {
    "kernel": ["rbf"],
    "gamma": [0.001, 0.01, 0.1, 1],
    "C": [1, 10, 50, 100, 200, 300, 1000],
}
gsSVMC = GridSearchCV(
    SVMC, param_grid=svc_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1
)
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    ylim=None,
    cv=None,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Plot the training and cross-validation learning curves of `estimator`.

    A new figure is opened, `learning_curve` is evaluated with the given
    `cv`, `n_jobs` and `train_sizes`, and for both score sets the mean along
    axis 1 is drawn as a line with a band of one standard deviation around it.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to `learning_curve`.
    title : str
        Title of the figure.
    X, y : array-like
        Data and target forwarded to `learning_curve`.
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when given.
    cv, n_jobs, train_sizes
        Forwarded to `learning_curve` unchanged.

    Returns
    -------
    module
        The ``plt`` object the figure was drawn with.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    # (mean, std, colour, legend label) for the training and validation curves.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (validation_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Bands first, then lines, so the drawing calls keep their original order.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, "o-", color=colour, label=label)
    plt.legend(loc="best")
    return plt
in range(ncols): name = names_classifiers[nclassifier][0] classifier = names_classifiers[nclassifier][1] indices = np.argsort(classifier.feature_importances_)[::-1][:40] g = sns.barplot( y=X_train.columns[indices][:40], x=classifier.feature_importances_[indices][:40], orient="h", ax=axes[row][col], ) g.set_xlabel("Relative importance", fontsize=12) g.set_ylabel("Features", fontsize=12) g.tick_params(labelsize=9) g.set_title(name + " feature importance") nclassifier += 1 test_Survived_RFC = pd.Series(RFC_best.predict(test), name="RFC") test_Survived_ExtC = pd.Series(ExtC_best.predict(test), name="ExtC") test_Survived_SVMC = pd.Series(SVMC_best.predict(test), name="SVC") test_Survived_AdaC = pd.Series(ada_best.predict(test), name="Ada") test_Survived_GBC = pd.Series(GBC_best.predict(test), name="GBC") ensemble_results = pd.concat( [ test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC, test_Survived_GBC, test_Survived_SVMC, ], axis=1, ) g = sns.heatmap(ensemble_results.corr(), annot=True) votingC = VotingClassifier( estimators=[ ("rfc", RFC_best), ("extc", ExtC_best), ("svc", SVMC_best), ("adac", ada_best), ("gbc", GBC_best), ], voting="soft", n_jobs=4, ) votingC = votingC.fit(X_train, Y_train) test_Survived = pd.Series(votingC.predict(test), name="Survived") results = pd.concat([IDtest, test_Survived], axis=1) results.to_csv("ensemble_python_voting.csv", index=False) ================================================ FILE: stress_tests/kaggle/kaggle13.py ================================================ #!/usr/bin/env python import matplotlib matplotlib.use("PS") import warnings # current version of seaborn generates a bunch of warnings that we'll ignore import modin.pandas as pd warnings.filterwarnings("ignore") import matplotlib.pyplot as plt import seaborn as sns sns.set(style="white", color_codes=True) iris = pd.read_csv("Iris.csv") # the iris dataset is now a Pandas DataFrame iris.head() iris["Species"].value_counts() iris.plot(kind="scatter", 
x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
sns.FacetGrid(iris, hue="Species", size=5).map(
    plt.scatter, "SepalLengthCm", "SepalWidthCm"
).add_legend()
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(
    x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray"
)
sns.violinplot(x="Species", y="PetalLengthCm", data=iris, size=6)
sns.FacetGrid(iris, hue="Species", size=6).map(
    sns.kdeplot, "PetalLengthCm"
).add_legend()
iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
# The multivariate plotting helpers live in ``pandas.plotting``; the former
# ``pandas.tools.plotting`` module no longer exists.
from pandas.plotting import andrews_curves

andrews_curves(iris.drop("Id", axis=1), "Species")
from pandas.plotting import parallel_coordinates

parallel_coordinates(iris.drop("Id", axis=1), "Species")
from pandas.plotting import radviz

radviz(iris.drop("Id", axis=1), "Species")
================================================
FILE: stress_tests/kaggle/kaggle14.py
================================================
import matplotlib

# Select the "PS" matplotlib backend before pyplot is imported.
matplotlib.use("PS")
import matplotlib.pyplot as plt
import seaborn as sns

import modin.pandas as pd

plt.style.use("fivethirtyeight")
import warnings

warnings.filterwarnings("ignore")
data = pd.read_csv("train.csv")
data.head()
data.isnull().sum()  # checking for total null values
data.groupby(["Sex", "Survived"])["Survived"].count()
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data[["Sex", "Survived"]].groupby(["Sex"]).mean().plot.bar(ax=ax[0])
ax[0].set_title("Survived vs Sex")
sns.countplot("Sex", hue="Survived", data=data, ax=ax[1])
ax[1].set_title("Sex:Survived vs Dead")
plt.show()
pd.crosstab(data.Pclass, data.Survived, margins=True).style.background_gradient(
    cmap="summer_r"
)
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data["Pclass"].value_counts().plot.bar(
    color=["#CD7F32", "#FFDF00", "#D3D3D3"], ax=ax[0]
)
ax[0].set_title("Number Of Passengers By Pclass")
ax[0].set_ylabel("Count")
sns.countplot("Pclass", hue="Survived", data=data, ax=ax[1])
ax[1].set_title("Pclass:Survived vs Dead")
plt.show()
pd.crosstab(
    [data.Sex, data.Survived], data.Pclass, margins=True
).style.background_gradient(cmap="summer_r")
sns.factorplot("Pclass", "Survived", hue="Sex", data=data)
plt.show()
print("Oldest Passenger was of:", data["Age"].max(), "Years")
print("Youngest Passenger was of:", data["Age"].min(), "Years")
print("Average Age on the ship:", data["Age"].mean(), "Years")
f, ax = plt.subplots(1, 2, figsize=(18, 8))
sns.violinplot("Pclass", "Age", hue="Survived", data=data, split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot("Sex", "Age", hue="Survived", data=data, split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()
data["Initial"] = 0
# Extract the salutation, i.e. the letters directly in front of a ".".
# The extraction does not depend on the column being visited, so it is done
# once instead of once per column of ``data``.
data["Initial"] = data.Name.str.extract(
    r"([A-Za-z]+)\."  # noqa: W605
)  # lets extract the Salutations
pd.crosstab(data.Initial, data.Sex).T.style.background_gradient(
    cmap="summer_r"
)  # Checking the Initials with the Sex
# Map rare or misspelled salutations onto the common ones. The result is
# assigned back explicitly rather than relying on an in-place update made
# through the ``data["Initial"]`` selection.
data["Initial"] = data["Initial"].replace(
    [
        "Mlle",
        "Mme",
        "Ms",
        "Dr",
        "Major",
        "Lady",
        "Countess",
        "Jonkheer",
        "Col",
        "Rev",
        "Capt",
        "Sir",
        "Don",
    ],
    [
        "Miss",
        "Miss",
        "Miss",
        "Mr",
        "Mr",
        "Mrs",
        "Mrs",
        "Other",
        "Other",
        "Other",
        "Mr",
        "Mr",
        "Mr",
    ],
)
data.groupby("Initial")["Age"].mean()  # lets check the average age by Initials
# Fill missing ages with a fixed value per salutation.
data.loc[(data.Age.isnull()) & (data.Initial == "Mr"), "Age"] = 33
data.loc[(data.Age.isnull()) & (data.Initial == "Mrs"), "Age"] = 36
data.loc[(data.Age.isnull()) & (data.Initial == "Master"), "Age"] = 5
data.loc[(data.Age.isnull()) & (data.Initial == "Miss"), "Age"] = 22
data.loc[(data.Age.isnull()) & (data.Initial == "Other"), "Age"] = 46
data.Age.isnull().any()  # So no null values left finally
f, ax = plt.subplots(1, 2, figsize=(20, 10))
data[data["Survived"] == 0].Age.plot.hist(
    ax=ax[0], bins=20, edgecolor="black", color="red"
)
ax[0].set_title("Survived= 0")
x1 = list(range(0, 85, 5))
ax[0].set_xticks(x1)
data[data["Survived"] == 1].Age.plot.hist(
    ax=ax[1], color="green", bins=20, edgecolor="black"
)
ax[1].set_title("Survived= 1")
x2 = list(range(0, 85, 5))
ax[1].set_xticks(x2)
plt.show()
sns.factorplot("Pclass", "Survived", col="Initial", data=data)
plt.show()
pd.crosstab(
    [data.Embarked, data.Pclass], [data.Sex, data.Survived], margins=True
).style.background_gradient(cmap="summer_r")
sns.factorplot("Embarked", "Survived", data=data)
fig = plt.gcf()
fig.set_size_inches(5, 3)
plt.show()
f, ax = plt.subplots(2, 2, figsize=(20, 15))
sns.countplot("Embarked", data=data, ax=ax[0, 0])
ax[0, 0].set_title("No. Of Passengers Boarded")
sns.countplot("Embarked", hue="Sex", data=data, ax=ax[0, 1])
ax[0, 1].set_title("Male-Female Split for Embarked")
sns.countplot("Embarked", hue="Survived", data=data, ax=ax[1, 0])
ax[1, 0].set_title("Embarked vs Survived")
sns.countplot("Embarked", hue="Pclass", data=data, ax=ax[1, 1])
ax[1, 1].set_title("Embarked vs Pclass")
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
sns.factorplot("Pclass", "Survived", hue="Sex", col="Embarked", data=data)
plt.show()
# Fill missing ports with "S". The result is assigned back explicitly rather
# than relying on an in-place update made through the column selection.
data["Embarked"] = data["Embarked"].fillna("S")
data.Embarked.isnull().any()  # Finally No NaN values
pd.crosstab([data.SibSp], data.Survived).style.background_gradient(cmap="summer_r")
f, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.barplot("SibSp", "Survived", data=data, ax=ax[0])
ax[0].set_title("SibSp vs Survived")
sns.factorplot("SibSp", "Survived", data=data, ax=ax[1])
ax[1].set_title("SibSp vs Survived")
plt.close(2)
plt.show()
pd.crosstab(data.SibSp, data.Pclass).style.background_gradient(cmap="summer_r")
pd.crosstab(data.Parch, data.Pclass).style.background_gradient(cmap="summer_r")
f, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.barplot("Parch", "Survived", data=data, ax=ax[0])
ax[0].set_title("Parch vs Survived")
sns.factorplot("Parch", "Survived", data=data, ax=ax[1])
ax[1].set_title("Parch vs
Survived")
plt.close(2)
plt.show()
print("Highest Fare was:", data["Fare"].max())
print("Lowest Fare was:", data["Fare"].min())
print("Average Fare was:", data["Fare"].mean())
# Fare distribution of each passenger class, one subplot per class.
f, ax = plt.subplots(1, 3, figsize=(20, 8))
sns.distplot(data[data["Pclass"] == 1].Fare, ax=ax[0])
ax[0].set_title("Fares in Pclass 1")
sns.distplot(data[data["Pclass"] == 2].Fare, ax=ax[1])
ax[1].set_title("Fares in Pclass 2")
sns.distplot(data[data["Pclass"] == 3].Fare, ax=ax[2])
ax[2].set_title("Fares in Pclass 3")
plt.show()
sns.heatmap(
    data.corr(), annot=True, cmap="RdYlGn", linewidths=0.2
)  # data.corr()-->correlation matrix
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()
# Bucket "Age" into five bands (0..4) with edges at 16, 32, 48 and 64.
data["Age_band"] = 0
data.loc[data["Age"] <= 16, "Age_band"] = 0
data.loc[(data["Age"] > 16) & (data["Age"] <= 32), "Age_band"] = 1
data.loc[(data["Age"] > 32) & (data["Age"] <= 48), "Age_band"] = 2
data.loc[(data["Age"] > 48) & (data["Age"] <= 64), "Age_band"] = 3
data.loc[data["Age"] > 64, "Age_band"] = 4
data.head(2)
data["Age_band"].value_counts().to_frame().style.background_gradient(
    cmap="summer"
)  # checking the number of passenegers in each band
sns.factorplot("Age_band", "Survived", data=data, col="Pclass")
plt.show()
data["Family_Size"] = 0
data["Family_Size"] = data["Parch"] + data["SibSp"]  # family size
# "Alone" is 1 for passengers whose family size is 0, otherwise 0.
data["Alone"] = 0
data.loc[data.Family_Size == 0, "Alone"] = 1  # Alone
f, ax = plt.subplots(1, 2, figsize=(18, 6))
sns.factorplot("Family_Size", "Survived", data=data, ax=ax[0])
ax[0].set_title("Family_Size vs Survived")
sns.factorplot("Alone", "Survived", data=data, ax=ax[1])
ax[1].set_title("Alone vs Survived")
plt.close(2)
plt.close(3)
plt.show()
sns.factorplot("Alone", "Survived", data=data, hue="Sex", col="Pclass")
plt.show()
# Split "Fare" into four quantile-based ranges.
data["Fare_Range"] = pd.qcut(data["Fare"], 4)
data.groupby(["Fare_Range"])["Survived"].mean().to_frame().style.background_gradient(
    cmap="summer_r"
)
# Encode the fare as an ordinal category using fixed thresholds.
data["Fare_cat"] = 0
data.loc[data["Fare"] <= 7.91, "Fare_cat"] = 0
data.loc[(data["Fare"] > 7.91) & (data["Fare"] <=
14.454), "Fare_cat"] = 1
data.loc[(data["Fare"] > 14.454) & (data["Fare"] <= 31), "Fare_cat"] = 2
data.loc[(data["Fare"] > 31) & (data["Fare"] <= 513), "Fare_cat"] = 3
sns.factorplot("Fare_cat", "Survived", data=data, hue="Sex")
plt.show()
# Encode the string-valued features as integers. Each result is assigned back
# explicitly rather than relying on an in-place update made through the
# column selection.
data["Sex"] = data["Sex"].replace(["male", "female"], [0, 1])
data["Embarked"] = data["Embarked"].replace(["S", "C", "Q"], [0, 1, 2])
data["Initial"] = data["Initial"].replace(
    ["Mr", "Mrs", "Miss", "Master", "Other"], [0, 1, 2, 3, 4]
)
# Drop the raw columns that have been replaced by engineered features.
data.drop(
    ["Name", "Age", "Ticket", "Fare", "Cabin", "Fare_Range", "PassengerId"],
    axis=1,
    inplace=True,
)
sns.heatmap(
    data.corr(), annot=True, cmap="RdYlGn", linewidths=0.2, annot_kws={"size": 20}
)
fig = plt.gcf()
fig.set_size_inches(18, 15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
from sklearn import metrics  # accuracy measure
from sklearn import svm  # support vector Machine
from sklearn.ensemble import RandomForestClassifier  # Random Forest
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import confusion_matrix  # for confusion matrix
from sklearn.model_selection import train_test_split  # training and testing data split
from sklearn.naive_bayes import GaussianNB  # Naive bayes
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.tree import DecisionTreeClassifier  # Decision Tree

# 70/30 split, stratified on the target.
train, test = train_test_split(
    data, test_size=0.3, random_state=0, stratify=data["Survived"]
)
# The first column is used as the target, every other column as a feature.
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = data[data.columns[1:]]
Y = data["Survived"]
model = svm.SVC(kernel="rbf", C=1, gamma=0.1)
model.fit(train_X, train_Y)
prediction1 = model.predict(test_X)
print("Accuracy for rbf SVM is ", metrics.accuracy_score(prediction1, test_Y))
model = svm.SVC(kernel="linear", C=0.1, gamma=0.1)
model.fit(train_X, train_Y)
prediction2 = model.predict(test_X)
print("Accuracy for linear SVM is", metrics.accuracy_score(prediction2,
test_Y))
model = LogisticRegression()
model.fit(train_X, train_Y)
prediction3 = model.predict(test_X)
print(
    "The accuracy of the Logistic Regression is",
    metrics.accuracy_score(prediction3, test_Y),
)
model = DecisionTreeClassifier()
model.fit(train_X, train_Y)
prediction4 = model.predict(test_X)
print(
    "The accuracy of the Decision Tree is", metrics.accuracy_score(prediction4, test_Y)
)
model = KNeighborsClassifier()
model.fit(train_X, train_Y)
prediction5 = model.predict(test_X)
print("The accuracy of the KNN is", metrics.accuracy_score(prediction5, test_Y))
# Accuracy of KNN for n_neighbors = 1..10.
a_index = list(range(1, 11))
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Scores are gathered in a list and turned into a Series once, because
# ``Series.append`` is not available in pandas 2.x.
knn_scores = []
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    knn_scores.append(metrics.accuracy_score(prediction, test_Y))
a = pd.Series(knn_scores)
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print(
    "Accuracies for different values of n are:",
    a.values,
    "with the max value as ",
    a.values.max(),
)
model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print("The accuracy of the NaiveBayes is", metrics.accuracy_score(prediction6, test_Y))
model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print(
    "The accuracy of the Random Forests is", metrics.accuracy_score(prediction7, test_Y)
)
from sklearn.model_selection import KFold  # for K-fold cross validation
from sklearn.model_selection import cross_val_predict  # prediction
from sklearn.model_selection import cross_val_score  # score evaluation

# ``random_state`` only matters when ``shuffle=True``; passing it without
# shuffling is rejected by current scikit-learn, so it is left out. The folds
# are the same ten consecutive, unshuffled splits as before.
kfold = KFold(n_splits=10)  # k=10, split the data into 10 equal parts
xyz = []
accuracy = []
std = []
classifiers = [
    "Linear Svm",
    "Radial Svm",
    "Logistic Regression",
    "KNN",
    "Decision Tree",
    "Naive Bayes",
    "Random Forest",
]
models = [
    svm.SVC(kernel="linear"),
    svm.SVC(kernel="rbf"),
    LogisticRegression(),
KNeighborsClassifier(n_neighbors=9),
    DecisionTreeClassifier(),
    GaussianNB(),
    RandomForestClassifier(n_estimators=100),
]
# Cross-validate every model and keep the mean, the standard deviation and
# the raw per-fold accuracies.
for i in models:
    model = i
    cv_result = cross_val_score(model, X, Y, cv=kfold, scoring="accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2 = pd.DataFrame({"CV Mean": xyz, "Std": std}, index=classifiers)
new_models_dataframe2
plt.subplots(figsize=(12, 6))
# One row of per-fold accuracies per classifier; transposed so that every
# classifier becomes one box in the box plot.
box = pd.DataFrame(accuracy, index=[classifiers])
box.T.boxplot()
new_models_dataframe2["CV Mean"].plot.barh(width=0.8)
plt.title("Average CV Mean Accuracy")
fig = plt.gcf()
fig.set_size_inches(8, 5)
plt.show()
# Confusion matrix of the 10-fold cross-validated predictions of every model.
f, ax = plt.subplots(3, 3, figsize=(12, 10))
y_pred = cross_val_predict(svm.SVC(kernel="rbf"), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 0], annot=True, fmt="2.0f")
ax[0, 0].set_title("Matrix for rbf-SVM")
y_pred = cross_val_predict(svm.SVC(kernel="linear"), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 1], annot=True, fmt="2.0f")
ax[0, 1].set_title("Matrix for Linear-SVM")
y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 2], annot=True, fmt="2.0f")
ax[0, 2].set_title("Matrix for KNN")
y_pred = cross_val_predict(RandomForestClassifier(n_estimators=100), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 0], annot=True, fmt="2.0f")
ax[1, 0].set_title("Matrix for Random-Forests")
y_pred = cross_val_predict(LogisticRegression(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 1], annot=True, fmt="2.0f")
ax[1, 1].set_title("Matrix for Logistic Regression")
y_pred = cross_val_predict(DecisionTreeClassifier(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 2], annot=True, fmt="2.0f")
ax[1, 2].set_title("Matrix for Decision Tree")
y_pred = cross_val_predict(GaussianNB(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[2, 0], annot=True, fmt="2.0f")
ax[2,
0].set_title("Matrix for Naive Bayes")
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
from sklearn.model_selection import GridSearchCV

# Grid search over the SVM hyper-parameters.
C = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
kernel = ["rbf", "linear"]
hyper = {"kernel": kernel, "C": C, "gamma": gamma}
gd = GridSearchCV(estimator=svm.SVC(), param_grid=hyper, verbose=True)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
# Grid search over the number of trees of the random forest.
n_estimators = range(100, 1000, 100)
hyper = {"n_estimators": n_estimators}
gd = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0), param_grid=hyper, verbose=True
)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of seven different classifiers.
ensemble_lin_rbf = VotingClassifier(
    estimators=[
        ("KNN", KNeighborsClassifier(n_neighbors=10)),
        ("RBF", svm.SVC(probability=True, kernel="rbf", C=0.5, gamma=0.1)),
        ("RFor", RandomForestClassifier(n_estimators=500, random_state=0)),
        ("LR", LogisticRegression(C=0.05)),
        ("DT", DecisionTreeClassifier(random_state=0)),
        ("NB", GaussianNB()),
        ("svm", svm.SVC(kernel="linear", probability=True)),
    ],
    voting="soft",
).fit(train_X, train_Y)
print("The accuracy for ensembled model is:", ensemble_lin_rbf.score(test_X, test_Y))
cross = cross_val_score(ensemble_lin_rbf, X, Y, cv=10, scoring="accuracy")
print("The cross validated score is", cross.mean())
from sklearn.ensemble import BaggingClassifier

# Bagging with KNN as the base estimator.
model = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=3), random_state=0, n_estimators=700
)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print("The accuracy for bagged KNN is:", metrics.accuracy_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for bagged KNN is:", result.mean())
# Bagging with a decision tree as the base estimator.
model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100
)
model.fit(train_X, train_Y)
prediction =
model.predict(test_X)
print(
    "The accuracy for bagged Decision Tree is:",
    metrics.accuracy_score(prediction, test_Y),
)
result = cross_val_score(model, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for bagged Decision Tree is:", result.mean())
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=200, random_state=0, learning_rate=0.1)
result = cross_val_score(ada, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for AdaBoost is:", result.mean())
from sklearn.ensemble import GradientBoostingClassifier

grad = GradientBoostingClassifier(n_estimators=500, random_state=0, learning_rate=0.1)
result = cross_val_score(grad, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for Gradient Boosting is:", result.mean())
import xgboost as xg

xgboost = xg.XGBClassifier(n_estimators=900, learning_rate=0.1)
result = cross_val_score(xgboost, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for XGBoost is:", result.mean())
# Grid search over the AdaBoost hyper-parameters.
n_estimators = list(range(100, 1100, 100))
learn_rate = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
hyper = {"n_estimators": n_estimators, "learning_rate": learn_rate}
gd = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=hyper, verbose=True)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
# Confusion matrix of the cross-validated AdaBoost predictions.
ada = AdaBoostClassifier(n_estimators=200, random_state=0, learning_rate=0.05)
result = cross_val_predict(ada, X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, result), cmap="winter", annot=True, fmt="2.0f")
plt.show()
# Feature importances of four tree ensembles, one subplot each.
f, ax = plt.subplots(2, 2, figsize=(15, 12))
model = RandomForestClassifier(n_estimators=500, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[0, 0]
)
ax[0, 0].set_title("Feature Importance in Random Forests")
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_,
X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[0, 1], color="#ddff11"
)
ax[0, 1].set_title("Feature Importance in AdaBoost")
model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[1, 0], cmap="RdYlGn_r"
)
ax[1, 0].set_title("Feature Importance in Gradient Boosting")
model = xg.XGBClassifier(n_estimators=900, learning_rate=0.1)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[1, 1], color="#FD0F00"
)
ax[1, 1].set_title("Feature Importance in XgBoost")
plt.show()
================================================
FILE: stress_tests/kaggle/kaggle17.py
================================================
import modin.pandas as pd

# Load the data and look at its columns and at the "Price" column.
melbourne_file_path = "melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path)
print(melbourne_data.columns)
melbourne_price_data = melbourne_data.Price
print(melbourne_price_data.head())
# Summary statistics of a two-column selection.
columns_of_interest = ["Landsize", "BuildingArea"]
two_columns_of_data = melbourne_data[columns_of_interest]
two_columns_of_data.describe()
================================================
FILE: stress_tests/kaggle/kaggle18.py
================================================
#!/usr/bin/env python
# noqa: E902
import matplotlib

# Select the "PS" matplotlib backend before pyplot is imported.
matplotlib.use("PS")
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
# NOTE(review): this script imports plain pandas rather than modin.pandas
# -- confirm that this is intended.
import pandas as pd
import seaborn as sns

sns.set(style="white")
import warnings
from collections import Counter

import bokeh.plotting as bp
import plotly.graph_objs as go
import plotly.offline as py
from bokeh.models import HoverTool  # BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import output_notebook, show  # figure
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import
LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud

warnings.filterwarnings("ignore")
import logging

logging.getLogger("lda").setLevel(logging.WARNING)
# Fetch the NLTK resources used by the tokenizers and the stop-word list.
nltk.download("punkt")
nltk.download("stopwords")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.shape)
print(test.shape)
train.dtypes
train.head()
train.price.describe()
# Histogram of the raw price (left) and of log(price + 1) (right).
plt.subplot(1, 2, 1)
(train["price"]).plot.hist(bins=50, figsize=(20, 10), edgecolor="white", range=[0, 250])
plt.xlabel("price+", fontsize=17)
plt.ylabel("frequency", fontsize=17)
plt.tick_params(labelsize=15)
plt.title("Price Distribution - Training Set", fontsize=17)
plt.subplot(1, 2, 2)
np.log(train["price"] + 1).plot.hist(bins=50, figsize=(20, 10), edgecolor="white")
plt.xlabel("log(price+1)", fontsize=17)
plt.ylabel("frequency", fontsize=17)
plt.tick_params(labelsize=15)
plt.title("Log(Price) Distribution - Training Set", fontsize=17)
plt.show()
train.shipping.value_counts() / len(train)
# Prices split by the value of the "shipping" flag.
prc_shipBySeller = train.loc[train.shipping == 1, "price"]
prc_shipByBuyer = train.loc[train.shipping == 0, "price"]
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(
    np.log(prc_shipBySeller + 1),
    color="#8CB4E1",
    alpha=1.0,
    bins=50,
    label="Price when Seller pays Shipping",
)
ax.hist(
    np.log(prc_shipByBuyer + 1),
    color="#007D00",
    alpha=0.7,
    bins=50,
    label="Price when Buyer pays Shipping",
)
ax.set(title="Histogram Comparison", ylabel="% of Dataset in Bin")
plt.xlabel("log(price+1)", fontsize=17)
plt.ylabel("frequency", fontsize=17)
plt.title("Price Distribution by Shipping Type", fontsize=17)
plt.tick_params(labelsize=15)
plt.show()
print(
    "There are %d unique values in the category column."
    % train["category_name"].nunique()
)
train["category_name"].value_counts()[:5]
print(
    "There are %d items that do not have a label."
% train["category_name"].isnull().sum()
)


def split_cat(text):
    """
    Split a "/"-separated category path into exactly three levels.

    Parameters
    ----------
    text : str
        Category path such as ``"a/b/c"``. Values without a ``split``
        method (for example the NaN of a missing category) are accepted.

    Returns
    -------
    list of str or tuple of str
        The first three levels of the path, padded with ``"No Label"`` when
        the path has fewer than three; three times ``"No Label"`` when
        `text` cannot be split at all.
    """
    no_label = "No Label"
    try:
        parts = text.split("/")
    except AttributeError:
        return (no_label, no_label, no_label)
    # The callers transpose the results with ``zip(*...)`` and unpack them
    # into three columns, so every row has to provide exactly three levels.
    return (parts + [no_label] * 3)[:3]


train["general_cat"], train["subcat_1"], train["subcat_2"] = zip(
    *train["category_name"].apply(split_cat)
)
train.head()
test["general_cat"], test["subcat_1"], test["subcat_2"] = zip(
    *test["category_name"].apply(split_cat)
)
print("There are %d unique first sub-categories." % train["subcat_1"].nunique())
print("There are %d unique second sub-categories." % train["subcat_2"].nunique())
# Bar chart of the number of items per main category.
x = train["general_cat"].value_counts().index.values.astype("str")
y = train["general_cat"].value_counts().values
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]
trace1 = go.Bar(x=x, y=y, text=pct)
layout = {
    "title": "Number of Items by Main Category",
    "yaxis": {"title": "Count"},
    "xaxis": {"title": "Category"},
}
fig = {"data": [trace1], "layout": layout}
py.iplot(fig)
# Bar chart of the 15 most frequent first-level sub-categories.
x = train["subcat_1"].value_counts().index.values.astype("str")[:15]
y = train["subcat_1"].value_counts().values[:15]
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))][:15]
trace1 = go.Bar(
    x=x,
    y=y,
    text=pct,
    marker={
        "color": y,
        "colorscale": "Portland",
        "showscale": True,
        "reversescale": False,
    },
)
layout = {
    "title": "Number of Items by Sub Category (Top 15)",
    "yaxis": {"title": "Count"},
    "xaxis": {"title": "SubCategory"},
}
fig = {"data": [trace1], "layout": layout}
py.iplot(fig)
# Box plot of log(price + 1) for every main category.
general_cats = train["general_cat"].unique()
x = [train.loc[train["general_cat"] == cat, "price"] for cat in general_cats]
data = [
    go.Box(x=np.log(x[i] + 1), name=general_cats[i]) for i in range(len(general_cats))
]
layout = {
    "title": "Price Distribution by General Category",
    "yaxis": {"title": "Frequency"},
    "xaxis": {"title": "Category"},
}
fig = {"data": data, "layout": layout}
py.iplot(fig)
print(
    "There are %d unique brand names in the training dataset."
% train["brand_name"].nunique()
)
x = train["brand_name"].value_counts().index.values.astype("str")[:10]
y = train["brand_name"].value_counts().values[:10]

# Punctuation, digits, carriage returns, tabs and newlines. Compiled once
# here instead of on every call of the helpers below.
_PUNCT_DIGIT_WS_RE = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")


def wordCount(text):
    """
    Count the words of `text` that are informative.

    Punctuation, digits and line breaks are replaced by spaces; a word is
    counted when it is longer than three characters and is not in
    ``stop_words.ENGLISH_STOP_WORDS``.

    Parameters
    ----------
    text : str
        Text to analyse. Values without a ``lower`` method (for example the
        NaN of a missing description) are accepted.

    Returns
    -------
    int
        Number of counted words; 0 when `text` is not a string.
    """
    try:
        text = text.lower()
    except AttributeError:
        return 0
    txt = _PUNCT_DIGIT_WS_RE.sub(" ", text)
    return sum(
        1
        for w in txt.split(" ")
        if len(w) > 3 and w not in stop_words.ENGLISH_STOP_WORDS
    )


train["desc_len"] = train["item_description"].apply(wordCount)
test["desc_len"] = test["item_description"].apply(wordCount)
train.head()
# Mean price for every description length.
df = train.groupby("desc_len")["price"].mean().reset_index()
trace1 = go.Scatter(
    x=df["desc_len"],
    y=np.log(df["price"] + 1),
    mode="lines+markers",
    name="lines+markers",
)
layout = {
    "title": "Average Log(Price) by Description Length",
    "yaxis": {"title": "Average Log(Price)"},
    "xaxis": {"title": "Description Length"},
}
fig = {"data": [trace1], "layout": layout}
py.iplot(fig)
train.item_description.isnull().sum()
train = train[pd.notnull(train["item_description"])]
stop = set(stopwords.words("english"))


def tokenize(text):
    """
    Turn `text` into a list of lower-cased word tokens.

    Punctuation, digits and line breaks are replaced by spaces, the text is
    split into sentences with ``sent_tokenize`` and into words with
    ``word_tokenize``. Tokens found in ``stop``, tokens without any letter
    and tokens shorter than three characters are dropped.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens in lower case. When a ``TypeError`` is raised
        (e.g. `text` is not a string) the input and the error are printed
        and an empty list is returned.
    """
    try:
        text = _PUNCT_DIGIT_WS_RE.sub(" ", text)  # remove punctuation
        tokens = [
            token for sent in sent_tokenize(text) for token in word_tokenize(sent)
        ]
        return [
            token.lower()
            for token in tokens
            if token.lower() not in stop
            and re.search("[a-zA-Z]", token)
            and len(token) >= 3
        ]
    except TypeError as err:
        print(text, err)
        # An explicit empty result keeps ``len()`` and iteration working for
        # callers, which an implicit ``None`` would not.
        return []


# Tokens of all descriptions of each main category.
cat_desc = {}
for cat in general_cats:
    text = " ".join(train.loc[train["general_cat"] == cat, "item_description"].values)
    cat_desc[cat] = tokenize(text)
flat_lst = [item for sublist in list(cat_desc.values()) for item in sublist]
allWordsCount =
Counter(flat_lst) all_top10 = allWordsCount.most_common(20) x = [w[0] for w in all_top10] y = [w[1] for w in all_top10] trace1 = go.Bar(x=x, y=y, text=pct) layout = { "title": "Word Frequency", "yaxis": {"title": "Count"}, "xaxis": {"title": "Word"}, } fig = {"data": [trace1], "layout": layout} py.iplot(fig) stop = set(stopwords.words("english")) def tokenize(text): try: regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]") text = regex.sub(" ", text) # remove punctuation tokens_ = [word_tokenize(s) for s in sent_tokenize(text)] tokens = [] for token_by_sent in tokens_: tokens += token_by_sent tokens = list(filter(lambda t: t.lower() not in stop, tokens)) filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)] filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3] return filtered_tokens except TypeError as err: print(text, err) train["tokens"] = train["item_description"].map(tokenize) test["tokens"] = test["item_description"].map(tokenize) train.reset_index(drop=True, inplace=True) test.reset_index(drop=True, inplace=True) for description, tokens in zip( train["item_description"].head(), train["tokens"].head() ): print("description:", description) print("tokens:", tokens) print() cat_desc = {} for cat in general_cats: text = " ".join(train.loc[train["general_cat"] == cat, "item_description"].values) cat_desc[cat] = tokenize(text) import sys sys.exit() women100 = Counter(cat_desc["Women"]).most_common(100) beauty100 = Counter(cat_desc["Beauty"]).most_common(100) kids100 = Counter(cat_desc["Kids"]).most_common(100) electronics100 = Counter(cat_desc["Electronics"]).most_common(100) def generate_wordcloud(tup): wordcloud = WordCloud( background_color="white", max_words=50, max_font_size=40, random_state=42 ).generate(str(tup)) return wordcloud fig, axes = plt.subplots(2, 2, figsize=(30, 15)) ax = axes[0, 0] ax.imshow(generate_wordcloud(women100), interpolation="bilinear") ax.axis("off") ax.set_title("Women Top 100", 
fontsize=30)
ax = axes[0, 1]
ax.imshow(generate_wordcloud(beauty100))
ax.axis("off")
ax.set_title("Beauty Top 100", fontsize=30)
ax = axes[1, 0]
ax.imshow(generate_wordcloud(kids100))
ax.axis("off")
ax.set_title("Kids Top 100", fontsize=30)
ax = axes[1, 1]
ax.imshow(generate_wordcloud(electronics100))
ax.axis("off")
ax.set_title("Electronic Top 100", fontsize=30)
# TF-IDF over unigrams and bigrams of all train and test descriptions.
vectorizer = TfidfVectorizer(
    min_df=10, max_features=180000, tokenizer=tokenize, ngram_range=(1, 2)
)
all_desc = np.append(train["item_description"].values, test["item_description"].values)
vz = vectorizer.fit_transform(list(all_desc))
# Table of the idf weight of every feature.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=["tfidf"]).from_dict(dict(tfidf), orient="index")
tfidf.columns = ["tfidf"]
tfidf.sort_values(by=["tfidf"], ascending=True).head(10)
tfidf.sort_values(by=["tfidf"], ascending=False).head(10)
# Work on a random sample of the combined train and test data.
trn = train.copy()
tst = test.copy()
trn["is_train"] = 1
tst["is_train"] = 0
sample_sz = 15000
combined_df = pd.concat([trn, tst])
combined_sample = combined_df.sample(n=sample_sz)
# NOTE: this fits ``vectorizer`` again, now on the sample only.
vz_sample = vectorizer.fit_transform(list(combined_sample["item_description"]))
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 30 components before running t-SNE.
n_comp = 30
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
output_notebook()
plot_tfidf = bp.figure(
    plot_width=700,
    plot_height=600,
    title="tf-idf clustering of the item description",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)
combined_sample.reset_index(inplace=True, drop=True)
# 2-D embedding together with the columns shown in the hover tool.
tfidf_df = pd.DataFrame(tsne_tfidf, columns=["x", "y"])
tfidf_df["description"] = combined_sample["item_description"]
tfidf_df["tokens"] = combined_sample["tokens"]
tfidf_df["category"] = combined_sample["general_cat"]
plot_tfidf.scatter(x="x",
y="y", source=tfidf_df, alpha=0.7)
hover = plot_tfidf.select({"type": HoverTool})
hover.tooltips = {
    "description": "@description",
    "tokens": "@tokens",
    "category": "@category",
}
show(plot_tfidf)
from sklearn.cluster import MiniBatchKMeans

num_clusters = 30  # need to be selected wisely
kmeans_model = MiniBatchKMeans(
    n_clusters=num_clusters,
    init="k-means++",
    n_init=1,
    init_size=1000,
    batch_size=1000,
    verbose=0,
    max_iter=1000,
)
# First pass: cluster ``vz`` and print the ten highest-weighted terms of
# every centroid.
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i)
    aux = ""
    for j in sorted_centroids[i, :10]:
        aux += terms[j] + " | "
    print(aux)
    print()
# Second pass: refit the same model on ``vz_sample``; the results of the
# first pass are overwritten.
kmeans = kmeans_model.fit(vz_sample)
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)
# Embed the distances to the cluster centres in two dimensions.
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
# One colour per cluster (30 entries, matching ``num_clusters``).
colormap = np.array(
    [
        "#6d8dca",
        "#69de53",
        "#723bca",
        "#c3e14c",
        "#c84dc9",
        "#68af4e",
        "#6e6cd5",
        "#e3be38",
        "#4e2d7c",
        "#5fdfa8",
        "#d34690",
        "#3f6d31",
        "#d44427",
        "#7fcdd8",
        "#cb4053",
        "#5e9981",
        "#803a62",
        "#9b9e39",
        "#c88cca",
        "#e1c37b",
        "#34223b",
        "#bdd8a3",
        "#6e3326",
        "#cfbdce",
        "#d07d3c",
        "#52697d",
        "#194196",
        "#d27c88",
        "#36422b",
        "#b68f79",
    ]
)
kmeans_df = pd.DataFrame(tsne_kmeans, columns=["x", "y"])
kmeans_df["cluster"] = kmeans_clusters
kmeans_df["description"] = combined_sample["item_description"]
kmeans_df["category"] = combined_sample["general_cat"]
plot_kmeans = bp.figure(
    plot_width=700,
    plot_height=600,
    title="KMeans clustering of the description",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)
source = ColumnDataSource(
    data={
        "x": kmeans_df["x"],
        "y": kmeans_df["y"],
        "color": colormap[kmeans_clusters],
        "description": kmeans_df["description"],
        "category": kmeans_df["category"],
        "cluster": kmeans_df["cluster"],
    }
)
plot_kmeans.scatter(x="x", y="y", color="color", source=source) hover = plot_kmeans.select({"type": HoverTool}) hover.tooltips = { "description": "@description", "category": "@category", "cluster": "@cluster", } show(plot_kmeans) cvectorizer = CountVectorizer( min_df=4, max_features=180000, tokenizer=tokenize, ngram_range=(1, 2) ) cvz = cvectorizer.fit_transform(combined_sample["item_description"]) lda_model = LatentDirichletAllocation( n_components=20, learning_method="online", max_iter=20, random_state=42 ) X_topics = lda_model.fit_transform(cvz) n_top_words = 10 topic_summaries = [] topic_word = lda_model.components_ # get the topic words vocab = cvectorizer.get_feature_names() for i, topic_dist in enumerate(topic_word): topic_words = np.array(vocab)[np.argsort(topic_dist)][: -(n_top_words + 1) : -1] topic_summaries.append(" ".join(topic_words)) print("Topic {}: {}".format(i, " | ".join(topic_words))) tsne_lda = tsne_model.fit_transform(X_topics) unnormalized = np.matrix(X_topics) doc_topic = unnormalized / unnormalized.sum(axis=1) lda_keys = [] for i, tweet in enumerate(combined_sample["item_description"]): lda_keys += [doc_topic[i].argmax()] lda_df = pd.DataFrame(tsne_lda, columns=["x", "y"]) lda_df["description"] = combined_sample["item_description"] lda_df["category"] = combined_sample["general_cat"] lda_df["topic"] = lda_keys lda_df["topic"] = lda_df["topic"].map(int) plot_lda = bp.figure( plot_width=700, plot_height=600, title="LDA topic visualization", tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", x_axis_type=None, y_axis_type=None, min_border=1, ) source = ColumnDataSource( data={ "x": lda_df["x"], "y": lda_df["y"], "color": colormap[lda_keys], "description": lda_df["description"], "topic": lda_df["topic"], "category": lda_df["category"], } ) plot_lda.scatter(source=source, x="x", y="y", color="color") hover = plot_kmeans.select({"type": HoverTool}) hover = plot_lda.select({"type": HoverTool}) hover.tooltips = { "description": "@description", 
"topic": "@topic", "category": "@category", } show(plot_lda) def prepareLDAData(): data = { "vocab": vocab, "doc_topic_dists": doc_topic, "doc_lengths": list(lda_df["len_docs"]), "term_frequency": cvectorizer.vocabulary_, "topic_term_dists": lda_model.components_, } return data import pyLDAvis lda_df["len_docs"] = combined_sample["tokens"].map(len) ldadata = prepareLDAData() pyLDAvis.enable_notebook() prepared_data = pyLDAvis.prepare(**ldadata) ================================================ FILE: stress_tests/kaggle/kaggle19.py ================================================ #!/usr/bin/env python # coding: utf-8 import matplotlib matplotlib.use("PS") import warnings import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns warnings.filterwarnings("ignore") train = pd.read_csv("train.csv") train.info() train.head() print( "The average person kills {:.4f} players, 99% of people have {} kills or less, while the most kills ever recorded is {}.".format( train["kills"].mean(), train["kills"].quantile(0.99), train["kills"].max() ) ) data = train.copy() data.loc[data["kills"] > data["kills"].quantile(0.99)] = "8+" plt.figure(figsize=(15, 10)) sns.countplot(data["kills"].astype("str").sort_values()) plt.title("Kill Count", fontsize=15) plt.show() data = train.copy() data = data[data["kills"] == 0] plt.figure(figsize=(15, 10)) plt.title("Damage Dealt by 0 killers", fontsize=15) plt.show() print( "{} players ({:.4f}%) have won without a single kill!".format( len(data[data["winPlacePerc"] == 1]), 100 * len(data[data["winPlacePerc"] == 1]) / len(train), ) ) data1 = train[train["damageDealt"] == 0].copy() print( "{} players ({:.4f}%) have won without dealing damage!".format( len(data1[data1["winPlacePerc"] == 1]), 100 * len(data1[data1["winPlacePerc"] == 1]) / len(train), ) ) kills = train.copy() kills["killsCategories"] = pd.cut( kills["kills"], [-1, 0, 2, 5, 10, 60], labels=["0_kills", "1-2_kills", "3-5_kills", "6-10_kills", "10+_kills"], 
) plt.figure(figsize=(15, 8)) sns.boxplot(x="killsCategories", y="winPlacePerc", data=kills) plt.show() print( "The average person walks for {:.1f}m, 99% of people have walked {}m or less, while the marathoner champion walked for {}m.".format( train["walkDistance"].mean(), train["walkDistance"].quantile(0.99), train["walkDistance"].max(), ) ) data = train.copy() data = data[data["walkDistance"] < train["walkDistance"].quantile(0.99)] plt.figure(figsize=(15, 10)) plt.title("Walking Distance Distribution", fontsize=15) sns.distplot(data["walkDistance"]) plt.show() print( "{} players ({:.4f}%) walked 0 meters. This means that they die before even taking a step or they are afk (more possible).".format( len(data[data["walkDistance"] == 0]), 100 * len(data1[data1["walkDistance"] == 0]) / len(train), ) ) print( "The average person drives for {:.1f}m, 99% of people have drived {}m or less, while the formula 1 champion drived for {}m.".format( train["rideDistance"].mean(), train["rideDistance"].quantile(0.99), train["rideDistance"].max(), ) ) data = train.copy() data = data[data["rideDistance"] < train["rideDistance"].quantile(0.9)] plt.figure(figsize=(15, 10)) plt.title("Ride Distance Distribution", fontsize=15) sns.distplot(data["rideDistance"]) plt.show() print( "{} players ({:.4f}%) drived for 0 meters. 
This means that they don't have a driving licence yet.".format( len(data[data["rideDistance"] == 0]), 100 * len(data1[data1["rideDistance"] == 0]) / len(train), ) ) f, ax1 = plt.subplots(figsize=(20, 10)) sns.pointplot( x="vehicleDestroys", y="winPlacePerc", data=data, color="#606060", alpha=0.8 ) plt.xlabel("Number of Vehicle Destroys", fontsize=15, color="blue") plt.ylabel("Win Percentage", fontsize=15, color="blue") plt.title("Vehicle Destroys/ Win Ratio", fontsize=20, color="blue") plt.grid() plt.show() print( "The average person swims for {:.1f}m, 99% of people have swimemd {}m or less, while the olympic champion swimmed for {}m.".format( train["swimDistance"].mean(), train["swimDistance"].quantile(0.99), train["swimDistance"].max(), ) ) data = train.copy() data = data[data["swimDistance"] < train["swimDistance"].quantile(0.95)] plt.figure(figsize=(15, 10)) plt.title("Swim Distance Distribution", fontsize=15) sns.distplot(data["swimDistance"]) plt.show() swim = train.copy() swim["swimDistance"] = pd.cut( swim["swimDistance"], [-1, 0, 5, 20, 5286], labels=["0m", "1-5m", "6-20m", "20m+"] ) plt.figure(figsize=(15, 8)) sns.boxplot(x="swimDistance", y="winPlacePerc", data=swim) plt.show() print( "The average person uses {:.1f} heal items, 99% of people use {} or less, while the doctor used {}.".format( train["heals"].mean(), train["heals"].quantile(0.99), train["heals"].max() ) ) print( "The average person uses {:.1f} boost items, 99% of people use {} or less, while the doctor used {}.".format( train["boosts"].mean(), train["boosts"].quantile(0.99), train["boosts"].max() ) ) data = train.copy() data = data[data["heals"] < data["heals"].quantile(0.99)] data = data[data["boosts"] < data["boosts"].quantile(0.99)] f, ax1 = plt.subplots(figsize=(20, 10)) sns.pointplot(x="heals", y="winPlacePerc", data=data, color="lime", alpha=0.8) sns.pointplot(x="boosts", y="winPlacePerc", data=data, color="blue", alpha=0.8) plt.text(4, 0.6, "Heals", color="lime", fontsize=17, 
style="italic") plt.text(4, 0.55, "Boosts", color="blue", fontsize=17, style="italic") plt.xlabel("Number of heal/boost items", fontsize=15, color="blue") plt.ylabel("Win Percentage", fontsize=15, color="blue") plt.title("Heals vs Boosts", fontsize=20, color="blue") plt.grid() plt.show() solos = train[train["numGroups"] > 50] duos = train[(train["numGroups"] > 25) & (train["numGroups"] <= 50)] squads = train[train["numGroups"] <= 25] print( "There are {} ({:.2f}%) solo games, {} ({:.2f}%) duo games and {} ({:.2f}%) squad games.".format( len(solos), 100 * len(solos) / len(train), len(duos), 100 * len(duos) / len(train), len(squads), 100 * len(squads) / len(train), ) ) f, ax1 = plt.subplots(figsize=(20, 10)) sns.pointplot(x="kills", y="winPlacePerc", data=solos, color="black", alpha=0.8) sns.pointplot(x="kills", y="winPlacePerc", data=duos, color="#CC0000", alpha=0.8) sns.pointplot(x="kills", y="winPlacePerc", data=squads, color="#3399FF", alpha=0.8) plt.text(37, 0.6, "Solos", color="black", fontsize=17, style="italic") plt.text(37, 0.55, "Duos", color="#CC0000", fontsize=17, style="italic") plt.text(37, 0.5, "Squads", color="#3399FF", fontsize=17, style="italic") plt.xlabel("Number of kills", fontsize=15, color="blue") plt.ylabel("Win Percentage", fontsize=15, color="blue") plt.title("Solo vs Duo vs Squad Kills", fontsize=20, color="blue") plt.grid() plt.show() f, ax1 = plt.subplots(figsize=(20, 10)) sns.pointplot(x="DBNOs", y="winPlacePerc", data=duos, color="#CC0000", alpha=0.8) sns.pointplot(x="DBNOs", y="winPlacePerc", data=squads, color="#3399FF", alpha=0.8) sns.pointplot(x="assists", y="winPlacePerc", data=duos, color="#FF6666", alpha=0.8) sns.pointplot(x="assists", y="winPlacePerc", data=squads, color="#CCE5FF", alpha=0.8) sns.pointplot(x="revives", y="winPlacePerc", data=duos, color="#660000", alpha=0.8) sns.pointplot(x="revives", y="winPlacePerc", data=squads, color="#000066", alpha=0.8) plt.text(14, 0.5, "Duos - Assists", color="#FF6666", fontsize=17, 
style="italic") plt.text(14, 0.45, "Duos - DBNOs", color="#CC0000", fontsize=17, style="italic") plt.text(14, 0.4, "Duos - Revives", color="#660000", fontsize=17, style="italic") plt.text(14, 0.35, "Squads - Assists", color="#CCE5FF", fontsize=17, style="italic") plt.text(14, 0.3, "Squads - DBNOs", color="#3399FF", fontsize=17, style="italic") plt.text(14, 0.25, "Squads - Revives", color="#000066", fontsize=17, style="italic") plt.xlabel("Number of DBNOs/Assits/Revives", fontsize=15, color="blue") plt.ylabel("Win Percentage", fontsize=15, color="blue") plt.title("Duo vs Squad DBNOs, Assists, and Revives", fontsize=20, color="blue") plt.grid() plt.show() f, ax = plt.subplots(figsize=(15, 15)) sns.heatmap(train.corr(), annot=True, linewidths=0.5, fmt=".1f", ax=ax) plt.show() k = 5 # number of variables for heatmap f, ax = plt.subplots(figsize=(11, 11)) cols = train.corr().nlargest(k, "winPlacePerc")["winPlacePerc"].index cm = np.corrcoef(train[cols].values.T) sns.set(font_scale=1.25) hm = sns.heatmap( cm, cbar=True, annot=True, square=True, fmt=".2f", annot_kws={"size": 10}, yticklabels=cols.values, xticklabels=cols.values, ) plt.show() train["playersJoined"] = train.groupby("matchId")["matchId"].transform("count") data = train.copy() data = data[data["playersJoined"] > 49] train["killsNorm"] = train["kills"] * ((100 - train["playersJoined"]) / 100 + 1) train["damageDealtNorm"] = train["damageDealt"] * ( (100 - train["playersJoined"]) / 100 + 1 ) train[["playersJoined", "kills", "killsNorm", "damageDealt", "damageDealtNorm"]][5:8] train["healsAndBoosts"] = train["heals"] + train["boosts"] train["totalDistance"] = ( train["walkDistance"] + train["rideDistance"] + train["swimDistance"] ) train["boostsPerWalkDistance"] = train["boosts"] / ( train["walkDistance"] + 1 ) # The +1 is to avoid infinity, because there are entries where boosts>0 and walkDistance=0. Strange. 
train["boostsPerWalkDistance"].fillna(0, inplace=True) train["healsPerWalkDistance"] = train["heals"] / ( train["walkDistance"] + 1 ) # The +1 is to avoid infinity, because there are entries where heals>0 and walkDistance=0. Strange. train["healsPerWalkDistance"].fillna(0, inplace=True) train["healsAndBoostsPerWalkDistance"] = train["healsAndBoosts"] / ( train["walkDistance"] + 1 ) # The +1 is to avoid infinity. train["healsAndBoostsPerWalkDistance"].fillna(0, inplace=True) train[ [ "walkDistance", "boosts", "boostsPerWalkDistance", "heals", "healsPerWalkDistance", "healsAndBoosts", "healsAndBoostsPerWalkDistance", ] ][40:45] train["killsPerWalkDistance"] = train["kills"] / ( train["walkDistance"] + 1 ) # The +1 is to avoid infinity, because there are entries where kills>0 and walkDistance=0. Strange. train["killsPerWalkDistance"].fillna(0, inplace=True) train[ ["kills", "walkDistance", "rideDistance", "killsPerWalkDistance", "winPlacePerc"] ].sort_values(by="killsPerWalkDistance").tail(10) train["team"] = [ 1 if i > 50 else 2 if (i > 25 & i <= 50) else 4 for i in train["numGroups"] ] train.head() ================================================ FILE: stress_tests/kaggle/kaggle20.py ================================================ import matplotlib matplotlib.use("PS") import time import matplotlib.pyplot as plt import numpy as np # linear algebra import seaborn as sns # data visualization library import modin.pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) data = pd.read_csv("data.csv") data.head() # head method show only first 5 rows col = data.columns print(col) y = data.diagnosis # M or B list = ["Unnamed: 32", "id", "diagnosis"] x = data.drop(list, axis=1) x.head() ax = sns.countplot(y, label="Count") # M = 212, B = 357 x.describe() data_dia = y data = x data_n_2 = (data - data.mean()) / (data.std()) # standardization data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) sns.violinplot( x="features", y="value", hue="diagnosis", data=data, split=True, inner="quart" ) plt.xticks(rotation=90) data = pd.concat([y, data_n_2.iloc[:, 10:20]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) sns.violinplot( x="features", y="value", hue="diagnosis", data=data, split=True, inner="quart" ) plt.xticks(rotation=90) data = pd.concat([y, data_n_2.iloc[:, 20:31]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) sns.violinplot( x="features", y="value", hue="diagnosis", data=data, split=True, inner="quart" ) plt.xticks(rotation=90) plt.figure(figsize=(10, 10)) sns.boxplot(x="features", y="value", hue="diagnosis", data=data) plt.xticks(rotation=90) sns.jointplot( x.loc[:, "concavity_worst"], x.loc[:, "concave points_worst"], kind="regg", color="#ce1414", ) sns.set(style="white") df = x.loc[:, ["radius_worst", "perimeter_worst", "area_worst"]] g = sns.PairGrid(df, diag_sharey=False) g.map_lower(sns.kdeplot, cmap="Blues_d") g.map_upper(plt.scatter) g.map_diag(sns.kdeplot, lw=3) sns.set(style="whitegrid", palette="muted") data_dia = y data = x data_n_2 = (data - data.mean()) / (data.std()) # standardization data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) 
tic = time.time() sns.swarmplot(x="features", y="value", hue="diagnosis", data=data) plt.xticks(rotation=90) data = pd.concat([y, data_n_2.iloc[:, 10:20]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) sns.swarmplot(x="features", y="value", hue="diagnosis", data=data) plt.xticks(rotation=90) data = pd.concat([y, data_n_2.iloc[:, 20:31]], axis=1) data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value") plt.figure(figsize=(10, 10)) sns.swarmplot(x="features", y="value", hue="diagnosis", data=data) toc = time.time() plt.xticks(rotation=90) print("swarm plot time: ", toc - tic, " s") f, ax = plt.subplots(figsize=(18, 18)) sns.heatmap(x.corr(), annot=True, linewidths=0.5, fmt=".1f", ax=ax) drop_list1 = [ "perimeter_mean", "radius_mean", "compactness_mean", "concave points_mean", "radius_se", "perimeter_se", "radius_worst", "perimeter_worst", "compactness_worst", "concave points_worst", "compactness_se", "concave points_se", "texture_worst", "area_worst", ] x_1 = x.drop(drop_list1, axis=1) # do not modify x, we will use it later x_1.head() f, ax = plt.subplots(figsize=(14, 14)) sns.heatmap(x_1.corr(), annot=True, linewidths=0.5, fmt=".1f", ax=ax) from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix # f1_score from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split( x_1, y, test_size=0.3, random_state=42 ) clf_rf = RandomForestClassifier(random_state=43) clr_rf = clf_rf.fit(x_train, y_train) ac = accuracy_score(y_test, clf_rf.predict(x_test)) print("Accuracy is: ", ac) cm = confusion_matrix(y_test, clf_rf.predict(x_test)) sns.heatmap(cm, annot=True, fmt="d") from sklearn.feature_selection import SelectKBest, chi2 select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train) print("Score list:", select_feature.scores_) print("Feature list:", 
x_train.columns) x_train_2 = select_feature.transform(x_train) x_test_2 = select_feature.transform(x_test) clf_rf_2 = RandomForestClassifier() clr_rf_2 = clf_rf_2.fit(x_train_2, y_train) ac_2 = accuracy_score(y_test, clf_rf_2.predict(x_test_2)) print("Accuracy is: ", ac_2) cm_2 = confusion_matrix(y_test, clf_rf_2.predict(x_test_2)) sns.heatmap(cm_2, annot=True, fmt="d") from sklearn.feature_selection import RFE clf_rf_3 = RandomForestClassifier() rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1) rfe = rfe.fit(x_train, y_train) print("Chosen best 5 feature by rfe:", x_train.columns[rfe.support_]) from sklearn.feature_selection import RFECV clf_rf_4 = RandomForestClassifier() rfecv = RFECV( estimator=clf_rf_4, step=1, cv=5, scoring="accuracy" ) # 5-fold cross-validation rfecv = rfecv.fit(x_train, y_train) print("Optimal number of features :", rfecv.n_features_) print("Best features :", x_train.columns[rfecv.support_]) import matplotlib.pyplot as plt plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score of number of selected features") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plt.show() clf_rf_5 = RandomForestClassifier() clr_rf_5 = clf_rf_5.fit(x_train, y_train) importances = clr_rf_5.feature_importances_ std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_], axis=0) indices = np.argsort(importances)[::-1] print("Feature ranking:") for f in range(x_train.shape[1]): print("%d. 
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) plt.figure(1, figsize=(14, 13)) plt.title("Feature importances") plt.bar( range(x_train.shape[1]), importances[indices], color="g", yerr=std[indices], align="center", ) plt.xticks(range(x_train.shape[1]), x_train.columns[indices], rotation=90) plt.xlim([-1, x_train.shape[1]]) plt.show() x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.3, random_state=42 ) x_train_N = (x_train - x_train.mean()) / (x_train.max() - x_train.min()) x_test_N = (x_test - x_test.mean()) / (x_test.max() - x_test.min()) from sklearn.decomposition import PCA pca = PCA() pca.fit(x_train_N) plt.figure(1, figsize=(14, 13)) plt.clf() plt.axes([0.2, 0.2, 0.7, 0.7]) plt.plot(pca.explained_variance_ratio_, linewidth=2) plt.axis("tight") plt.xlabel("n_components") plt.ylabel("explained_variance_ratio_") ================================================ FILE: stress_tests/kaggle/kaggle22.py ================================================ import matplotlib matplotlib.use("PS") import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer # CountVectorizer from sklearn.linear_model import LogisticRegression import modin.pandas as pd train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") subm = pd.read_csv("sample_submission.csv") train.head() train["comment_text"][0] train["comment_text"][2] lens = train.comment_text.str.len() lens.mean(), lens.std(), lens.max() lens.hist() label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] train["none"] = 1 - train[label_cols].max(axis=1) train.describe() len(train), len(test) COMMENT = "comment_text" train[COMMENT].fillna("unknown", inplace=True) test[COMMENT].fillna("unknown", inplace=True) import re import string re_tok = re.compile(f"([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])") def tokenize(s): return re_tok.sub(r" \1 ", s).split() n = train.shape[0] vec = TfidfVectorizer( ngram_range=(1, 2), tokenizer=tokenize, 
min_df=3, max_df=0.9, strip_accents="unicode", use_idf=1, smooth_idf=1, sublinear_tf=1, ) trn_term_doc = vec.fit_transform(train[COMMENT]) test_term_doc = vec.transform(test[COMMENT]) trn_term_doc, test_term_doc def pr(y_i, y): p = x[y == y_i].sum(0) return (p + 1) / ((y == y_i).sum() + 1) x = trn_term_doc test_x = test_term_doc def get_mdl(y): y = y.values r = np.log(pr(1, y) / pr(0, y)) m = LogisticRegression(C=4, dual=True) x_nb = x.multiply(r) return m.fit(x_nb, y), r preds = np.zeros((len(test), len(label_cols))) for i, j in enumerate(label_cols): print("fit", j) m, r = get_mdl(train[j]) preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1] submid = pd.DataFrame({"id": subm["id"]}) submission = pd.concat([submid, pd.DataFrame(preds, columns=label_cols)], axis=1) submission.to_csv("submission.csv", index=False) ================================================ FILE: stress_tests/kaggle/kaggle3.py ================================================ #!/usr/bin/env python import matplotlib matplotlib.use("PS") import matplotlib.pyplot as plt import numpy as np # linear algebra import seaborn as sns # visualization tool import modin.pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) data = pd.read_csv("pokemon.csv") data.info() data.corr() f, ax = plt.subplots(figsize=(18, 18)) sns.heatmap(data.corr(), annot=True, linewidths=0.5, fmt=".1f", ax=ax) data.head(10) data.columns data.Speed.plot( kind="line", color="g", label="Speed", linewidth=1, alpha=0.5, grid=True, linestyle=":", ) data.Defense.plot( color="r", label="Defense", linewidth=1, alpha=0.5, grid=True, linestyle="-." 
) plt.legend(loc="upper right") # legend = puts label into plot plt.xlabel("x axis") # label = name of label plt.ylabel("y axis") plt.title("Line Plot") # title = title of plot data.plot(kind="scatter", x="Attack", y="Defense", alpha=0.5, color="red") plt.xlabel("Attack") # label = name of label plt.ylabel("Defence") plt.title("Attack Defense Scatter Plot") # title = title of plot data.Speed.plot(kind="hist", bins=50, figsize=(12, 12)) data.Speed.plot(kind="hist", bins=50) dictionary = {"spain": "madrid", "usa": "vegas"} print(dictionary.keys()) print(dictionary.values()) dictionary["spain"] = "barcelona" # update existing entry print(dictionary) dictionary["france"] = "paris" # Add new entry print(dictionary) del dictionary["spain"] # remove entry with key 'spain' print(dictionary) print("france" in dictionary) # check include or not dictionary.clear() # remove all entries in dict print(dictionary) print(dictionary) # it gives error because dictionary is deleted data = pd.read_csv("pokemon.csv") series = data["Defense"] # data['Defense'] = series print(type(series)) data_frame = data[["Defense"]] # data[['Defense']] = data frame print(type(data_frame)) print(3 > 2) print(3 != 2) print(True and False) print(True or False) x = ( data["Defense"] > 200 ) # There are only 3 pokemons who have higher defense value than 200 data[x] data[np.logical_and(data["Defense"] > 200, data["Attack"] > 100)] data[(data["Defense"] > 200) & (data["Attack"] > 100)] i = 0 while i != 5: print("i is: ", i) i += 1 print(i, " is equal to 5") lis = [1, 2, 3, 4, 5] for i in lis: print("i is: ", i) print("") for index, value in enumerate(lis): print(index, " : ", value) print("") dictionary = {"spain": "madrid", "france": "paris"} for key, value in dictionary.items(): print(key, " : ", value) print("") for index, value in data[["Attack"]][0:1].iterrows(): print(index, " : ", value) def tuble_ex(): """return defined t tuble""" t = (1, 2, 3) return t a, b, c = tuble_ex() print(a, b, c) x = 2 def 
f(): x = 3 return x print(x) # x = 2 global scope print(f()) # x = 3 local scope x = 5 def f(): y = 2 * x # there is no local scope x return y print(f()) # it uses global scope x import builtins dir(builtins) def square(): """return square of value""" def add(): """add two local variable""" x = 2 y = 3 z = x + y return z return add() ** 2 print(square()) def f(a, b=1, c=2): y = a + b + c return y print(f(5)) print(f(5, 4, 3)) def f(*args): for i in args: print(i) f(1) print("") f(1, 2, 3, 4) def f(**kwargs): """print key and value of dictionary""" for ( key, value, ) in ( kwargs.items() ): # If you do not understand this part turn for loop part and look at dictionary in for loop print(key, " ", value) f(country="spain", capital="madrid", population=123456) number_list = [1, 2, 3] y = map(lambda x: x**2, number_list) print(list(y)) name = "ronaldo" it = iter(name) print(next(it)) # print next iteration print(*it) # print remaining iteration list1 = [1, 2, 3, 4] list2 = [5, 6, 7, 8] z = zip(list1, list2) print(z) z_list = list(z) print(z_list) un_zip = zip(*z_list) un_list1, un_list2 = list(un_zip) # unzip returns tuble print(un_list1) print(un_list2) print(type(un_list2)) num1 = [1, 2, 3] num2 = [i + 1 for i in num1] print(num2) num1 = [5, 10, 15] num2 = [i**2 if i == 10 else i - 5 if i < 7 else i + 5 for i in num1] print(num2) threshold = sum(data.Speed) / len(data.Speed) data["speed_level"] = ["high" if i > threshold else "low" for i in data.Speed] data.loc[:10, ["speed_level", "Speed"]] # we will learn loc more detailed later data = pd.read_csv("pokemon.csv") data.head() # head shows first 5 rows data.tail() data.columns data.shape data.info() print( data["Type 1"].value_counts(dropna=False) ) # if there are nan values that also be counted data.describe() # ignore null entries data.boxplot(column="Attack", by="Legendary") data_new = data.head() # I only take 5 rows into new data data_new melted = pd.melt(frame=data_new, id_vars="Name", value_vars=["Attack", 
"Defense"]) melted melted.pivot(index="Name", columns="variable", values="value") data1 = data.head() data2 = data.tail() conc_data_row = pd.concat( [data1, data2], axis=0, ignore_index=True ) # axis = 0 : adds dataframes in row conc_data_row data1 = data["Attack"].head() data2 = data["Defense"].head() conc_data_col = pd.concat([data1, data2], axis=1) # axis = 0 : adds dataframes in row conc_data_col data.dtypes data["Type 1"] = data["Type 1"].astype("category") data["Speed"] = data["Speed"].astype("float") data.dtypes data.info() data["Type 2"].value_counts(dropna=False) data1 = ( data # also we will use data to fill missing value so I assign it to data1 variable ) data1["Type 2"].dropna( inplace=True ) # inplace = True means we do not assign it to new variable. Changes automatically assigned to data assert 1 == 1 # return nothing because it is true assert data["Type 2"].notnull().all() # returns nothing because we drop nan values data["Type 2"].fillna("empty", inplace=True) assert ( data["Type 2"].notnull().all() ) # returns nothing because we do not have nan values country = ["Spain", "France"] population = ["11", "12"] list_label = ["country", "population"] list_col = [country, population] zipped = list(zip(list_label, list_col)) data_dict = dict(zipped) df = pd.DataFrame(data_dict) df df["capital"] = ["madrid", "paris"] df df["income"] = 0 # Broadcasting entire column df data1 = data.loc[:, ["Attack", "Defense", "Speed"]] data1.plot() data1.plot(subplots=True) plt.show() data1.plot(kind="scatter", x="Attack", y="Defense") plt.show() data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), normed=True) fig, axes = plt.subplots(nrows=2, ncols=1) data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), normed=True, ax=axes[0]) data1.plot( kind="hist", y="Defense", bins=50, range=(0, 250), normed=True, ax=axes[1], cumulative=True, ) plt.savefig("graph.png") plt data.describe() time_list = ["1992-03-08", "1992-04-12"] print(type(time_list[1])) # As you 
can see date is string datetime_object = pd.to_datetime(time_list) print(type(datetime_object)) import warnings warnings.filterwarnings("ignore") data2 = data.head() date_list = ["1992-01-10", "1992-02-10", "1992-03-10", "1993-03-15", "1993-03-16"] datetime_object = pd.to_datetime(date_list) data2["date"] = datetime_object data2 = data2.set_index("date") data2 print(data2.loc["1993-03-16"]) print(data2.loc["1992-03-10":"1993-03-16"]) data2.resample("A").mean() data2.resample("M").mean() data2.resample("M").first().interpolate("linear") data2.resample("M").mean().interpolate("linear") data = pd.read_csv("pokemon.csv") data = data.set_index("#") data.head() data["HP"][1] data.HP[1] data.loc[1, ["HP"]] data[["HP", "Attack"]] print(type(data["HP"])) # series print(type(data[["HP"]])) # data frames data.loc[1:10, "HP":"Defense"] # 10 and "Defense" are inclusive data.loc[10:1:-1, "HP":"Defense"] data.loc[1:10, "Speed":] boolean = data.HP > 200 data[boolean] first_filter = data.HP > 150 second_filter = data.Speed > 35 data[first_filter & second_filter] data.HP[data.Speed < 15] def div(n): return n / 2 data.HP.apply(div) data.HP.apply(lambda n: n / 2) data["total_power"] = data.Attack + data.Defense data.head() print(data.index.name) data.index.name = "index_name" data.head() data.head() data3 = data.copy() data3.index = range(100, 100 + len(data3.index), 1) data3.head() data = pd.read_csv("pokemon.csv") data.head() data1 = data.set_index(["Type 1", "Type 2"]) data1.head(100) dic = { "treatment": ["A", "A", "B", "B"], "gender": ["F", "M", "F", "M"], "response": [10, 45, 5, 9], "age": [15, 4, 72, 65], } df = pd.DataFrame(dic) df df.pivot(index="treatment", columns="gender", values="response") df1 = df.set_index(["treatment", "gender"]) df1 df1.unstack(level=0) df1.unstack(level=1) df2 = df1.swaplevel(0, 1) df2 df pd.melt(df, id_vars="treatment", value_vars=["age", "response"]) df df.groupby("treatment").mean() # mean is aggregation / reduce method 
df.groupby("treatment").age.max() df.groupby("treatment")[["age", "response"]].min() df.info() ================================================ FILE: stress_tests/kaggle/kaggle4.py ================================================ import matplotlib matplotlib.use("PS") import matplotlib.pyplot as plt # Matlab-style plotting import numpy as np # linear algebra import seaborn as sns import modin.pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) color = sns.color_palette() sns.set_style("darkgrid") import warnings def ignore_warn(*args, **kwargs): pass warnings.warn = ignore_warn # ignore annoying warning (from sklearn and seaborn) from scipy import stats from scipy.stats import norm, skew # for some statistics pd.set_option( "display.float_format", lambda x: "{:.3f}".format(x) ) # Limiting floats output to 3 decimal points train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") train.head(5) test.head(5) print("The train data size before dropping Id feature is : {} ".format(train.shape)) print("The test data size before dropping Id feature is : {} ".format(test.shape)) train_ID = train["Id"] test_ID = test["Id"] train.drop("Id", axis=1, inplace=True) test.drop("Id", axis=1, inplace=True) print("\nThe train data size after dropping Id feature is : {} ".format(train.shape)) print("The test data size after dropping Id feature is : {} ".format(test.shape)) fig, ax = plt.subplots() ax.scatter(x=train["GrLivArea"], y=train["SalePrice"]) plt.ylabel("SalePrice", fontsize=13) plt.xlabel("GrLivArea", fontsize=13) plt.show() train = train.drop( train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index ) fig, ax = plt.subplots() ax.scatter(train["GrLivArea"], train["SalePrice"]) plt.ylabel("SalePrice", fontsize=13) plt.xlabel("GrLivArea", fontsize=13) plt.show() sns.distplot(train["SalePrice"], fit=norm) (mu, sigma) = norm.fit(train["SalePrice"]) print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma)) plt.legend( [r"Normal dist. 
($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)], loc="best", # noqa: W605 ) plt.ylabel("Frequency") plt.title("SalePrice distribution") fig = plt.figure() res = stats.probplot(train["SalePrice"], plot=plt) plt.show() train["SalePrice"] = np.log1p(train["SalePrice"]) sns.distplot(train["SalePrice"], fit=norm) (mu, sigma) = norm.fit(train["SalePrice"]) print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma)) plt.legend( [r"Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)], loc="best", # noqa: W605 ) plt.ylabel("Frequency") plt.title("SalePrice distribution") fig = plt.figure() res = stats.probplot(train["SalePrice"], plot=plt) plt.show() ntrain = train.shape[0] ntest = test.shape[0] y_train = train.SalePrice.values all_data = pd.concat((train, test)).reset_index(drop=True) all_data.drop(["SalePrice"], axis=1, inplace=True) print("all_data size is : {}".format(all_data.shape)) all_data_na = (all_data.isnull().sum() / len(all_data)) * 100 all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values( ascending=False )[:30] missing_data = pd.DataFrame({"Missing Ratio": all_data_na}) missing_data.head(20) corrmat = train.corr() plt.subplots(figsize=(12, 9)) sns.heatmap(corrmat, vmax=0.9, square=True) all_data["PoolQC"] = all_data["PoolQC"].fillna("None") all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None") all_data["Alley"] = all_data["Alley"].fillna("None") all_data["Fence"] = all_data["Fence"].fillna("None") all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None") all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform( lambda x: x.fillna(x.median()) ) for col in ("GarageType", "GarageFinish", "GarageQual", "GarageCond"): all_data[col] = all_data[col].fillna("None") for col in ("GarageYrBlt", "GarageArea", "GarageCars"): all_data[col] = all_data[col].fillna(0) for col in ( "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", ): 
all_data[col] = all_data[col].fillna(0) for col in ("BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"): all_data[col] = all_data[col].fillna("None") all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None") all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0) all_data["MSZoning"] = all_data["MSZoning"].fillna(all_data["MSZoning"].mode()[0]) all_data = all_data.drop(["Utilities"], axis=1) all_data["Functional"] = all_data["Functional"].fillna("Typ") all_data["Electrical"] = all_data["Electrical"].fillna(all_data["Electrical"].mode()[0]) all_data["KitchenQual"] = all_data["KitchenQual"].fillna( all_data["KitchenQual"].mode()[0] ) all_data["Exterior1st"] = all_data["Exterior1st"].fillna( all_data["Exterior1st"].mode()[0] ) all_data["Exterior2nd"] = all_data["Exterior2nd"].fillna( all_data["Exterior2nd"].mode()[0] ) all_data["SaleType"] = all_data["SaleType"].fillna(all_data["SaleType"].mode()[0]) all_data["MSSubClass"] = all_data["MSSubClass"].fillna("None") all_data_na = (all_data.isnull().sum() / len(all_data)) * 100 all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values( ascending=False ) missing_data = pd.DataFrame({"Missing Ratio": all_data_na}) missing_data.head() all_data["MSSubClass"] = all_data["MSSubClass"].apply(str) all_data["OverallCond"] = all_data["OverallCond"].astype(str) all_data["YrSold"] = all_data["YrSold"].astype(str) all_data["MoSold"] = all_data["MoSold"].astype(str) from sklearn.preprocessing import LabelEncoder cols = ( "FireplaceQu", "BsmtQual", "BsmtCond", "GarageQual", "GarageCond", "ExterQual", "ExterCond", "HeatingQC", "PoolQC", "KitchenQual", "BsmtFinType1", "BsmtFinType2", "Functional", "Fence", "BsmtExposure", "GarageFinish", "LandSlope", "LotShape", "PavedDrive", "Street", "Alley", "CentralAir", "MSSubClass", "OverallCond", "YrSold", "MoSold", ) for c in cols: lbl = LabelEncoder() lbl.fit(list(all_data[c].values)) all_data[c] = lbl.transform(list(all_data[c].values)) 
def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    The function reads the module-level ``train`` frame, the ``y_train``
    target array and the ``n_folds`` fold count.

    Parameters
    ----------
    model : estimator
        Any object accepted by ``cross_val_score`` as an estimator.

    Returns
    -------
    numpy.ndarray
        One root-mean-squared-error value per fold.
    """
    # Hand the splitter itself to cross_val_score. Calling ``get_n_splits`` on
    # it would only yield the fold count (an int), and an integer ``cv`` makes
    # cross_val_score build its own splitter, silently discarding
    # ``shuffle=True`` and ``random_state=42``.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf
    )
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse)
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: a meta-model trained on out-of-fold base predictions.

    Parameters
    ----------
    base_models : sequence of estimators
        Prototypes that are cloned and refitted once per fold.
    meta_model : estimator
        Prototype of the model fitted on the out-of-fold predictions.
    n_folds : int, default: 5
        Number of folds used to produce the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.n_folds = n_folds
        self.meta_model = meta_model
        self.base_models = base_models

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model.

        ``X`` and ``y`` are indexed with the integer arrays produced by
        ``KFold.split``.
        """
        n_base = len(self.base_models)
        # One list of fitted fold clones per base model.
        self.base_models_ = [list() for _ in range(n_base)]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column ``col`` holds the held-out predictions of base model ``col``.
        oof = np.zeros((X.shape[0], n_base))
        for col, prototype in enumerate(self.base_models):
            fold_clones = self.base_models_[col]
            for fit_rows, held_rows in splitter.split(X, y):
                fold_model = clone(prototype)
                fold_clones.append(fold_model)
                fold_model.fit(X[fit_rows], y[fit_rows])
                oof[held_rows, col] = fold_model.predict(X[held_rows])

        self.meta_model_.fit(oof, y)
        return self

    def predict(self, X):
        """Average each base model's fold clones and feed that to the meta-model."""
        averaged = []
        for fold_clones in self.base_models_:
            per_fold = np.column_stack([m.predict(X) for m in fold_clones])
            averaged.append(per_fold.mean(axis=1))
        return self.meta_model_.predict(np.column_stack(averaged))
sub.to_csv("submission.csv", index=False) ================================================ FILE: stress_tests/kaggle/kaggle5.py ================================================ import matplotlib matplotlib.use("PS") import matplotlib.pyplot as plt import numpy as np import seaborn as sns from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC, LinearSVC from sklearn.tree import DecisionTreeClassifier import modin.pandas as pd train_df = pd.read_csv("train.csv") test_df = pd.read_csv("test.csv") combine = [train_df, test_df] print(train_df.columns.values) train_df.head() train_df.tail() train_df.info() print("_" * 40) test_df.info() train_df.describe() train_df.describe(include=["O"]) train_df[["Pclass", "Survived"]].groupby(["Pclass"], as_index=False).mean().sort_values( by="Survived", ascending=False ) train_df[["Sex", "Survived"]].groupby(["Sex"], as_index=False).mean().sort_values( by="Survived", ascending=False ) train_df[["SibSp", "Survived"]].groupby(["SibSp"], as_index=False).mean().sort_values( by="Survived", ascending=False ) train_df[["Parch", "Survived"]].groupby(["Parch"], as_index=False).mean().sort_values( by="Survived", ascending=False ) grid = sns.FacetGrid(train_df, col="Survived", row="Pclass", size=2.2, aspect=1.6) grid.map(plt.hist, "Age", alpha=0.5, bins=20) grid.add_legend() grid = sns.FacetGrid(train_df, row="Embarked", size=2.2, aspect=1.6) grid.map(sns.pointplot, "Pclass", "Survived", "Sex", palette="deep") grid.add_legend() grid = sns.FacetGrid(train_df, row="Embarked", col="Survived", size=2.2, aspect=1.6) grid.map(sns.barplot, "Sex", "Fare", alpha=0.5, ci=None) grid.add_legend() print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape) train_df = train_df.drop(["Ticket", "Cabin"], axis=1) test_df = 
def title_mapping(string):
    """Return a random integer code in ``[1, 6)``; ``string`` is ignored.

    The parameter exists only so the function can be handed to
    ``Series.map``, which calls it once per element.
    """
    title_code = np.random.randint(low=1, high=6)
    return title_code
# Replace every passenger's age with an ordinal band code (0-4) in both frames.
# The statements run in order; the codes already written (0-3) are all <= 16,
# so none of the later masks can match a row that was already re-coded.
for dataset in combine:
    dataset.loc[dataset["Age"] <= 16, "Age"] = 0
    dataset.loc[(dataset["Age"] > 16) & (dataset["Age"] <= 32), "Age"] = 1
    dataset.loc[(dataset["Age"] > 32) & (dataset["Age"] <= 48), "Age"] = 2
    dataset.loc[(dataset["Age"] > 48) & (dataset["Age"] <= 64), "Age"] = 3
    # This selection used to be a bare expression with no assignment, which
    # left ages above 64 as raw values next to the 0-3 codes. Assign the top
    # band explicitly.
    dataset.loc[dataset["Age"] > 64, "Age"] = 4
inplace=True) test_df.head() train_df["FareBand"] = pd.qcut(train_df["Fare"], 4) train_df[["FareBand", "Survived"]].groupby( ["FareBand"], as_index=False ).mean().sort_values(by="FareBand", ascending=True) for dataset in combine: dataset.loc[dataset["Fare"] <= 7.91, "Fare"] = 0 dataset.loc[(dataset["Fare"] > 7.91) & (dataset["Fare"] <= 14.454), "Fare"] = 1 dataset.loc[(dataset["Fare"] > 14.454) & (dataset["Fare"] <= 31), "Fare"] = 2 dataset.loc[dataset["Fare"] > 31, "Fare"] = 3 dataset["Fare"] = dataset["Fare"].astype(int) train_df = train_df.drop(["FareBand"], axis=1) combine = [train_df, test_df] train_df.head(10) test_df.head(10) X_train = train_df.drop("Survived", axis=1) Y_train = train_df["Survived"] X_test = test_df.drop("PassengerId", axis=1).copy() X_train.shape, Y_train.shape, X_test.shape logreg = LogisticRegression() logreg.fit(X_train, Y_train) Y_pred = logreg.predict(X_test) acc_log = round(logreg.score(X_train, Y_train) * 100, 2) acc_log coeff_df = pd.DataFrame(train_df.columns.delete(0)) coeff_df.columns = ["Feature"] coeff_df["Correlation"] = pd.Series(logreg.coef_[0]) coeff_df.sort_values(by="Correlation", ascending=False) svc = SVC() svc.fit(X_train, Y_train) Y_pred = svc.predict(X_test) acc_svc = round(svc.score(X_train, Y_train) * 100, 2) acc_svc knn = KNeighborsClassifier(n_neighbors=3) knn.fit(X_train, Y_train) Y_pred = knn.predict(X_test) acc_knn = round(knn.score(X_train, Y_train) * 100, 2) acc_knn gaussian = GaussianNB() gaussian.fit(X_train, Y_train) Y_pred = gaussian.predict(X_test) acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2) acc_gaussian perceptron = Perceptron() perceptron.fit(X_train, Y_train) Y_pred = perceptron.predict(X_test) acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2) acc_perceptron linear_svc = LinearSVC() linear_svc.fit(X_train, Y_train) Y_pred = linear_svc.predict(X_test) acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2) acc_linear_svc sgd = SGDClassifier() 
sgd.fit(X_train, Y_train) Y_pred = sgd.predict(X_test) acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2) acc_sgd decision_tree = DecisionTreeClassifier() decision_tree.fit(X_train, Y_train) Y_pred = decision_tree.predict(X_test) acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2) acc_decision_tree random_forest = RandomForestClassifier(n_estimators=1) random_forest.fit(X_train, Y_train) Y_pred = random_forest.predict(X_test) random_forest.score(X_train, Y_train) acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2) acc_random_forest models = pd.DataFrame( { "Model": [ "Support Vector Machines", "KNN", "Logistic Regression", "Random Forest", "Naive Bayes", "Perceptron", "Stochastic Gradient Decent", "Linear SVC", "Decision Tree", ], "Score": [ acc_svc, acc_knn, acc_log, acc_random_forest, acc_gaussian, acc_perceptron, acc_sgd, acc_linear_svc, acc_decision_tree, ], } ) models.sort_values(by="Score", ascending=False) submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": Y_pred}) ================================================ FILE: stress_tests/kaggle/kaggle6.py ================================================ import matplotlib matplotlib.use("PS") import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns np.random.seed(2) import itertools from keras.callbacks import ReduceLROnPlateau from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPool2D from keras.models import Sequential from keras.optimizers import RMSprop from keras.preprocessing.image import ImageDataGenerator from keras.utils.np_utils import to_categorical # convert to one-hot-encoding from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split sns.set(style="white", context="notebook", palette="deep") train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") Y_train = train["label"] X_train = train.drop(labels=["label"], axis=1) del train g = 
sns.countplot(Y_train) Y_train.value_counts() X_train.isnull().any().describe() test.isnull().any().describe() X_train = X_train / 255.0 test = test / 255.0 X_train = X_train.values.reshape(-1, 28, 28, 1) test = test.values.reshape(-1, 28, 28, 1) Y_train = to_categorical(Y_train, num_classes=10) random_seed = 2 X_train, X_val, Y_train, Y_val = train_test_split( X_train, Y_train, test_size=0.1, random_state=random_seed ) g = plt.imshow(X_train[0][:, :, 0]) model = Sequential() model.add( Conv2D( filters=32, kernel_size=(5, 5), padding="Same", activation="relu", input_shape=(28, 28, 1), ) ) model.add(Conv2D(filters=32, kernel_size=(5, 5), padding="Same", activation="relu")) model.add(MaxPool2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Conv2D(filters=64, kernel_size=(3, 3), padding="Same", activation="relu")) model.add(Conv2D(filters=64, kernel_size=(3, 3), padding="Same", activation="relu")) model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(256, activation="relu")) model.add(Dropout(0.5)) model.add(Dense(10, activation="softmax")) optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0) model.compile( optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"] ) learning_rate_reduction = ReduceLROnPlateau( monitor="val_acc", patience=3, verbose=1, factor=0.5, min_lr=0.00001 ) epochs = 1 # Turn epochs to 30 to get 0.9967 accuracy batch_size = 86 datagen = ImageDataGenerator( featurewise_center=False, # set input mean to 0 over the dataset samplewise_center=False, # set each sample mean to 0 featurewise_std_normalization=False, # divide inputs by std of the dataset samplewise_std_normalization=False, # divide each input by its std zca_whitening=False, # apply ZCA whitening rotation_range=10, # randomly rotate images in the range (degrees, 0 to 180) zoom_range=0.1, # Randomly zoom image width_shift_range=0.1, # randomly shift images horizontally (fraction of total 
def plot_confusion_matrix(
    cm, classes, normalize=False, title="Confusion matrix", cmap=plt.cm.Blues
):
    """
    Plot a confusion matrix as an annotated heatmap on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; it is indexed as ``cm[i, j]`` and summed
        along ``axis=1`` when normalizing.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default: False
        When True every row is divided by its sum before plotting.
    title : str, default: "Confusion matrix"
        Title of the plot.
    cmap : matplotlib colormap, default: plt.cm.Blues
        Colormap passed to ``plt.imshow``.
    """
    # Normalize BEFORE drawing. Doing it after ``imshow`` would leave the
    # heatmap and colorbar showing raw counts while the cell text and the
    # contrast threshold below use the normalized values.
    if normalize:
        cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized cells are fractions; two decimals keep the labels readable.
        # Raw values are passed through unchanged.
        label = format(cm[i, j], ".2f") if normalize else cm[i, j]
        plt.text(
            j,
            i,
            label,
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.tight_layout()
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
def missing_values_table(df):
    """Summarize the columns of ``df`` that contain missing values.

    Prints how many columns ``df`` has and how many of them contain at least
    one null.

    Parameters
    ----------
    df : DataFrame
        Frame to inspect.

    Returns
    -------
    DataFrame
        Indexed by column name, with the columns "Missing Values" (null count)
        and "% of Total Values" (share of rows that are null, rounded to one
        decimal), restricted to columns with a non-zero share and sorted by
        that share in descending order.
    """
    # Count the nulls once and reuse the result for the percentage; every
    # ``isnull().sum()`` is a full pass over the frame.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # The two unnamed Series are concatenated as columns 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values"}
    )

    has_missing = mis_val_table_ren_columns.iloc[:, 1] != 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values("% of Total Values", ascending=False)
        .round(1)
    )

    print(
        "Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are "
        + str(mis_val_table_ren_columns.shape[0])
        + " columns that have missing values."
    )
    return mis_val_table_ren_columns
% le_count) app_train = pd.get_dummies(app_train) app_test = pd.get_dummies(app_test) print("Training Features shape: ", app_train.shape) print("Testing Features shape: ", app_test.shape) train_labels = app_train["TARGET"] app_train, app_test = app_train.align(app_test, join="inner", axis=1) app_train["TARGET"] = train_labels print("Training Features shape: ", app_train.shape) print("Testing Features shape: ", app_test.shape) (app_train["DAYS_BIRTH"] / -365).describe() app_train["DAYS_EMPLOYED"].describe() app_train["DAYS_EMPLOYED"].plot.hist(title="Days Employment Histogram") plt.xlabel("Days Employment") anom = app_train[app_train["DAYS_EMPLOYED"] == 3] non_anom = app_train[app_train["DAYS_EMPLOYED"] != 3] print( "The non-anomalies default on %0.2f%% of loans" % (100 * non_anom["TARGET"].mean()) ) print("The anomalies default on %0.2f%% of loans" % (100 * anom["TARGET"].mean())) print("There are %d anomalous days of employment" % len(anom)) app_train["DAYS_EMPLOYED_ANOM"] = app_train["DAYS_EMPLOYED"] == 3 app_train["DAYS_EMPLOYED"].replace({3: np.nan}, inplace=True) app_train["DAYS_EMPLOYED"].plot.hist(title="Days Employment Histogram") plt.xlabel("Days Employment") app_test["DAYS_EMPLOYED_ANOM"] = app_test["DAYS_EMPLOYED"] == 3 app_test["DAYS_EMPLOYED"].replace({3: np.nan}, inplace=True) print( "There are %d anomalies in the test data out of %d entries" % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)) ) correlations = app_train.corr()["TARGET"].sort_values() print("Most Positive Correlations:\n", correlations.tail(15)) print("\nMost Negative Correlations:\n", correlations.head(15)) app_train["DAYS_BIRTH"] = abs(app_train["DAYS_BIRTH"]) app_train["DAYS_BIRTH"].corr(app_train["TARGET"]) plt.style.use("fivethirtyeight") plt.hist(app_train["DAYS_BIRTH"] / 365, edgecolor="k", bins=25) plt.title("Age of Client") plt.xlabel("Age (years)") plt.ylabel("Count") plt.figure(figsize=(10, 8)) # plt.xlabel("Age (years)") plt.ylabel("Density") plt.title("Distribution of 
def corr_func(x, y, **kwargs):
    """Write the correlation coefficient of ``x`` and ``y`` onto the current axes.

    Extra keyword arguments are accepted and ignored.
    """
    # Off-diagonal entry of the 2x2 matrix returned by np.corrcoef.
    coefficient = np.corrcoef(x, y)[0, 1]
    axes = plt.gca()
    label = "r = {:.2f}".format(coefficient)
    # Position is given in axes-fraction coordinates via ``transAxes``.
    axes.annotate(label, xy=(0.2, 0.8), xycoords=axes.transAxes, size=20)
poly_target poly_corrs = poly_features.corr()["TARGET"].sort_values() print(poly_corrs.head(10)) print(poly_corrs.tail(5)) poly_features_test = pd.DataFrame( poly_features_test, columns=poly_transformer.get_feature_names( ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "DAYS_BIRTH"] ), ) poly_features["SK_ID_CURR"] = app_train["SK_ID_CURR"] app_train_poly = app_train.merge(poly_features, on="SK_ID_CURR", how="left") poly_features_test["SK_ID_CURR"] = app_test["SK_ID_CURR"] app_test_poly = app_test.merge(poly_features_test, on="SK_ID_CURR", how="left") app_train_poly, app_test_poly = app_train_poly.align( app_test_poly, join="inner", axis=1 ) print("Training data with polynomial features shape: ", app_train_poly.shape) print("Testing data with polynomial features shape: ", app_test_poly.shape) app_train_domain = app_train.copy() app_test_domain = app_test.copy() app_train_domain["CREDIT_INCOME_PERCENT"] = ( app_train_domain["AMT_CREDIT"] / app_train_domain["AMT_INCOME_TOTAL"] ) app_train_domain["ANNUITY_INCOME_PERCENT"] = ( app_train_domain["AMT_ANNUITY"] / app_train_domain["AMT_INCOME_TOTAL"] ) app_train_domain["CREDIT_TERM"] = ( app_train_domain["AMT_ANNUITY"] / app_train_domain["AMT_CREDIT"] ) app_train_domain["DAYS_EMPLOYED_PERCENT"] = ( app_train_domain["DAYS_EMPLOYED"] / app_train_domain["DAYS_BIRTH"] ) app_test_domain["CREDIT_INCOME_PERCENT"] = ( app_test_domain["AMT_CREDIT"] / app_test_domain["AMT_INCOME_TOTAL"] ) app_test_domain["ANNUITY_INCOME_PERCENT"] = ( app_test_domain["AMT_ANNUITY"] / app_test_domain["AMT_INCOME_TOTAL"] ) app_test_domain["CREDIT_TERM"] = ( app_test_domain["AMT_ANNUITY"] / app_test_domain["AMT_CREDIT"] ) app_test_domain["DAYS_EMPLOYED_PERCENT"] = ( app_test_domain["DAYS_EMPLOYED"] / app_test_domain["DAYS_BIRTH"] ) from sklearn.preprocessing import Imputer, MinMaxScaler if "TARGET" in app_train.columns: train = app_train.drop(columns=["TARGET"]) # TODO (williamma12): Not sure why this line is necessary but it is app_test = 
app_test.drop(columns=["TARGET"]) else: train = app_train.copy() features = list(train.columns) test = app_test.copy() imputer = Imputer(strategy="median") scaler = MinMaxScaler(feature_range=(0, 1)) imputer.fit(train) train = imputer.transform(train) test = imputer.transform(app_test) scaler.fit(train) train = scaler.transform(train) test = scaler.transform(test) print("Training data shape: ", train.shape) print("Testing data shape: ", test.shape) from sklearn.linear_model import LogisticRegression log_reg = LogisticRegression(C=0.0001) log_reg.fit(train, train_labels) log_reg_pred = log_reg.predict_proba(test)[:, 1] submit = app_test[["SK_ID_CURR"]] submit["TARGET"] = log_reg_pred submit.head() submit.to_csv("log_reg_baseline.csv", index=False) from sklearn.ensemble import RandomForestClassifier random_forest = RandomForestClassifier( n_estimators=100, random_state=50, verbose=1, n_jobs=-1 ) random_forest.fit(train, train_labels) feature_importance_values = random_forest.feature_importances_ feature_importances = pd.DataFrame( {"feature": features, "importance": feature_importance_values} ) predictions = random_forest.predict_proba(test)[:, 1] submit = app_test[["SK_ID_CURR"]] submit["TARGET"] = predictions submit.to_csv("random_forest_baseline.csv", index=False) poly_features_names = list(app_train_poly.columns) imputer = Imputer(strategy="median") poly_features = imputer.fit_transform(app_train_poly) poly_features_test = imputer.transform(app_test_poly) scaler = MinMaxScaler(feature_range=(0, 1)) poly_features = scaler.fit_transform(poly_features) poly_features_test = scaler.transform(poly_features_test) random_forest_poly = RandomForestClassifier( n_estimators=100, random_state=50, verbose=1, n_jobs=-1 ) random_forest_poly.fit(poly_features, train_labels) predictions = random_forest_poly.predict_proba(poly_features_test)[:, 1] submit = app_test[["SK_ID_CURR"]] submit["TARGET"] = predictions submit.to_csv("random_forest_baseline_engineered.csv", index=False) 
def model(features, test_features, encoding="ohe", n_folds=5):
    """Train a LightGBM classifier with K-fold cross validation.

    Parameters
    ----------
    features : DataFrame
        Training data; must contain the "SK_ID_CURR" and "TARGET" columns.
    test_features : DataFrame
        Test data; must contain the "SK_ID_CURR" column.
    encoding : {"ohe", "le"}, default: "ohe"
        How categorical columns are encoded: "ohe" uses ``pd.get_dummies`` and
        keeps only the columns common to both frames; "le" label-encodes the
        object-dtype columns.
    n_folds : int, default: 5
        Number of cross-validation folds.

    Returns
    -------
    submission : DataFrame
        Columns "SK_ID_CURR" and "TARGET" (test predictions averaged over folds).
    feature_importances : DataFrame
        Columns "feature" and "importance" (importances averaged over folds).
    metrics : DataFrame
        Columns "fold", "train" and "valid": one row per fold plus an
        "overall" row.

    Raises
    ------
    ValueError
        If ``encoding`` is neither "ohe" nor "le".
    """
    # Pull out the identifiers and the target before they are dropped below.
    test_ids = test_features["SK_ID_CURR"]
    labels = features["TARGET"]
    features = features.drop(columns=["SK_ID_CURR", "TARGET"])
    test_features = test_features.drop(columns=["SK_ID_CURR"])
    if encoding == "ohe":
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        # Inner join on the column axis keeps only columns present in both frames.
        features, test_features = features.align(test_features, join="inner", axis=1)
        cat_indices = "auto"
    elif encoding == "le":
        label_encoder = LabelEncoder()
        # Positional indices of the label-encoded columns.
        cat_indices = []
        for i, col in enumerate(features):
            if features[col].dtype == "object":
                # Fit on the training column, then reuse the same mapping for test.
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1,))
                )
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1,))
                )
                cat_indices.append(i)
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
    print("Training Data Shape: ", features.shape)
    print("Testing Data Shape: ", test_features.shape)
    # Column names are captured before the frames are converted to arrays.
    feature_names = list(features.columns)
    features = np.array(features)
    test_features = np.array(test_features)
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)
    # Accumulators filled across folds.
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(features.shape[0])
    valid_scores = []
    train_scores = []
    for train_indices, valid_indices in k_fold.split(features):
        # NOTE(review): ``labels`` is still whatever ``features["TARGET"]``
        # returned, indexed here with positional fold indices; presumably this
        # relies on a default integer index. Confirm against callers.
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        # This local name shadows the enclosing function inside its own body.
        model = lgb.LGBMClassifier(
            n_estimators=10000,
            objective="binary",
            class_weight="balanced",
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.1,
            subsample=0.8,
            n_jobs=-1,
            random_state=50,
        )
        model.fit(
            train_features,
            train_labels,
            eval_metric="auc",
            eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
            eval_names=["valid", "train"],
            categorical_feature=cat_indices,
            early_stopping_rounds=100,
            verbose=200,
        )
        best_iteration = model.best_iteration_
        # Dividing each fold's contribution by the fold count yields the mean.
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        test_predictions += (
            model.predict_proba(test_features, num_iteration=best_iteration)[:, 1]
            / k_fold.n_splits
        )
        # Each row's prediction comes from the fold in which it was held out.
        out_of_fold[valid_indices] = model.predict_proba(
            valid_features, num_iteration=best_iteration
        )[:, 1]
        valid_score = model.best_score_["valid"]["auc"]
        train_score = model.best_score_["train"]["auc"]
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        # Drop this fold's estimator and arrays before the next iteration.
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
    submission = pd.DataFrame({"SK_ID_CURR": test_ids, "TARGET": test_predictions})
    feature_importances = pd.DataFrame(
        {"feature": feature_names, "importance": feature_importance_values}
    )
    # "overall" row: AUC over all out-of-fold predictions for validation and
    # the mean of the per-fold scores for training.
    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    fold_names = list(range(n_folds))
    fold_names.append("overall")
    metrics = pd.DataFrame(
        {"fold": fold_names, "train": train_scores, "valid": valid_scores}
    )
    return submission, feature_importances, metrics
print(predicted_prices) my_submission = pd.DataFrame({"Id": test.Id, "SalePrice": predicted_prices}) my_submission.to_csv("submission.csv", index=False) ================================================ FILE: stress_tests/kaggle/kaggle9.py ================================================ import matplotlib matplotlib.use("PS") import matplotlib import matplotlib.pyplot as plt import numpy as np from scipy.stats import skew import modin.pandas as pd train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") train.head() all_data = pd.concat( ( train.loc[:, "MSSubClass":"SaleCondition"], test.loc[:, "MSSubClass":"SaleCondition"], ) ) matplotlib.rcParams["figure.figsize"] = (12.0, 6.0) prices = pd.DataFrame( {"price": train["SalePrice"], "log(price + 1)": np.log1p(train["SalePrice"])} ) prices.hist() train["SalePrice"] = np.log1p(train["SalePrice"]) numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index skewed_feats = train[numeric_feats].apply( lambda x: skew(x.dropna()) ) # compute skewness skewed_feats = skewed_feats[skewed_feats > 0.75] skewed_feats = skewed_feats.index all_data[skewed_feats] = np.log1p(all_data[skewed_feats]) all_data = pd.get_dummies(all_data) all_data = all_data.fillna(all_data.mean()) X_train = all_data[: train.shape[0]] X_test = all_data[train.shape[0] :] y = train.SalePrice from sklearn.linear_model import LassoCV # RidgeCV, ElasticNet, LassoLarsCV from sklearn.linear_model import Ridge from sklearn.model_selection import cross_val_score def rmse_cv(model): rmse = np.sqrt( -cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5) ) return rmse model_ridge = Ridge() alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75] cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas] cv_ridge = pd.Series(cv_ridge, index=alphas) cv_ridge.plot(title="Validation - Just Do It") plt.xlabel("alpha") plt.ylabel("rmse") cv_ridge.min() model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y) 
rmse_cv(model_lasso).mean() coef = pd.Series(model_lasso.coef_, index=X_train.columns) print( "Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables" ) imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)]) matplotlib.rcParams["figure.figsize"] = (8.0, 10.0) imp_coef.plot(kind="barh") plt.title("Coefficients in the Lasso Model") matplotlib.rcParams["figure.figsize"] = (6.0, 6.0) preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y}) preds["residuals"] = preds["true"] - preds["preds"] preds.plot(x="preds", y="residuals", kind="scatter") import xgboost as xgb dtrain = xgb.DMatrix(X_train, label=y) dtest = xgb.DMatrix(X_test) params = {"max_depth": 2, "eta": 0.1} model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100) model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot() model_xgb = xgb.XGBRegressor( n_estimators=360, max_depth=2, learning_rate=0.1 ) # the params were tuned using xgb.cv model_xgb.fit(X_train, y) xgb_preds = np.expm1(model_xgb.predict(X_test)) lasso_preds = np.expm1(model_lasso.predict(X_test)) predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds}) predictions.plot(x="xgb", y="lasso", kind="scatter") preds = 0.7 * lasso_preds + 0.3 * xgb_preds solution = pd.DataFrame({"id": test.Id, "SalePrice": preds}) solution.to_csv("ridge_sol.csv", index=False) from keras.layers import Dense from keras.models import Sequential from keras.regularizers import l1 from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler X_train = StandardScaler().fit_transform(X_train) X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, random_state=3) X_tr.shape X_tr model = Sequential() model.add(Dense(1, input_dim=X_train.shape[1], W_regularizer=l1(0.001))) model.compile(loss="mse", optimizer="adam") model.summary() hist = model.fit(X_tr, y_tr, validation_data=(X_val, y_val)) 
pd.Series(model.predict(X_val)[:, 0]).hist() ================================================ FILE: stress_tests/run_stress_tests.sh ================================================ #!/usr/bin/env bash # Show explicitly which commands are currently running. set -x # TODO (williamma12): Once we use clusters, make sure to download latest wheels # from s3 bucket instead of building ray # Ray directory RAY_DIR=${1} ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) RESULT_FILE=$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log echo "Logging to" $RESULT_FILE touch $RESULT_FILE setup_environment(){ pushd "$ROOT_DIR" # Create a virtual environment for the stress tests python -m virtualenv stress_tests_env >> $RESULT_FILE source stress_tests_env/bin/activate >> $RESULT_FILE # Install ray from source if available if [[ ! -z "$RAY_DIR" ]]; then pushd "$RAY_DIR" pip install -e . --verbose >> $RESULT_FILE popd fi # Install modin from source to virtual environment pushd "$ROOT_DIR/.." pip install -e . >> $RESULT_FILE popd # Install basic data science packages pip install matplotlib numpy seaborn scipy >> $RESULT_FILE # Install machine learning packages pip install scikit-learn xgboost lightgbm keras >> $RESULT_FILE # Install packages for kaggle18 pip install nltk wordcloud plotly bokeh pyLDAvis >> $RESULT_FILE popd } teardown_environment(){ pushd "$ROOT_DIR" rm -rf stress_tests_env >> $RESULT_FILE popd } run_test(){ local test_name=$1 echo "Try running $test_name." { pytest -vls "$test_name.py" >> $RESULT_FILE } || echo "FAIL: $test_name" >> $RESULT_FILE } pushd "$ROOT_DIR" setup_environment run_test test_kaggle_ipynb teardown_environment popd cat $RESULT_FILE [ ! 
def create_dataframe(columns, dtypes, size):
    """Build a deterministic synthetic dataframe with one column per dtype.

    Args:
        columns (list): Column names of the result.
        dtypes (list): Dtype for the corresponding column. ``str`` produces
            spreadsheet-style labels ("A", ..., "Z", "AA", ...), ``bool``
            produces alternating True/False starting with True, and any other
            dtype is handed to ``np.arange``.
        size (int or float): Number of rows; passed straight to ``np.arange``.

    Returns:
        Dataframe (built with the module's ``pd``) holding the synthetic data.
    """

    def _num_to_str(x):
        """Return the base-26 letter label for ``x`` >= 1, skipping labels that contain "NA"."""
        # Search forward from x instead of recursing: the previous recursive
        # retry used the loop variable after it had been consumed (always 0),
        # so every label containing "NA" silently collapsed to "A".
        candidate = int(x)
        while True:
            letters = ""
            remaining = candidate
            while remaining:
                remaining, mod = divmod(remaining - 1, 26)
                letters += chr(mod + 65)
            result = "".join(reversed(letters))
            # NOTE(review): "NA" is presumably avoided because CSV readers
            # parse it as a missing value -- confirm against the consumers.
            if "NA" not in result:
                return result
            candidate += 1

    result_dict = {}
    for col, dtype in zip(columns, dtypes):
        if dtype is str:
            result_dict[col] = [_num_to_str(x + 1) for x in np.arange(size, dtype=int)]
        elif dtype is bool:
            result_dict[col] = [x % 2 == 0 for x in np.arange(size, dtype=int)]
        else:
            result_dict[col] = np.arange(size, dtype=dtype)
    return pd.DataFrame(result_dict)
""" # Record of files generated for a test filenames = [] def _dataset_builder(filename, columns, dtypes, size=DF_SIZE, files_to_remove=[]): # Add the files generated by the script to be removed for file in files_to_remove: filenames.append("{}/{}".format(KAGGLE_DIR_PATH, file)) # Update filename to include path filename = "{}/{}".format(KAGGLE_DIR_PATH, filename) # Check that the number of column names is the same as the nubmer of dtypes if len(columns) != len(dtypes): raise ValueError("len(columns) != len(dtypes)") # Determine number of rows for synthetic dataset row_size = ( create_dataframe(columns, dtypes, 1) .memory_usage(index=False, deep=True) .sum() ) result = create_dataframe(columns, dtypes, np.ceil(size / row_size)) result.to_csv(filename) filenames.append(filename) return result # Return dataset builder factory yield _dataset_builder # Delete files created for filename in filenames: if os.path.exists(filename): os.remove(filename) def test_kaggle3(generate_dataset): pokemon_columns = [ "#", "Name", "Type 1", "Type 2", "HP", "Attack", "Defense", "Sp. Atk", "Sp. 
def test_kaggle4(generate_dataset):
    """Run ``kaggle4.py`` against synthetic house-price CSVs and expect exit code 0.

    ``train.csv`` and ``test.csv`` are generated (in that order) with the same
    schema through the ``generate_dataset`` fixture, then the script is run as
    a subprocess inside ``KAGGLE_DIR_PATH``.
    """
    # (column name, dtype) pairs, in file order.
    schema = [
        ("Id", int),
        ("MSSubClass", int),
        ("MSZoning", str),
        ("LotFrontage", float),
        ("LotArea", int),
        ("Street", str),
        ("Alley", float),
        ("LotShape", str),
        ("LandContour", str),
        ("Utilities", str),
        ("LotConfig", str),
        ("LandSlope", str),
        ("Neighborhood", str),
        ("Condition1", str),
        ("Condition2", str),
        ("BldgType", str),
        ("HouseStyle", str),
        ("OverallQual", int),
        ("OverallCond", int),
        ("YearBuilt", int),
        ("YearRemodAdd", int),
        ("RoofStyle", str),
        ("RoofMatl", str),
        ("Exterior1st", str),
        ("Exterior2nd", str),
        ("MasVnrType", str),
        ("MasVnrArea", float),
        ("ExterQual", str),
        ("ExterCond", str),
        ("Foundation", str),
        ("BsmtQual", str),
        ("BsmtCond", str),
        ("BsmtExposure", str),
        ("BsmtFinType1", str),
        ("BsmtFinSF1", int),
        ("BsmtFinType2", str),
        ("BsmtFinSF2", int),
        ("BsmtUnfSF", int),
        ("TotalBsmtSF", int),
        ("Heating", str),
        ("HeatingQC", str),
        ("CentralAir", str),
        ("Electrical", str),
        ("1stFlrSF", int),
        ("2ndFlrSF", int),
        ("LowQualFinSF", int),
        ("GrLivArea", int),
        ("BsmtFullBath", int),
        ("BsmtHalfBath", int),
        ("FullBath", int),
        ("HalfBath", int),
        ("BedroomAbvGr", int),
        ("KitchenAbvGr", int),
        ("KitchenQual", str),
        ("TotRmsAbvGrd", int),
        ("Functional", str),
        ("Fireplaces", int),
        ("FireplaceQu", float),
        ("GarageType", str),
        ("GarageYrBlt", float),
        ("GarageFinish", str),
        ("GarageCars", int),
        ("GarageArea", int),
        ("GarageQual", str),
        ("GarageCond", str),
        ("PavedDrive", str),
        ("WoodDeckSF", int),
        ("OpenPorchSF", int),
        ("EnclosedPorch", int),
        ("3SsnPorch", int),
        ("ScreenPorch", int),
        ("PoolArea", int),
        ("PoolQC", float),
        ("Fence", float),
        ("MiscFeature", float),
        ("MiscVal", int),
        ("MoSold", int),
        ("YrSold", int),
        ("SaleType", str),
        ("SaleCondition", str),
        ("SalePrice", int),
    ]
    names = [name for name, _ in schema]
    kinds = [kind for _, kind in schema]

    for csv_name in ("train.csv", "test.csv"):
        generate_dataset(csv_name, names, kinds)

    script = subprocess.Popen(
        ["python", "kaggle4.py"],
        cwd=KAGGLE_DIR_PATH,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, stderr_bytes = script.communicate()
    exit_code = script.returncode
    if exit_code != 0:
        logging.debug("Error message\n-------------\n %s", stderr_bytes.decode("utf-8"))
    logging.info("Finished kaggle4")
    assert exit_code == 0
"NAME_HOUSING_TYPE", "REGION_POPULATION_RELATIVE", "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "OWN_CAR_AGE", "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL", "OCCUPATION_TYPE", "CNT_FAM_MEMBERS", "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY", "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION", "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY", "LIVE_CITY_NOT_WORK_CITY", "ORGANIZATION_TYPE", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "APARTMENTS_AVG", "BASEMENTAREA_AVG", "YEARS_BEGINEXPLUATATION_AVG", "YEARS_BUILD_AVG", "COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG", "FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG", "LIVINGAPARTMENTS_AVG", "LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAREA_AVG", "APARTMENTS_MODE", "BASEMENTAREA_MODE", "YEARS_BEGINEXPLUATATION_MODE", "YEARS_BUILD_MODE", "COMMONAREA_MODE", "ELEVATORS_MODE", "ENTRANCES_MODE", "FLOORSMAX_MODE", "FLOORSMIN_MODE", "LANDAREA_MODE", "LIVINGAPARTMENTS_MODE", "LIVINGAREA_MODE", "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAREA_MODE", "APARTMENTS_MEDI", "BASEMENTAREA_MEDI", "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BUILD_MEDI", "COMMONAREA_MEDI", "ELEVATORS_MEDI", "ENTRANCES_MEDI", "FLOORSMAX_MEDI", "FLOORSMIN_MEDI", "LANDAREA_MEDI", "LIVINGAPARTMENTS_MEDI", "LIVINGAREA_MEDI", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAREA_MEDI", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "TOTALAREA_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE", "OBS_30_CNT_SOCIAL_CIRCLE", "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE", "DAYS_LAST_PHONE_CHANGE", "FLAG_DOCUMENT_2", "FLAG_DOCUMENT_3", "FLAG_DOCUMENT_4", "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_6", "FLAG_DOCUMENT_7", "FLAG_DOCUMENT_8", "FLAG_DOCUMENT_9", "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11", "FLAG_DOCUMENT_12", "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", 
"FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16", "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18", "FLAG_DOCUMENT_19", "FLAG_DOCUMENT_20", "FLAG_DOCUMENT_21", "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY", "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON", "AMT_REQ_CREDIT_BUREAU_QRT", "AMT_REQ_CREDIT_BUREAU_YEAR", ] dtypes = [ int, int, str, str, str, str, int, float, float, float, float, str, str, str, str, str, float, int, int, float, int, float, int, int, int, int, int, int, str, float, int, int, str, int, int, int, int, int, int, int, str, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, str, str, float, str, str, float, float, float, float, float, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float, float, float, float, float, float, ] generate_dataset( "application_train.csv", columns, dtypes, files_to_remove=[ "log_reg_baseline.csv", "random_forest_baseline.csv", "random_forest_baseline_engineered.csv", "random_forest_baseline_domain.csv", "baseline_lgb.csv", "baseline_lgb_domain_features.csv", ], ) generate_dataset("application_test.csv", columns, dtypes) ipynb = subprocess.Popen( ["python", "kaggle7.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=KAGGLE_DIR_PATH, ) outs, errs = ipynb.communicate() if ipynb.returncode: logging.debug("Error message\n-------------\n %s", errs.decode("utf-8")) logging.info("Finished kaggle7") assert ipynb.returncode == 0 def test_kaggle8(generate_dataset): columns = [ "Id", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", 
"OverallCond", "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice", ] dtypes = [ int, int, str, float, int, str, float, str, str, str, str, str, str, str, str, str, str, int, int, int, int, str, str, str, str, str, float, str, str, str, str, str, str, str, float, str, float, float, float, str, str, str, str, int, int, int, int, float, float, int, int, int, int, str, int, str, int, float, str, float, str, float, float, str, str, str, int, int, int, int, int, int, float, str, float, int, int, int, str, str, int, ] generate_dataset("test.csv", columns, dtypes, files_to_remove=["submission.csv"]) generate_dataset("train.csv", columns, dtypes) ipynb = subprocess.Popen( ["python", "kaggle8.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=KAGGLE_DIR_PATH, ) outs, errs = ipynb.communicate() if ipynb.returncode: logging.debug("Error message\n-------------\n %s", errs.decode("utf-8")) logging.info("Finished kaggle8") assert ipynb.returncode == 0 def test_kaggle9(generate_dataset): columns = [ "Id", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", 
"BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC", "Fence", "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice", ] dtypes = [ int, int, str, float, int, str, float, str, str, str, str, str, str, str, str, str, str, int, int, int, int, str, str, str, str, str, float, str, str, str, str, str, str, str, int, str, int, int, int, str, str, str, str, int, int, int, int, int, int, int, int, int, int, str, int, str, int, float, str, float, str, int, int, str, str, str, int, int, int, int, int, int, float, float, float, int, int, int, str, str, int, ] generate_dataset("test.csv", columns, dtypes, files_to_remove=["ridge_sol.csv"]) generate_dataset("train.csv", columns, dtypes) ipynb = subprocess.Popen( ["python", "kaggle9.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=KAGGLE_DIR_PATH, ) outs, errs = ipynb.communicate() if ipynb.returncode: logging.debug("Error message\n-------------\n %s", errs.decode("utf-8")) logging.info("Finished kaggle9") assert ipynb.returncode == 0 def test_kaggle10(generate_dataset): columns = [ "pelvic_incidence", "pelvic_tilt numeric", "lumbar_lordosis_angle", "sacral_slope", "pelvic_radius", "degree_spondylolisthesis", "class", ] dtypes = [float, float, 
def test_kaggle12(generate_dataset):
    """Run ``kaggle12.py`` against synthetic Titanic-style CSVs and expect exit code 0.

    ``train.csv`` is generated first (registering the script's
    ``ensemble_python_voting.csv`` output with the fixture), then ``test.csv``
    with the same schema; the script runs as a subprocess in ``KAGGLE_DIR_PATH``.
    """
    # (column name, dtype) pairs, in file order.
    schema = [
        ("PassengerId", int),
        ("Survived", int),
        ("Pclass", int),
        ("Name", str),
        ("Sex", str),
        ("Age", float),
        ("SibSp", int),
        ("Parch", int),
        ("Ticket", str),
        ("Fare", float),
        ("Cabin", float),
        ("Embarked", str),
    ]
    names = [name for name, _ in schema]
    kinds = [kind for _, kind in schema]

    generate_dataset(
        "train.csv", names, kinds, files_to_remove=["ensemble_python_voting.csv"]
    )
    generate_dataset("test.csv", names, kinds)

    script = subprocess.Popen(
        ["python", "kaggle12.py"],
        cwd=KAGGLE_DIR_PATH,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, stderr_bytes = script.communicate()
    exit_code = script.returncode
    if exit_code != 0:
        logging.debug("Error message\n-------------\n %s", stderr_bytes.decode("utf-8"))
    logging.info("Finished kaggle12")
    assert exit_code == 0
def test_kaggle17(generate_dataset):
    """Run ``kaggle17.py`` against a synthetic ``melb_data.csv`` and expect exit code 0.

    The dataset is produced through the ``generate_dataset`` fixture and the
    script is run as a subprocess inside ``KAGGLE_DIR_PATH``.
    """
    # (column name, dtype) pairs, in file order.
    schema = [
        ("Suburb", str),
        ("Address", str),
        ("Rooms", int),
        ("Type", str),
        ("Price", float),
        ("Method", str),
        ("SellerG", str),
        ("Date", str),
        ("Distance", float),
        ("Postcode", float),
        ("Bedroom2", float),
        ("Bathroom", float),
        ("Car", float),
        ("Landsize", float),
        ("BuildingArea", float),
        ("YearBuilt", float),
        ("CouncilArea", str),
        ("Lattitude", float),
        ("Longtitude", float),
        ("Regionname", str),
        ("Propertycount", float),
    ]
    names = [name for name, _ in schema]
    kinds = [kind for _, kind in schema]

    generate_dataset("melb_data.csv", names, kinds)

    script = subprocess.Popen(
        ["python", "kaggle17.py"],
        cwd=KAGGLE_DIR_PATH,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, stderr_bytes = script.communicate()
    exit_code = script.returncode
    if exit_code != 0:
        logging.debug("Error message\n-------------\n %s", stderr_bytes.decode("utf-8"))
    logging.info("Finished kaggle17")
    assert exit_code == 0
"damageDealt", "DBNOs", "headshotKills", "heals", "killPlace", "killPoints", "kills", "killStreaks", "longestKill", "matchDuration", "matchType", "maxPlace", "numGroups", "rankPoints", "revives", "rideDistance", "roadKills", "swimDistance", "teamKills", "vehicleDestroys", "walkDistance", "weaponsAcquired", "winPoints", "winPlacePerc", ] dtypes = [ str, str, str, int, int, float, int, int, int, int, int, int, int, float, int, str, int, int, int, int, float, int, float, int, int, float, int, int, int, ] generate_dataset("train.csv", columns, dtypes) ipynb = subprocess.Popen( ["python", "kaggle19.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=KAGGLE_DIR_PATH, ) outs, errs = ipynb.communicate() if ipynb.returncode: logging.debug("Error message\n-------------\n %s", errs.decode("utf-8")) logging.info("Finished kaggle19") assert ipynb.returncode == 0 def test_kaggle20(generate_dataset): columns = [ "id", "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst", "Unnamed: 32", ] dtypes = [ int, str, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ] generate_dataset("data.csv", columns, dtypes) ipynb = subprocess.Popen( ["python", "kaggle20.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=KAGGLE_DIR_PATH, ) outs, errs = ipynb.communicate() if ipynb.returncode: logging.debug("Error 
def test_kaggle22(generate_dataset):
    """Run ``kaggle22.py`` against synthetic toxic-comment CSVs and expect exit code 0.

    Three files are generated through the ``generate_dataset`` fixture, in
    this order: ``train.csv`` (also registering the script's ``submission.csv``
    output with the fixture), ``test.csv`` and ``sample_submission.csv``. The
    script is then run as a subprocess inside ``KAGGLE_DIR_PATH``.
    """
    # Label columns shared by the training data and the sample submission.
    label_names = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    label_kinds = [float] * len(label_names)

    train_names = ["id", "comment_text"] + label_names
    train_kinds = [str, str] + label_kinds
    test_names = ["id", "comment_text"]
    test_kinds = [str, str]
    submission_names = ["id"] + label_names
    submission_kinds = [str] + label_kinds

    generate_dataset(
        "train.csv", train_names, train_kinds, files_to_remove=["submission.csv"]
    )
    generate_dataset("test.csv", test_names, test_kinds)
    generate_dataset("sample_submission.csv", submission_names, submission_kinds)

    script = subprocess.Popen(
        ["python", "kaggle22.py"],
        cwd=KAGGLE_DIR_PATH,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, stderr_bytes = script.communicate()
    exit_code = script.returncode
    if exit_code != 0:
        logging.debug("Error message\n-------------\n %s", stderr_bytes.decode("utf-8"))
    logging.info("Finished kaggle22")
    assert exit_code == 0
## Quick Install Versioneer provides two installation modes. The "classic" vendored mode installs a copy of versioneer into your repository. The experimental build-time dependency mode is intended to allow you to skip this step and simplify the process of upgrading. ### Vendored mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * Note that you will need to add `tomli; python_version < "3.11"` to your build-time dependencies if you use `pyproject.toml` * run `versioneer install --vendor` in your source tree, commit the results * verify version information with `python setup.py version` ### Build-time dependency mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`) to the `requires` key of the `build-system` table in `pyproject.toml`: ```toml [build-system] requires = ["setuptools", "versioneer[toml]"] build-backend = "setuptools.build_meta" ``` * run `versioneer install --no-vendor` in your source tree, commit the results * verify version information with `python setup.py version` ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", 
distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes). The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. 
To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. 
Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. 
More can be found on the GitHub [issues page](https://github.com/python-versioneer/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However, `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases.
### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the Python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This occurs when the package is installed with one version checked out, and the egg_info data is then regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg` and `pyproject.toml`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. * re-run `versioneer install --[no-]vendor` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to be easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py .
In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## Similar projects * [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time dependency * [miniver](https://github.com/jbweston/miniver) - a lightweight reimplementation of versioneer * [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools plugin ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the "Unlicense", as described in https://unlicense.org/. [pypi-image]: https://img.shields.io/pypi/v/versioneer.svg [pypi-url]: https://pypi.python.org/pypi/versioneer/ [travis-image]: https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg [travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer """ # pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring # pylint:disable=missing-class-docstring,too-many-branches,too-many-statements # pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error # pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with # pylint:disable=attribute-defined-outside-init,too-many-arguments import configparser import errno import functools import json import os import re import subprocess import sys from pathlib import Path from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Union, cast have_tomllib = True if sys.version_info >= (3, 11): import tomllib else: try: import tomli as tomllib except ImportError:
have_tomllib = False class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str versionfile_source: str versionfile_build: Optional[str] parentdir_prefix: Optional[str] verbose: Optional[bool] def get_root() -> str: """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . """ root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): err = ( "Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND')." ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. 
my_path = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): print( "Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(my_path), versioneer_py) ) except NameError: pass return root def get_config_from_root(root: str) -> VersioneerConfig: """Read the project setup.cfg file to determine Versioneer config.""" # This might raise OSError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . root_pth = Path(root) pyproject_toml = root_pth / "pyproject.toml" setup_cfg = root_pth / "setup.cfg" section: Union[Dict[str, Any], configparser.SectionProxy, None] = None if pyproject_toml.exists() and have_tomllib: try: with open(pyproject_toml, "rb") as fobj: pp = tomllib.load(fobj) section = pp["tool"]["versioneer"] except (tomllib.TOMLDecodeError, KeyError) as e: print(f"Failed to load config from {pyproject_toml}: {e}") print("Try to load it from setup.cfg") if not section: parser = configparser.ConfigParser() with open(setup_cfg) as cfg_file: parser.read_file(cfg_file) parser.get("versioneer", "VCS") # raise error if missing section = parser["versioneer"] # `cast`` really shouldn't be used, but its simplest for the # common VersioneerConfig users at the moment. 
We verify against # `None` values elsewhere where it matters cfg = VersioneerConfig() cfg.VCS = section["VCS"] cfg.style = section.get("style", "") cfg.versionfile_source = cast(str, section.get("versionfile_source")) cfg.versionfile_build = section.get("versionfile_build") cfg.tag_prefix = cast(str, section.get("tag_prefix")) if cfg.tag_prefix in ("''", '""', None): cfg.tag_prefix = "" cfg.parentdir_prefix = section.get("parentdir_prefix") if isinstance(section, configparser.SectionProxy): # Make sure configparser translates to bool cfg.verbose = section.getboolean("verbose") else: cfg.verbose = section.get("verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen( [command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs, ) break 
except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, process.returncode return stdout, process.returncode LONG_VERSION_PY[ "git" ] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple import functools def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str parentdir_prefix: str versionfile_source: str verbose: bool def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = 
subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. 
The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. 
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*" ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. 
branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . 
Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). """ vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%%d" %% (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. 
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
    """Scrape the git keyword assignments out of a _version.py file.

    The file is scanned textually instead of being imported, so this can be
    used from setup.py without executing _version.py. The code embedded in
    _version.py itself reads these values directly and never calls this
    function.

    Returns a dict holding whichever of "refnames", "full" and "date" were
    found. If the file cannot be opened or read, whatever was collected up
    to that point (possibly nothing) is returned.
    """
    # (line prefix to look for, key to store the quoted value under)
    wanted = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    found: Dict[str, str] = {}
    try:
        with open(versionfile_abs, "r") as fobj:
            for raw_line in fobj:
                stripped = raw_line.strip()
                for prefix, key in wanted:
                    if not stripped.startswith(prefix):
                        continue
                    # the value is the double-quoted text after the "="
                    match = re.search(r'=\s*"(.*)"', raw_line)
                    if match:
                        found[key] = match.group(1)
    except OSError:
        # missing/unreadable file: best effort, report what we have
        pass
    return found
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(
    tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command
) -> Dict[str, Any]:
    """Get version from 'git describe' in the root of the source tree.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns a "pieces" dict with the keys "long", "short", "error",
    "branch", "dirty", "closest-tag", "distance" and "date". When the
    'git describe' output cannot be parsed, or the tag does not start with
    tag_prefix, the dict is returned early with "error" set and without the
    tag/distance/date keys.

    Raises NotThisMethod when root is not under git control or when one of
    the git commands whose output is required fails.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]

    # GIT_DIR can interfere with correct operation of Versioneer.
    # It may be intended to be passed to the Versioneer-versioned project,
    # but that should not change where we get our version from.
    env = os.environ.copy()
    env.pop("GIT_DIR", None)
    runner = functools.partial(runner, env=env)

    _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")

    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = runner(
        GITS,
        [
            "describe",
            "--tags",
            "--dirty",
            "--always",
            "--long",
            "--match",
            f"{tag_prefix}[[:digit:]]*",
        ],
        cwd=root,
    )
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()

    pieces: Dict[str, Any] = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None

    branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root)
    # --abbrev-ref was added in git-1.6.3
    if rc != 0 or branch_name is None:
        raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
    branch_name = branch_name.strip()

    if branch_name == "HEAD":
        # If we aren't exactly on a branch, pick a branch which represents
        # the current commit. If all else fails, we are on a branchless
        # commit.
        branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
        # --contains was added in git-1.5.4
        if rc != 0 or branches is None:
            raise NotThisMethod("'git branch --contains' returned error")
        branches = branches.split("\n")

        # Remove the first line if we're running detached
        if "(" in branches[0]:
            branches.pop(0)

        # Strip off the leading "* " from the list of branches. Blank lines
        # (e.g. the one produced by the trailing newline of the command
        # output) are dropped so that "no branches" really yields an empty
        # list instead of [""].
        branches = [branch[2:] for branch in branches if branch.strip()]
        if "master" in branches:
            branch_name = "master"
        elif not branches:
            branch_name = None
        else:
            # Pick the first branch that is returned. Good or bad.
            branch_name = branches[0]

    pieces["branch"] = branch_name

    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out

    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        git_describe = git_describe[: git_describe.rindex("-dirty")]

    # now we have TAG-NUM-gHEX or HEX

    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
        if not mo:
            # unparsable. Maybe git-describe is misbehaving?
            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
            return pieces

        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
                full_tag,
                tag_prefix,
            )
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix) :]

        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))

        # commit: short hex revision ID
        pieces["short"] = mo.group(3)

    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
        if out is None:
            # the renderers require a distance, so treat this like the other
            # failed git invocations above instead of crashing on None
            raise NotThisMethod("'git rev-list' failed")
        pieces["distance"] = len(out.split())  # total number of commits

    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    date_out, rc = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)
    date_lines = date_out.strip().splitlines() if date_out else []
    if date_lines:
        # Use only the last line. Previous lines may contain GPG signature
        # information.
        date = date_lines[-1]
        pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    else:
        # render() reads the date with pieces.get("date"), so an unavailable
        # date is reported as None rather than aborting version detection
        pieces["date"] = None

    return pieces
def versions_from_parentdir(
    parentdir_prefix: str,
    root: str,
    verbose: bool,
) -> Dict[str, Any]:
    """Derive the version from the name of an enclosing directory.

    Unpacked source tarballs usually live in a directory named
    "<project>-<version>". Starting at root and walking up at most two
    further levels, the first directory whose basename begins with
    parentdir_prefix supplies the version (the basename with the prefix
    removed).

    Raises NotThisMethod if none of the three directories matches.
    """
    tried = []
    candidate = root
    levels_left = 3
    while levels_left:
        leaf = os.path.basename(candidate)
        if leaf.startswith(parentdir_prefix):
            version = leaf[len(parentdir_prefix) :]
            return {
                "version": version,
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        tried.append(candidate)
        candidate = os.path.dirname(candidate)  # up a level
        levels_left -= 1

    if verbose:
        message = "Tried directories %s but none started with prefix %s" % (
            str(tried),
            parentdir_prefix,
        )
        print(message)
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
def plus_or_dot(pieces: Dict[str, Any]) -> str:
    """Return a + if we don't already have one, else return a .

    The separator is chosen by looking for "+" in pieces["closest-tag"].
    A missing key and a value of None (which is what the VCS parser stores
    when there is no tag) are both treated as an empty tag, so "+" is
    returned instead of raising TypeError.
    """
    closest_tag = pieces.get("closest-tag") or ""
    if "+" in closest_tag:
        return "."
    return "+"
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns a ``(release, post)`` tuple. ``release`` is the text before the
    first ".post". ``post`` is the post-release number, 0 when ".post" is
    present without a number, or None when the string does not contain
    exactly one ".post" marker.

    Raises ValueError if the text following ".post" is not an integer.
    """
    parts = ver.split(".post")
    if len(parts) != 2:
        return parts[0], None
    # a bare ".post" (no digits) counts as post-release 0
    return parts[0], int(parts[1] or 0)
def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """Render as TAG[.postDISTANCE[.dev0]], where ".dev0" marks a dirty tree.

    A clean build sitting exactly on a tag renders as the bare tag.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # exactly on the tag with a clean tree: nothing to append
            return tag
        version = tag + ".post%d" % pieces["distance"]
    else:
        # exception #1
        version = "0.post%d" % pieces["distance"]

    if pieces["dirty"]:
        version += ".dev0"
    return version
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Turn version pieces into a versions dict using the requested style.

    If pieces carries an error, a dict with version "unknown" and that
    error is returned without rendering anything. An empty style or
    "default" selects "pep440".

    Raises ValueError for a style name that is not recognised.
    """
    failure = pieces["error"]
    if failure:
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": failure,
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    # style name -> renderer, checked in order
    renderers = (
        ("pep440", render_pep440),
        ("pep440-branch", render_pep440_branch),
        ("pep440-pre", render_pep440_pre),
        ("pep440-post", render_pep440_post),
        ("pep440-post-branch", render_pep440_post_branch),
        ("pep440-old", render_pep440_old),
        ("git-describe", render_git_describe),
        ("git-describe-long", render_git_describe_long),
    )
    for known_style, renderer in renderers:
        if style == known_style:
            rendered = renderer(pieces)
            break
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": rendered,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }
Any]: """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. """ if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. 
def get_version() -> str:
    """Return only the "version" entry of the dict built by get_versions()."""
    versions = get_versions()
    return versions["version"]
Setuptools wraps the sub-dep builds in a # sandbox that restores sys.modules to it's pre-build state, so the # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. # Also see https://github.com/python-versioneer/python-versioneer/issues/52 cmds = {} if cmdclass is None else cmdclass.copy() # we add "version" to setuptools from setuptools import Command class cmd_version(Command): description = "report generated version string" user_options: List[Tuple[str, str, str]] = [] boolean_options: List[str] = [] def initialize_options(self) -> None: pass def finalize_options(self) -> None: pass def run(self) -> None: vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. # setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? # pip install -e . and setuptool/editable_wheel will invoke build_py # but the build_py command is not expected to copy any files. 
# we override different "build_py" commands for both environments if "build_py" in cmds: _build_py: Any = cmds["build_py"] else: from setuptools.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) if getattr(self, "editable_mode", False): # During editable installs `.py` and data files are # not copied to build_lib return # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if "build_ext" in cmds: _build_ext: Any = cmds["build_ext"] else: from setuptools.command.build_ext import build_ext as _build_ext class cmd_build_ext(_build_ext): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_ext.run(self) if self.inplace: # build_ext --inplace will only build extensions in # build/lib<..> dir with no _version.py to write to. # As in place builds will already have a _version.py # in the module dir, we do not need to write one. return # now locate _version.py in the new build/ directory and replace # it with an updated value if not cfg.versionfile_build: return target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) if not os.path.exists(target_versionfile): print( f"Warning: {target_versionfile} does not exist, skipping " "version update. This can happen if you are running build_ext " "without first running build_py." ) return print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
from cx_Freeze.dist import build_exe as _build_exe # type: ignore # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... class cmd_build_exe(_build_exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: from py2exe.distutils_buildexe import py2exe as _py2exe # type: ignore class cmd_py2exe(_py2exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) cmds["py2exe"] = cmd_py2exe # sdist farms its file list building out to egg_info if "egg_info" in cmds: _egg_info: Any = cmds["egg_info"] else: from setuptools.command.egg_info import egg_info as _egg_info class cmd_egg_info(_egg_info): def find_sources(self) -> None: # egg_info.find_sources builds the manifest list and writes it # in one shot super().find_sources() # Modify the filelist and normalize it root = get_root() cfg = get_config_from_root(root) self.filelist.append("versioneer.py") if cfg.versionfile_source: # There are rare cases where versionfile_source might not be # included by default, so we must be explicit self.filelist.append(cfg.versionfile_source) self.filelist.sort() self.filelist.remove_duplicates() # The write method is hidden in the manifest_maker instance that # generated the filelist and was thrown away # We will instead replicate their final normalization (to unicode, # and POSIX-style paths) from setuptools import unicode_utils normalized = [ unicode_utils.filesys_decode(f).replace(os.sep, "/") for f in self.filelist.files ] manifest_filename = os.path.join(self.egg_info, "SOURCES.txt") with open(manifest_filename, "w") as fobj: fobj.write("\n".join(normalized)) cmds["egg_info"] = cmd_egg_info # we override different 
"sdist" commands for both environments if "sdist" in cmds: _sdist: Any = cmds["sdist"] else: from setuptools.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self) -> None: versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir: str, files: List[str]) -> None: root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file( target_versionfile, self._versioneer_generated_versions ) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. [versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ OLD_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ INIT_PY_SNIPPET = """ from . 
import {0} __version__ = {0}.get_versions()['version'] """ def do_setup() -> int: """Do main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") maybe_ipy: Optional[str] = ipy if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except OSError: old = "" module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] snippet = INIT_PY_SNIPPET.format(module) if OLD_SNIPPET in old: print(" replacing boilerplate in %s" % ipy) with open(ipy, "w") as f: f.write(old.replace(OLD_SNIPPET, snippet)) elif snippet not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(snippet) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) maybe_ipy = None # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. 
do_vcs_install(cfg.versionfile_source, maybe_ipy) return 0 def scan_setup_py() -> int: """Validate the contents of setup.py against Versioneer's expectations.""" found = set() setters = False errors = 0 with open("setup.py", "r") as f: for line in f.readlines(): if "import versioneer" in line: found.add("import") if "versioneer.get_cmdclass()" in line: found.add("cmdclass") if "versioneer.get_version()" in line: found.add("get_version") if "versioneer.VCS" in line: setters = True if "versioneer.versionfile_source" in line: setters = True if len(found) != 3: print("") print("Your setup.py appears to be missing some important items") print("(but I might be wrong). Please make sure it has something") print("roughly like the following:") print("") print(" import versioneer") print(" setup( version=versioneer.get_version(),") print(" cmdclass=versioneer.get_cmdclass(), ...)") print("") errors += 1 if setters: print("You should remove lines like 'versioneer.VCS = ' and") print("'versioneer.versionfile_source = ' . This configuration") print("now lives in setup.cfg, and should be removed from setup.py") print("") errors += 1 return errors def setup_command() -> NoReturn: """Set up Versioneer and exit with appropriate error code.""" errors = do_setup() errors += scan_setup_py() sys.exit(1 if errors else 0) if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup": setup_command()