Showing preview only (7,260K chars total). Download the full file or copy to clipboard to get everything.
Repository: modin-project/modin
Branch: main
Commit: 7ca200b08597
Files: 681
Total size: 6.8 MB
Directory structure:
gitextract_eudtie4f/
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug-report.yaml
│ │ ├── feature_request.md
│ │ └── question.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── actions/
│ │ ├── mamba-env/
│ │ │ └── action.yml
│ │ ├── python-only/
│ │ │ └── action.yml
│ │ ├── run-core-tests/
│ │ │ ├── action.yml
│ │ │ ├── group_1/
│ │ │ │ └── action.yml
│ │ │ ├── group_2/
│ │ │ │ └── action.yml
│ │ │ ├── group_3/
│ │ │ │ └── action.yml
│ │ │ └── group_4/
│ │ │ └── action.yml
│ │ └── upload-coverage/
│ │ └── action.yml
│ ├── dependabot.yaml
│ ├── stale.yml
│ └── workflows/
│ ├── ci-notebooks.yml
│ ├── ci-required.yml
│ ├── ci.yml
│ ├── codeql/
│ │ └── codeql-config.yml
│ ├── codeql.yml
│ ├── fuzzydata-test.yml
│ ├── publish-to-pypi.yml
│ ├── push-to-main.yml
│ └── sql_server/
│ └── set_up_sql_server.sh
├── .gitignore
├── .readthedocs.yaml
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── LICENSE
├── LICENSE_HEADER
├── MANIFEST.in
├── NOTICE
├── README.md
├── asv_bench/
│ ├── README.md
│ ├── asv.conf.dask.json
│ ├── asv.conf.json
│ ├── asv.conf.unidist.json
│ ├── benchmarks/
│ │ ├── __init__.py
│ │ ├── benchmarks.py
│ │ ├── io/
│ │ │ ├── __init__.py
│ │ │ ├── csv.py
│ │ │ └── parquet.py
│ │ ├── scalability/
│ │ │ ├── __init__.py
│ │ │ └── scalability_benchmarks.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── compatibility.py
│ │ └── data_shapes.py
│ └── test/
│ ├── __init__.py
│ └── test_utils.py
├── ci/
│ └── teamcity/
│ ├── Dockerfile.teamcity-ci
│ ├── build-docker.py
│ └── comment_on_pr.py
├── codecov.yml
├── contributing/
│ ├── contributing.md
│ └── pre-commit
├── docker/
│ └── Dockerfile
├── docs/
│ ├── _static/
│ │ └── custom.js
│ ├── _templates/
│ │ └── layout.html
│ ├── conf.py
│ ├── contact.rst
│ ├── development/
│ │ ├── architecture.rst
│ │ ├── contributing.rst
│ │ ├── index.rst
│ │ ├── partition_api.rst
│ │ ├── using_pandas_on_dask.rst
│ │ ├── using_pandas_on_mpi.rst
│ │ ├── using_pandas_on_python.rst
│ │ └── using_pandas_on_ray.rst
│ ├── ecosystem.rst
│ ├── flow/
│ │ └── modin/
│ │ ├── config.rst
│ │ ├── core/
│ │ │ ├── dataframe/
│ │ │ │ ├── algebra.rst
│ │ │ │ ├── base/
│ │ │ │ │ ├── dataframe.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── partitioning/
│ │ │ │ │ └── axis_partition.rst
│ │ │ │ ├── index.rst
│ │ │ │ └── pandas/
│ │ │ │ ├── dataframe.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── metadata/
│ │ │ │ │ ├── dtypes.rst
│ │ │ │ │ └── index.rst
│ │ │ │ └── partitioning/
│ │ │ │ ├── axis_partition.rst
│ │ │ │ ├── partition.rst
│ │ │ │ └── partition_manager.rst
│ │ │ ├── execution/
│ │ │ │ ├── dask/
│ │ │ │ │ └── implementations/
│ │ │ │ │ └── pandas_on_dask/
│ │ │ │ │ ├── dataframe.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── partitioning/
│ │ │ │ │ ├── partition.rst
│ │ │ │ │ ├── partition_manager.rst
│ │ │ │ │ └── virtual_partition.rst
│ │ │ │ ├── dispatching.rst
│ │ │ │ ├── python/
│ │ │ │ │ └── implementations/
│ │ │ │ │ └── pandas_on_python/
│ │ │ │ │ ├── dataframe.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── partitioning/
│ │ │ │ │ ├── axis_partition.rst
│ │ │ │ │ ├── partition.rst
│ │ │ │ │ └── partition_manager.rst
│ │ │ │ ├── ray/
│ │ │ │ │ ├── generic.rst
│ │ │ │ │ └── implementations/
│ │ │ │ │ └── pandas_on_ray/
│ │ │ │ │ ├── dataframe.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── partitioning/
│ │ │ │ │ ├── axis_partition.rst
│ │ │ │ │ ├── partition.rst
│ │ │ │ │ └── partition_manager.rst
│ │ │ │ └── unidist/
│ │ │ │ ├── generic.rst
│ │ │ │ └── implementations/
│ │ │ │ └── pandas_on_unidist/
│ │ │ │ ├── dataframe.rst
│ │ │ │ ├── index.rst
│ │ │ │ └── partitioning/
│ │ │ │ ├── axis_partition.rst
│ │ │ │ ├── partition.rst
│ │ │ │ └── partition_manager.rst
│ │ │ ├── io/
│ │ │ │ └── index.rst
│ │ │ └── storage_formats/
│ │ │ ├── base/
│ │ │ │ └── query_compiler.rst
│ │ │ ├── index.rst
│ │ │ └── pandas/
│ │ │ ├── index.rst
│ │ │ ├── parsers.rst
│ │ │ └── query_compiler.rst
│ │ ├── distributed/
│ │ │ └── dataframe/
│ │ │ └── pandas.rst
│ │ ├── experimental/
│ │ │ ├── batch.rst
│ │ │ ├── core/
│ │ │ │ └── io/
│ │ │ │ └── index.rst
│ │ │ ├── index.rst
│ │ │ ├── pandas.rst
│ │ │ ├── range_partitioning_groupby.rst
│ │ │ ├── reshuffling_groupby.rst
│ │ │ ├── sklearn.rst
│ │ │ └── xgboost.rst
│ │ ├── pandas/
│ │ │ ├── base.rst
│ │ │ ├── dataframe.rst
│ │ │ └── series.rst
│ │ └── utils.rst
│ ├── getting_started/
│ │ ├── examples.rst
│ │ ├── faq.rst
│ │ ├── installation.rst
│ │ ├── quickstart.rst
│ │ ├── troubleshooting.rst
│ │ ├── using_modin/
│ │ │ ├── using_modin.rst
│ │ │ ├── using_modin_cluster.rst
│ │ │ └── using_modin_locally.rst
│ │ └── why_modin/
│ │ ├── modin_vs_dask_vs_koalas.rst
│ │ ├── out_of_core.rst
│ │ ├── pandas.rst
│ │ └── why_modin.rst
│ ├── index.rst
│ ├── release-procedure.md
│ ├── release_notes/
│ │ ├── release_notes-0.14.0.rst
│ │ ├── release_notes-0.15.0.rst
│ │ ├── release_notes-0.16.0.rst
│ │ └── release_notes-template.rst
│ ├── requirements-doc.txt
│ ├── supported_apis/
│ │ ├── dataframe_supported.rst
│ │ ├── defaulting_to_pandas.rst
│ │ ├── index.rst
│ │ ├── io_supported.rst
│ │ ├── older_pandas_compat.rst
│ │ ├── series_supported.rst
│ │ └── utilities_supported.rst
│ └── usage_guide/
│ ├── advanced_usage/
│ │ ├── batch.rst
│ │ ├── index.rst
│ │ ├── modin_engines.rst
│ │ ├── modin_logging.rst
│ │ ├── modin_metrics.rst
│ │ ├── modin_xgboost.rst
│ │ ├── progress_bar.rst
│ │ └── spreadsheets_api.rst
│ ├── benchmarking.rst
│ ├── examples/
│ │ └── index.rst
│ ├── index.rst
│ ├── integrations.rst
│ └── optimization_notes/
│ ├── index.rst
│ └── range_partitioning_ops.rst
├── environment-dev.yml
├── examples/
│ ├── data/
│ │ ├── boston_housing.csv
│ │ ├── census_1k.csv
│ │ ├── nyc-taxi_1k.csv
│ │ ├── plasticc_test_set_1k.csv
│ │ ├── plasticc_test_set_metadata_1k.csv
│ │ ├── plasticc_training_set_1k.csv
│ │ └── plasticc_training_set_metadata_1k.csv
│ ├── docker/
│ │ └── modin-ray/
│ │ ├── Dockerfile
│ │ ├── build-docker-image.sh
│ │ ├── census.py
│ │ ├── nyc-taxi.py
│ │ ├── plasticc.py
│ │ └── taxi.pstat
│ ├── jupyter/
│ │ ├── Modin_Taxi.ipynb
│ │ ├── Pandas_Taxi.ipynb
│ │ └── integrations/
│ │ ├── NLTK.ipynb
│ │ ├── altair.ipynb
│ │ ├── bokeh.ipynb
│ │ ├── huggingface.ipynb
│ │ ├── matplotlib.ipynb
│ │ ├── plotly.ipynb
│ │ ├── seaborn.ipynb
│ │ ├── sklearn.ipynb
│ │ ├── statsmodels.ipynb
│ │ ├── tensorflow.ipynb
│ │ └── xgboost.ipynb
│ ├── modin-scikit-learn-example.ipynb
│ ├── quickstart.ipynb
│ ├── spreadsheet/
│ │ ├── requirements.txt
│ │ └── tutorial.ipynb
│ └── tutorial/
│ ├── README.md
│ └── jupyter/
│ ├── README.md
│ └── execution/
│ ├── pandas_on_dask/
│ │ ├── Dockerfile
│ │ ├── cluster/
│ │ │ └── exercise_5.ipynb
│ │ ├── local/
│ │ │ ├── exercise_1.ipynb
│ │ │ ├── exercise_2.ipynb
│ │ │ ├── exercise_3.ipynb
│ │ │ └── exercise_4.ipynb
│ │ ├── requirements.txt
│ │ └── test/
│ │ └── test_notebooks.py
│ ├── pandas_on_ray/
│ │ ├── Dockerfile
│ │ ├── cluster/
│ │ │ ├── README.md
│ │ │ ├── exercise_5.py
│ │ │ └── modin-cluster.yaml
│ │ ├── local/
│ │ │ ├── exercise_1.ipynb
│ │ │ ├── exercise_2.ipynb
│ │ │ ├── exercise_3.ipynb
│ │ │ └── exercise_4.ipynb
│ │ ├── requirements.txt
│ │ └── test/
│ │ └── test_notebooks.py
│ ├── pandas_on_unidist/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── jupyter_unidist_env.yml
│ │ ├── local/
│ │ │ ├── exercise_1.ipynb
│ │ │ ├── exercise_2.ipynb
│ │ │ ├── exercise_3.ipynb
│ │ │ └── exercise_4.ipynb
│ │ ├── setup_kernel.py
│ │ └── test/
│ │ └── test_notebooks.py
│ └── test/
│ └── utils.py
├── modin/
│ ├── __init__.py
│ ├── __main__.py
│ ├── _version.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── envvars.py
│ │ └── pubsub.py
│ ├── conftest.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── computation/
│ │ │ ├── __init__.py
│ │ │ ├── align.py
│ │ │ ├── check.py
│ │ │ ├── common.py
│ │ │ ├── engines.py
│ │ │ ├── eval.py
│ │ │ ├── expr.py
│ │ │ ├── ops.py
│ │ │ ├── parsing.py
│ │ │ └── scope.py
│ │ ├── dataframe/
│ │ │ ├── __init__.py
│ │ │ ├── algebra/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── binary.py
│ │ │ │ ├── default2pandas/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── binary.py
│ │ │ │ │ ├── cat.py
│ │ │ │ │ ├── dataframe.py
│ │ │ │ │ ├── datetime.py
│ │ │ │ │ ├── default.py
│ │ │ │ │ ├── groupby.py
│ │ │ │ │ ├── list.py
│ │ │ │ │ ├── resample.py
│ │ │ │ │ ├── rolling.py
│ │ │ │ │ ├── series.py
│ │ │ │ │ ├── str.py
│ │ │ │ │ └── struct.py
│ │ │ │ ├── fold.py
│ │ │ │ ├── groupby.py
│ │ │ │ ├── map.py
│ │ │ │ ├── operator.py
│ │ │ │ ├── reduce.py
│ │ │ │ └── tree_reduce.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dataframe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── interchange/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataframe_protocol/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dataframe.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ └── axis_partition.py
│ │ │ └── pandas/
│ │ │ ├── __init__.py
│ │ │ ├── dataframe/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe.py
│ │ │ │ └── utils.py
│ │ │ ├── interchange/
│ │ │ │ ├── __init__.py
│ │ │ │ └── dataframe_protocol/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── buffer.py
│ │ │ │ ├── column.py
│ │ │ │ ├── dataframe.py
│ │ │ │ ├── exception.py
│ │ │ │ └── from_dataframe.py
│ │ │ ├── metadata/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dtypes.py
│ │ │ │ └── index.py
│ │ │ ├── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── axis_partition.py
│ │ │ │ ├── partition.py
│ │ │ │ └── partition_manager.py
│ │ │ └── utils.py
│ │ ├── execution/
│ │ │ ├── __init__.py
│ │ │ ├── dask/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── engine_wrapper.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── implementations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pandas_on_dask/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataframe.py
│ │ │ │ ├── io/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── io.py
│ │ │ │ └── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── partition.py
│ │ │ │ ├── partition_manager.py
│ │ │ │ └── virtual_partition.py
│ │ │ ├── dispatching/
│ │ │ │ ├── __init__.py
│ │ │ │ └── factories/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dispatcher.py
│ │ │ │ └── factories.py
│ │ │ ├── modin_aqp.py
│ │ │ ├── python/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── engine_wrapper.py
│ │ │ │ └── implementations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pandas_on_python/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataframe.py
│ │ │ │ ├── io/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── io.py
│ │ │ │ └── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── partition.py
│ │ │ │ ├── partition_manager.py
│ │ │ │ └── virtual_partition.py
│ │ │ ├── ray/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deferred_execution.py
│ │ │ │ │ ├── engine_wrapper.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── generic/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── io/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── io.py
│ │ │ │ │ └── partitioning/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── partition_manager.py
│ │ │ │ └── implementations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pandas_on_ray/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataframe.py
│ │ │ │ ├── io/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── io.py
│ │ │ │ └── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── partition.py
│ │ │ │ ├── partition_manager.py
│ │ │ │ └── virtual_partition.py
│ │ │ ├── unidist/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── engine_wrapper.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── generic/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── io/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── io.py
│ │ │ │ │ └── partitioning/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── partition_manager.py
│ │ │ │ └── implementations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pandas_on_unidist/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataframe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataframe.py
│ │ │ │ ├── io/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── io.py
│ │ │ │ └── partitioning/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── partition.py
│ │ │ │ ├── partition_manager.py
│ │ │ │ └── virtual_partition.py
│ │ │ └── utils.py
│ │ ├── io/
│ │ │ ├── __init__.py
│ │ │ ├── column_stores/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── column_store_dispatcher.py
│ │ │ │ ├── feather_dispatcher.py
│ │ │ │ ├── hdf_dispatcher.py
│ │ │ │ └── parquet_dispatcher.py
│ │ │ ├── file_dispatcher.py
│ │ │ ├── io.py
│ │ │ ├── sql/
│ │ │ │ ├── __init__.py
│ │ │ │ └── sql_dispatcher.py
│ │ │ └── text/
│ │ │ ├── __init__.py
│ │ │ ├── csv_dispatcher.py
│ │ │ ├── excel_dispatcher.py
│ │ │ ├── fwf_dispatcher.py
│ │ │ ├── json_dispatcher.py
│ │ │ ├── text_file_dispatcher.py
│ │ │ └── utils.py
│ │ └── storage_formats/
│ │ ├── __init__.py
│ │ ├── base/
│ │ │ ├── __init__.py
│ │ │ ├── doc_utils.py
│ │ │ ├── query_compiler.py
│ │ │ └── query_compiler_calculator.py
│ │ └── pandas/
│ │ ├── __init__.py
│ │ ├── aggregations.py
│ │ ├── groupby.py
│ │ ├── merge.py
│ │ ├── native_query_compiler.py
│ │ ├── parsers.py
│ │ ├── query_compiler.py
│ │ ├── query_compiler_caster.py
│ │ └── utils.py
│ ├── db_conn.py
│ ├── distributed/
│ │ ├── __init__.py
│ │ └── dataframe/
│ │ ├── __init__.py
│ │ └── pandas/
│ │ ├── __init__.py
│ │ └── partitions.py
│ ├── error_message.py
│ ├── experimental/
│ │ ├── __init__.py
│ │ ├── batch/
│ │ │ ├── __init__.py
│ │ │ └── pipeline.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── execution/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dask/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── implementations/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── pandas_on_dask/
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── ray/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── implementations/
│ │ │ │ │ └── __init__.py
│ │ │ │ └── unidist/
│ │ │ │ ├── __init__.py
│ │ │ │ └── implementations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pandas_on_unidist/
│ │ │ │ └── __init__.py
│ │ │ ├── io/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── glob/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── glob_dispatcher.py
│ │ │ │ ├── sql/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── sql_dispatcher.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── text/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── csv_glob_dispatcher.py
│ │ │ │ └── custom_text_dispatcher.py
│ │ │ └── storage_formats/
│ │ │ ├── __init__.py
│ │ │ └── pandas/
│ │ │ ├── __init__.py
│ │ │ └── parsers.py
│ │ ├── fuzzydata/
│ │ │ └── __init__.py
│ │ ├── pandas/
│ │ │ ├── __init__.py
│ │ │ └── io.py
│ │ ├── sklearn/
│ │ │ ├── __init__.py
│ │ │ └── model_selection/
│ │ │ ├── __init__.py
│ │ │ └── train_test_split.py
│ │ ├── spreadsheet/
│ │ │ ├── __init__.py
│ │ │ └── general.py
│ │ ├── torch/
│ │ │ ├── __init__.py
│ │ │ └── datasets.py
│ │ └── xgboost/
│ │ ├── __init__.py
│ │ ├── utils.py
│ │ ├── xgboost.py
│ │ └── xgboost_ray.py
│ ├── logging/
│ │ ├── __init__.py
│ │ ├── class_logger.py
│ │ ├── config.py
│ │ ├── logger_decorator.py
│ │ └── metrics.py
│ ├── numpy/
│ │ ├── __init__.py
│ │ ├── arr.py
│ │ ├── array_creation.py
│ │ ├── array_shaping.py
│ │ ├── constants.py
│ │ ├── indexing.py
│ │ ├── linalg.py
│ │ ├── logic.py
│ │ ├── math.py
│ │ ├── trigonometry.py
│ │ └── utils.py
│ ├── pandas/
│ │ ├── __init__.py
│ │ ├── accessor.py
│ │ ├── api/
│ │ │ ├── __init__.py
│ │ │ └── extensions/
│ │ │ ├── __init__.py
│ │ │ └── extensions.py
│ │ ├── arrays/
│ │ │ └── __init__.py
│ │ ├── base.py
│ │ ├── dataframe.py
│ │ ├── errors/
│ │ │ └── __init__.py
│ │ ├── general.py
│ │ ├── groupby.py
│ │ ├── indexing.py
│ │ ├── io.py
│ │ ├── iterator.py
│ │ ├── plotting.py
│ │ ├── resample.py
│ │ ├── series.py
│ │ ├── series_utils.py
│ │ ├── testing/
│ │ │ └── __init__.py
│ │ ├── utils.py
│ │ └── window.py
│ ├── polars/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── dataframe.py
│ │ ├── groupby.py
│ │ ├── lazyframe.py
│ │ └── series.py
│ ├── tests/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── docs_module/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── classes.py
│ │ │ │ └── functions.py
│ │ │ ├── docs_module_with_just_base/
│ │ │ │ ├── __init__.py
│ │ │ │ └── classes.py
│ │ │ ├── test_envvars.py
│ │ │ └── test_parameter.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── storage_formats/
│ │ │ │ ├── base/
│ │ │ │ │ └── test_internals.py
│ │ │ │ ├── cudf/
│ │ │ │ │ ├── test_gpu_managers.py
│ │ │ │ │ └── test_internals.py
│ │ │ │ └── pandas/
│ │ │ │ └── test_internals.py
│ │ │ └── test_dispatcher.py
│ │ ├── experimental/
│ │ │ ├── __init__.py
│ │ │ ├── spreadsheet/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_general.py
│ │ │ ├── test_fuzzydata.py
│ │ │ ├── test_io_exp.py
│ │ │ ├── test_pipeline.py
│ │ │ ├── torch/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_dataloader.py
│ │ │ └── xgboost/
│ │ │ ├── __init__.py
│ │ │ ├── test_default.py
│ │ │ ├── test_dmatrix.py
│ │ │ └── test_xgboost.py
│ │ ├── interchange/
│ │ │ ├── __init__.py
│ │ │ └── dataframe_protocol/
│ │ │ ├── __init__.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_sanity.py
│ │ │ │ └── test_utils.py
│ │ │ ├── pandas/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_protocol.py
│ │ │ └── test_general.py
│ │ ├── numpy/
│ │ │ ├── __init__.py
│ │ │ ├── test_array.py
│ │ │ ├── test_array_arithmetic.py
│ │ │ ├── test_array_axis_functions.py
│ │ │ ├── test_array_creation.py
│ │ │ ├── test_array_indexing.py
│ │ │ ├── test_array_linalg.py
│ │ │ ├── test_array_logic.py
│ │ │ ├── test_array_math.py
│ │ │ ├── test_array_shaping.py
│ │ │ └── utils.py
│ │ ├── pandas/
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── airline.sas7bdat
│ │ │ │ ├── blah.csv
│ │ │ │ ├── every_other_row_nan.xlsx
│ │ │ │ ├── excel_sheetname_title.xlsx
│ │ │ │ ├── hdfs.parquet/
│ │ │ │ │ ├── part-00000-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet
│ │ │ │ │ ├── part-00001-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet
│ │ │ │ │ └── part-00002-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet
│ │ │ │ ├── issue5159.parquet/
│ │ │ │ │ └── part-0000.snappy.parquet/
│ │ │ │ │ ├── par=a/
│ │ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet
│ │ │ │ │ └── par=b/
│ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet
│ │ │ │ ├── issue_1930.csv
│ │ │ │ ├── issue_2074.csv
│ │ │ │ ├── issue_2239.csv
│ │ │ │ ├── issue_3119.csv
│ │ │ │ ├── issue_4543.csv
│ │ │ │ ├── issue_976.csv
│ │ │ │ ├── modin_error_book.xlsx
│ │ │ │ ├── multiple_csv/
│ │ │ │ │ ├── test_data0.csv
│ │ │ │ │ └── test_data1.csv
│ │ │ │ ├── newlines.csv
│ │ │ │ ├── test_border_rows.xlsx
│ │ │ │ ├── test_categories.csv
│ │ │ │ ├── test_categories.json
│ │ │ │ ├── test_data.feather
│ │ │ │ ├── test_data.fwf
│ │ │ │ ├── test_data.json
│ │ │ │ ├── test_data.parquet
│ │ │ │ ├── test_data_dir.parquet/
│ │ │ │ │ ├── part_0.parquet
│ │ │ │ │ ├── part_1.parquet
│ │ │ │ │ ├── part_10.parquet
│ │ │ │ │ ├── part_11.parquet
│ │ │ │ │ ├── part_12.parquet
│ │ │ │ │ ├── part_13.parquet
│ │ │ │ │ ├── part_14.parquet
│ │ │ │ │ ├── part_15.parquet
│ │ │ │ │ ├── part_2.parquet
│ │ │ │ │ ├── part_3.parquet
│ │ │ │ │ ├── part_4.parquet
│ │ │ │ │ ├── part_5.parquet
│ │ │ │ │ ├── part_6.parquet
│ │ │ │ │ ├── part_7.parquet
│ │ │ │ │ ├── part_8.parquet
│ │ │ │ │ └── part_9.parquet
│ │ │ │ ├── test_delim.csv
│ │ │ │ ├── test_different_columns_in_rows.json
│ │ │ │ ├── test_empty_rows.xlsx
│ │ │ │ ├── test_emptyline.xlsx
│ │ │ │ ├── test_null_col.csv
│ │ │ │ ├── test_time_parsing.csv
│ │ │ │ └── test_usecols.csv
│ │ │ ├── dataframe/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_binary.py
│ │ │ │ ├── test_default.py
│ │ │ │ ├── test_indexing.py
│ │ │ │ ├── test_iter.py
│ │ │ │ ├── test_join_sort.py
│ │ │ │ ├── test_map_metadata.py
│ │ │ │ ├── test_pickle.py
│ │ │ │ ├── test_reduce.py
│ │ │ │ ├── test_udf.py
│ │ │ │ └── test_window.py
│ │ │ ├── extensions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_api_reexport.py
│ │ │ │ ├── test_base_extensions.py
│ │ │ │ ├── test_dataframe_extensions.py
│ │ │ │ ├── test_groupby_extensions.py
│ │ │ │ ├── test_pd_extensions.py
│ │ │ │ └── test_series_extensions.py
│ │ │ ├── integrations/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_lazy_import.py
│ │ │ ├── internals/
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_benchmark_mode.py
│ │ │ ├── native_df_interoperability/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_binary.py
│ │ │ │ ├── test_compiler_caster.py
│ │ │ │ ├── test_copy_on_write.py
│ │ │ │ ├── test_default.py
│ │ │ │ ├── test_default_to_pandas_without_warnings.py
│ │ │ │ ├── test_general.py
│ │ │ │ ├── test_indexing.py
│ │ │ │ ├── test_iter.py
│ │ │ │ ├── test_join_sort.py
│ │ │ │ ├── test_map_metadata.py
│ │ │ │ ├── test_pickle.py
│ │ │ │ ├── test_window.py
│ │ │ │ └── utils.py
│ │ │ ├── test_api.py
│ │ │ ├── test_backend.py
│ │ │ ├── test_concat.py
│ │ │ ├── test_expanding.py
│ │ │ ├── test_general.py
│ │ │ ├── test_groupby.py
│ │ │ ├── test_io.py
│ │ │ ├── test_repartition.py
│ │ │ ├── test_reshape.py
│ │ │ ├── test_rolling.py
│ │ │ ├── test_series.py
│ │ │ └── utils.py
│ │ ├── polars/
│ │ │ └── test_dataframe.py
│ │ ├── test_dataframe_api_standard.py
│ │ ├── test_docstring_urls.py
│ │ ├── test_envvar_catcher.py
│ │ ├── test_envvar_npartitions.py
│ │ ├── test_executions_api.py
│ │ ├── test_headers.py
│ │ ├── test_logging.py
│ │ ├── test_metrics.py
│ │ ├── test_partition_api.py
│ │ └── test_utils.py
│ └── utils.py
├── modin-autoimport-pandas.pth
├── mypy.ini
├── requirements/
│ ├── env_unidist_linux.yml
│ ├── env_unidist_win.yml
│ └── requirements-no-engine.yml
├── requirements-dev.txt
├── scripts/
│ ├── __init__.py
│ ├── doc_checker.py
│ ├── release.py
│ └── test/
│ ├── __init__.py
│ ├── examples.py
│ └── test_doc_checker.py
├── setup.cfg
├── setup.py
├── stress_tests/
│ ├── kaggle/
│ │ ├── kaggle10.py
│ │ ├── kaggle12.py
│ │ ├── kaggle13.py
│ │ ├── kaggle14.py
│ │ ├── kaggle17.py
│ │ ├── kaggle18.py
│ │ ├── kaggle19.py
│ │ ├── kaggle20.py
│ │ ├── kaggle22.py
│ │ ├── kaggle3.py
│ │ ├── kaggle4.py
│ │ ├── kaggle5.py
│ │ ├── kaggle6.py
│ │ ├── kaggle7.py
│ │ ├── kaggle8.py
│ │ └── kaggle9.py
│ ├── run_stress_tests.sh
│ └── test_kaggle_ipynb.py
└── versioneer.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
* text=auto
modin/_version.py export-subst
================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yaml
================================================
name: Bug report
description: Report incorrect behavior in the Modin library
title: 'BUG: '
labels: ['bug 🦗', 'Triage 🩹']
body:
- type: checkboxes
id: checks
attributes:
label: Modin version checks
options:
- label: >
I have checked that this issue has not already been reported.
required: true
- label: >
I have confirmed this bug exists on the latest released version of Modin.
required: true
- label: >
I have confirmed this bug exists on the main branch of Modin. (In order to do this you
can follow [this guide](https://modin.readthedocs.io/en/stable/getting_started/installation.html#installing-from-the-github-main-branch).)
- type: textarea
id: example
attributes:
label: Reproducible Example
description: >
Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to
provide a minimal, copy-pastable example.
placeholder: >
import modin.pandas as pd
df = pd.DataFrame(range(5))
...
render: python
validations:
required: true
- type: textarea
id: problem
attributes:
label: Issue Description
description: >
Please provide a description of the issue shown in the reproducible example.
validations:
required: true
- type: textarea
id: expected-behavior
attributes:
label: Expected Behavior
description: >
Please describe or show a code example of the expected behavior.
validations:
required: true
- type: textarea
id: logs
attributes:
label: Error Logs
description: >
Please paste the output of any relevant error logs.
value: >
<details>
```python-traceback
Replace this line with the error backtrace (if applicable).
```
</details>
- type: textarea
id: version
attributes:
label: Installed Versions
description: >
Please paste the output of ``pd.show_versions()``
value: >
<details>
Replace this line with the output of pd.show_versions()
</details>
validations:
required: true
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Request a new API or feature implementation
title: ''
labels: 'new feature/request 💬, Triage 🩹'
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. What kind of performance improvements would you like to see with this new API?
================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: Question
about: You want to ask a question
title: ''
labels: 'question ❓, Triage 🩹'
assignees: ''
---
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
<!--
Thank you for your contribution!
Please review the contributing docs: https://modin.readthedocs.io/en/latest/development/contributing.html
if you have questions about contributing.
-->
## What do these changes do?
<!-- Please give a brief description of these changes. -->
- [x] first commit message and PR title follow format outlined [here](https://modin.readthedocs.io/en/latest/development/contributing.html#commit-message-formatting)
> **_NOTE:_** If you edit the PR title to match this format, you need to add another commit (even if it's empty) or amend your last commit for the CI job that checks the PR title to pick up the new PR title.
- [ ] passes `flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py`
- [ ] passes `black --check modin/ asv_bench/benchmarks scripts/doc_checker.py`
- [ ] signed commit with `git commit -s` <!-- you can amend your commit with a signature via `git commit --amend -s` -->
- [ ] Resolves #? <!-- issue must be created for each patch -->
- [ ] tests added and passing
- [ ] module layout described at `docs/development/architecture.rst` is up-to-date <!-- if you have added, renamed or removed files or directories please update the documentation accordingly -->
================================================
FILE: .github/actions/mamba-env/action.yml
================================================
name: "Install environment using Mamba"
description: "Prepare the environment to run Modin"
inputs:
python-version:
description: "Python version to install"
default: "3.9"
environment-file:
description: "Conda environment yml"
required: true
activate-environment:
description: "Conda environment to activate"
default: "modin"
runs:
using: "composite"
steps:
- name: Get current week
id: get-week
# Use the current week as part of the cache key so the cache is refreshed
# periodically: the key is otherwise based only on the requirements file,
# while the dependencies themselves publish updated versions at an irregular pace.
run: echo "thisweek=$(/bin/date -u '+%Y.w%W')" >> $GITHUB_OUTPUT
shell: bash
- name: Cache conda
id: cache-conda
uses: actions/cache@v4
with:
path: |
~/conda_pkgs_dir
~/.cache/pip
key:
${{ runner.os }}-conda-${{ steps.get-week.outputs.thisweek }}-${{ hashFiles(inputs.environment-file) }}
- uses: conda-incubator/setup-miniconda@v3
with:
miniforge-variant: Miniforge3
miniforge-version: latest
use-mamba: true
activate-environment: ${{ inputs.activate-environment }}
environment-file: ${{ inputs.environment-file }}
python-version: ${{ inputs.python-version }}
channel-priority: strict
# we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
# for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
use-only-tar-bz2: false
- shell: bash -l {0}
run: |
conda run -n ${{ inputs.activate-environment }} pip install .
conda list -n ${{ inputs.activate-environment }}
================================================
FILE: .github/actions/python-only/action.yml
================================================
name: "Install Python only"
description: "Prepare the environment to run simple tasks"
inputs:
python-version:
description: "Python version to install"
default: "3.9"
runs:
using: "composite"
steps:
- uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
architecture: "x64"
cache: 'pip'
================================================
FILE: .github/actions/run-core-tests/action.yml
================================================
name: "Run core Modin tests"
description: "Run core Modin tests like dataframe or groupby"
inputs:
runner:
description: "Runner for tests"
default: "python -m pytest"
parallel:
description: "How to run tests in parallel"
default: "-n 2"
runs:
using: "composite"
steps:
- uses: ./.github/actions/run-core-tests/group_1
with:
runner: ${{ inputs.runner }}
parallel: ${{ inputs.parallel }}
- uses: ./.github/actions/run-core-tests/group_2
with:
runner: ${{ inputs.runner }}
parallel: ${{ inputs.parallel }}
- uses: ./.github/actions/run-core-tests/group_3
with:
runner: ${{ inputs.runner }}
parallel: ${{ inputs.parallel }}
- uses: ./.github/actions/run-core-tests/group_4
with:
runner: ${{ inputs.runner }}
parallel: ${{ inputs.parallel }}
================================================
FILE: .github/actions/run-core-tests/group_1/action.yml
================================================
name: "Run core Modin tests - group 1"
description: "Run core Modin tests like dataframe or groupby"
inputs:
runner:
description: "Runner for tests"
default: "python -m pytest"
parallel:
description: "How to run tests in parallel"
default: "-n 2"
runs:
using: "composite"
steps:
- run: |
echo "::group::Running dataframe tests (group 1)..."
${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_binary.py \
modin/tests/pandas/dataframe/test_default.py \
modin/tests/pandas/dataframe/test_indexing.py \
modin/tests/pandas/dataframe/test_iter.py
echo "::endgroup::"
shell: bash -l {0}
================================================
FILE: .github/actions/run-core-tests/group_2/action.yml
================================================
name: "Run core Modin tests - group 2"
description: "Run core Modin tests like dataframe or groupby"
inputs:
runner:
description: "Runner for tests"
default: "python -m pytest"
parallel:
description: "How to run tests in parallel"
default: "-n 2"
runs:
using: "composite"
steps:
- run: |
echo "::group::Running dataframe tests (group 2)..."
${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_join_sort.py \
modin/tests/pandas/dataframe/test_reduce.py \
modin/tests/pandas/dataframe/test_udf.py \
modin/tests/pandas/dataframe/test_window.py \
modin/tests/pandas/dataframe/test_pickle.py \
modin/tests/pandas/test_repartition.py \
modin/tests/pandas/test_backend.py
echo "::endgroup::"
shell: bash -l {0}
================================================
FILE: .github/actions/run-core-tests/group_3/action.yml
================================================
name: "Run core Modin tests - group 3"
description: "Run core Modin tests like dataframe or groupby"
inputs:
runner:
description: "Runner for tests"
default: "python -m pytest"
parallel:
description: "How to run tests in parallel"
default: "-n 2"
runs:
using: "composite"
steps:
- run: |
echo "::group::Running tests (group 3)..."
${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_series.py \
modin/tests/pandas/dataframe/test_map_metadata.py
echo "::endgroup::"
shell: bash -l {0}
- run: |
echo "::group::Running range-partitioning tests (group 3)..."
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_groupby.py
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_series.py -k "test_unique or test_nunique or drop_duplicates or test_resample"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_general.py -k "test_unique"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_map_metadata.py -k "drop_duplicates"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_join_sort.py -k "merge"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/dataframe/test_default.py -k "resample"
echo "::endgroup::"
shell: bash -l {0}
================================================
FILE: .github/actions/run-core-tests/group_4/action.yml
================================================
name: "Run core Modin tests - group 4"
description: "Run core Modin tests like dataframe or groupby"
inputs:
runner:
description: "Runner for tests"
default: "python -m pytest"
parallel:
description: "How to run tests in parallel"
default: "-n 2"
runs:
using: "composite"
steps:
- run: |
echo "::group::Running tests (group 4)..."
${{ inputs.runner }} ${{ inputs.parallel }} modin/tests/pandas/test_rolling.py \
modin/tests/pandas/test_expanding.py \
modin/tests/pandas/test_groupby.py \
modin/tests/pandas/test_reshape.py \
modin/tests/pandas/test_general.py
echo "::endgroup::"
shell: bash -l {0}
# Concat tests are launched without `inputs.parallel` (see the inline note below).
- run: |
echo "::group::Running concat tests (group 4)..."
${{ inputs.runner }} modin/tests/pandas/test_concat.py # Ray and Dask versions fail with -n 2
echo "::endgroup::"
shell: bash -l {0}
================================================
FILE: .github/actions/upload-coverage/action.yml
================================================
name: Upload Coverage
description: Upload coverage files
runs:
using: "composite"
steps:
# Rename `.coverage` to `.coverage.<uuid>` using a freshly generated UUID, and
# export that UUID as COVERAGE_UUID through $GITHUB_ENV for the step below.
- run: |
COVERAGE_UUID=$(python3 -c "import uuid; print(uuid.uuid4())")
mv .coverage .coverage.${COVERAGE_UUID}
echo "COVERAGE_UUID=${COVERAGE_UUID}" >> $GITHUB_ENV
id: coverage-uuid
shell: bash
# Upload every `.coverage*` file under an artifact name that embeds the UUID.
# `include-hidden-files` is enabled because the `.coverage*` files are dotfiles.
- uses: actions/upload-artifact@v4
with:
name: coverage-data-${{ env.COVERAGE_UUID }}
path: .coverage*
include-hidden-files: true
================================================
FILE: .github/dependabot.yaml
================================================
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
groups:
github-actions:
patterns:
- "*"
================================================
FILE: .github/stale.yml
================================================
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 365
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 7
# Only issues or pull requests with all of these labels are checked for staleness. Defaults to `[]` (disabled)
onlyLabels: []
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
exemptLabels: []
# Set to true to ignore issues in a project (defaults to false)
exemptProjects: false
# Set to true to ignore issues in a milestone (defaults to false)
exemptMilestones: false
# Set to true to ignore issues with an assignee (defaults to false)
exemptAssignees: false
# Label to use when marking as stale
staleLabel: stale
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs within the next
7 days. Thank you for your contributions.
# Comment to post when removing the stale label.
# unmarkComment: >
# Your comment here.
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
Closing as stale.
================================================
FILE: .github/workflows/ci-notebooks.yml
================================================
name: ci-notebooks
on:
pull_request:
paths:
- modin/**
- examples/tutorial/**
- .github/workflows/ci-notebooks.yml
- setup.cfg
- setup.py
- requirements/env_unidist_linux.yml
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
test-tutorial-notebooks:
defaults:
run:
shell: bash -l {0}
name: test tutorial notebooks
runs-on: ubuntu-latest
strategy:
matrix:
execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist]
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
if: matrix.execution != 'pandas_on_unidist'
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_unidist_linux.yml
activate-environment: modin_on_unidist
if: matrix.execution == 'pandas_on_unidist'
- name: Cache datasets
uses: actions/cache@v4
with:
path: taxi.csv
# update cache only if notebooks require it to be changed
key: taxi-csv-dataset-${{ hashFiles('examples/tutorial/jupyter/**') }}
# replace modin with . in the tutorial requirements file for `pandas_on_ray` and
# `pandas_on_dask` since we need Modin built from sources
- run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
if: matrix.execution != 'pandas_on_unidist'
# install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask`
# Override modin-spreadsheet install for now
- run: |
pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
if: matrix.execution != 'pandas_on_unidist'
# Build Modin from sources for `pandas_on_unidist`
- run: pip install -e .
if: matrix.execution == 'pandas_on_unidist'
# install test dependencies
# NOTE: If you are changing the set of packages installed here, make sure that
# the dev requirements match them.
- run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat
if: matrix.execution != 'pandas_on_unidist'
- run: pip install flake8-print jupyter nbformat nbconvert
if: matrix.execution == 'pandas_on_unidist'
- run: pip list
if: matrix.execution != 'pandas_on_unidist'
- run: |
conda info
conda list
if: matrix.execution == 'pandas_on_unidist'
# setup kernel configuration for `pandas_on_unidist` execution with mpi backend
- run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py
if: matrix.execution == 'pandas_on_unidist'
- run: jupyter kernelspec list
- run: |
black --check --diff examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py
black --check --diff examples/tutorial/jupyter/execution/test/utils.py
- run: |
flake8 --enable=T examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py
flake8 --enable=T examples/tutorial/jupyter/execution/test/utils.py
- run: python -m pytest examples/tutorial/jupyter/execution/${{ matrix.execution }}/test/test_notebooks.py
================================================
FILE: .github/workflows/ci-required.yml
================================================
name: ci-required
on: pull_request
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
check-pr-title:
runs-on: ubuntu-latest
steps:
- uses: Slashgear/action-check-pr-title@v4.3.0
with:
# NOTE: If you change the allowed prefixes here, update
# the documentation about them in /docs/development/contributing.rst
regexp: '^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:'
build-docs:
name: build docs
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- uses: actions/setup-python@v5
with:
python-version: "3.9"
architecture: "x64"
cache: "pip"
cache-dependency-path: '**/requirements-doc.txt'
- run: pip install -r docs/requirements-doc.txt
- run: cd docs && sphinx-build -T -E -W -b html . build
lint-pydocstyle:
name: lint (pydocstyle)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
# The `numpydoc` version here MUST match the versions in the dev requirements files.
- run: pip install pytest pytest-cov pydocstyle numpydoc==1.6.0
- run: python -m pytest scripts/test
- run: pip install -e ".[all]"
# Docstring check (numpydoc validation disabled, D101/D102/D103/D105 ignored)
# for the listed modin/pandas modules. The last argument line must not end with
# a line-continuation backslash, since nothing follows it in the script.
- run: |
python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \
modin/pandas/dataframe.py modin/pandas/series.py \
modin/pandas/groupby.py \
modin/pandas/series_utils.py modin/pandas/general.py \
modin/pandas/plotting.py modin/pandas/utils.py \
modin/pandas/iterator.py modin/pandas/indexing.py
- run: python scripts/doc_checker.py modin/core/dataframe
- run: python scripts/doc_checker.py modin/core/execution/dask
- run: |
python scripts/doc_checker.py \
modin/pandas/accessor.py modin/pandas/general.py \
modin/pandas/groupby.py modin/pandas/indexing.py \
modin/pandas/iterator.py modin/pandas/plotting.py \
modin/pandas/series_utils.py modin/pandas/utils.py \
modin/pandas/base.py \
modin/pandas/io.py \
asv_bench/benchmarks/utils \
asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
asv_bench/benchmarks/scalability/__init__.py \
modin/core/io \
modin/pandas/series.py \
modin/core/execution/python \
modin/pandas/dataframe.py \
modin/config/__init__.py \
modin/config/__main__.py \
modin/config/envvars.py \
modin/config/pubsub.py
- run: python scripts/doc_checker.py modin/distributed
- run: python scripts/doc_checker.py modin/utils.py
- run: python scripts/doc_checker.py modin/experimental/sklearn
- run: |
python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \
modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \
modin/experimental/xgboost/xgboost_ray.py
- run: python scripts/doc_checker.py modin/core/execution/ray
# Docstring check for the dispatching factories. The last argument line must not
# end with a line-continuation backslash, since nothing follows it in the script.
- run: |
python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \
modin/core/execution/dispatching/factories/dispatcher.py
- run: python scripts/doc_checker.py scripts/doc_checker.py
- run: |
python scripts/doc_checker.py modin/experimental/pandas/io.py \
modin/experimental/pandas/__init__.py
- run: python scripts/doc_checker.py modin/core/storage_formats/base
- run: python scripts/doc_checker.py modin/core/storage_formats/pandas
- run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
- run: python scripts/doc_checker.py modin/logging
lint-black-isort:
name: lint (black and isort)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
# Quote the requirement specifiers: unquoted, bash parses `>=24.1.0` and `>=5.12`
# as output redirections, so the version constraints are silently dropped.
- run: pip install "black>=24.1.0" "isort>=5.12"
# NOTE: keep the black command here in sync with the pre-commit hook in
# /contributing/pre-commit
- run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
- run: isort . --check-only
================================================
FILE: .github/workflows/ci.yml
================================================
name: ci
on:
pull_request:
paths:
# NOTE: keep these paths in sync with the paths that trigger the
# fuzzydata Github Actions in .github/workflows/fuzzydata-test.yml
- .github/workflows/**
- .github/actions/**
- '!.github/workflows/push-to-main.yml'
- asv_bench/**
- modin/**
- requirements/**
- scripts/**
- environment-dev.yml
- requirements-dev.txt
- setup.cfg
- setup.py
- versioneer.py
push:
schedule:
- cron: "30 2 * * WED"
- cron: "30 2 * * THU"
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
# Pick the Python version exposed as the `python-version` job output: the WED
# cron schedule selects 3.10, the THU cron schedule selects 3.11, and any other
# trigger selects 3.9.
python-filter:
runs-on: ubuntu-latest
outputs:
python-version: ${{ steps.choose.outputs.python-version }}
steps:
- id: choose
run: |
if [[ "${{ github.event.schedule }}" = "30 2 * * WED" ]]
then
echo "python-version=3.10" >> "$GITHUB_OUTPUT"
elif [[ "${{ github.event.schedule }}" = "30 2 * * THU" ]]
then
echo "python-version=3.11" >> "$GITHUB_OUTPUT"
else
echo "python-version=3.9" >> "$GITHUB_OUTPUT"
fi
lint-mypy:
needs: [python-filter]
name: lint (mypy)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
with:
python-version: ${{ needs.python-filter.outputs.python-version }}
- run: pip install -r requirements-dev.txt
- run: mypy --config-file mypy.ini
lint-flake8:
needs: [python-filter]
name: lint (flake8)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
with:
python-version: ${{ needs.python-filter.outputs.python-version }}
# NOTE: If you are changing the set of packages installed here, make sure that
# the dev requirements match them.
- run: pip install flake8 flake8-print flake8-no-implicit-concat
# NOTE: keep the flake8 command here in sync with the pre-commit hook in
# /contributing/pre-commit
- run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py
test-api-and-no-engine:
needs: [python-filter]
name: Test API, headers and no-engine mode
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/requirements-no-engine.yml
python-version: ${{ needs.python-filter.outputs.python-version }}
- run: python -m pytest modin/tests/pandas/test_api.py
- run: python -m pytest modin/tests/test_executions_api.py
- run: python -m pytest modin/tests/test_headers.py
- run: python -m pytest modin/tests/core/test_dispatcher.py::test_add_option
- uses: ./.github/actions/upload-coverage
test-clean-install:
needs: [lint-flake8, python-filter]
strategy:
matrix:
os:
- ubuntu
- windows
runs-on: ${{ matrix.os }}-latest
defaults:
run:
shell: bash -l {0}
name: test-clean-install-${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
with:
python-version: ${{ needs.python-filter.outputs.python-version }}
- run: python -m pip install -e ".[all]"
- name: Ensure Ray and Dask engines start up
run: |
MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
- name: Ensure MPI engine start up
# Install a working MPI implementation beforehand so mpi4py can link to it
run: |
sudo apt-get update
sudo apt-get install software-properties-common
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu jammy main universe restricted multiverse"
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu jammy-updates main universe restricted multiverse"
sudo add-apt-repository "deb http://security.ubuntu.com/ubuntu jammy-security main universe restricted multiverse"
sudo apt-get update
sudo apt-get install libmpich-dev=4.0-3 libmpich12=4.0-3 mpich=4.0-3
python -m pip install -e ".[mpi]"
# mpi4py 4.1 does not work with the mpich versions above.
# TODO(https://github.com/modin-project/modin/issues/7615): figure out
# the correct libmpich versions for mpi4py >= 4.1
python -m pip install "mpi4py<4.1"
MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
if: matrix.os == 'ubuntu'
test-internals:
needs: [lint-flake8, python-filter]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
name: test-internals
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{ needs.python-filter.outputs.python-version }}
- name: Internals tests
run: python -m pytest modin/tests/core/test_dispatcher.py
- run: python -m pytest modin/tests/config
- run: python -m pytest modin/tests/test_envvar_catcher.py
- run: python -m pytest modin/tests/core/storage_formats/base/test_internals.py
- run: python -m pytest modin/tests/core/storage_formats/pandas/test_internals.py
- run: python -m pytest modin/tests/test_envvar_npartitions.py
- run: python -m pytest modin/tests/test_utils.py
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/tests/interchange/dataframe_protocol/base
- run: python -m pytest modin/tests/test_dataframe_api_standard.py
- run: python -m pytest modin/tests/test_logging.py
- run: python -m pytest modin/tests/test_metrics.py
- run: python -m pytest modin/tests/pandas/extensions
- uses: ./.github/actions/upload-coverage
test-defaults:
needs: [lint-flake8, python-filter]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
execution: [BaseOnPython]
env:
MODIN_TEST_DATASET_SIZE: "small"
name: Test ${{ matrix.execution }} execution, Python ${{ needs.python-filter.outputs.python-version }}
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{ needs.python-filter.outputs.python-version }}
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- name: xgboost tests
run: |
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
# Per the thread https://github.com/conda-forge/miniforge/issues/513,
# remove unused conda packages and caches to avoid `Found incorrect
# download: joblib` error from mamba.
mamba clean --all
mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge
python -m pytest modin/tests/experimental/xgboost/test_default.py --execution=${{ matrix.execution }}
- run: python -m pytest -n 2 modin/tests/core/storage_formats/base/test_internals.py --execution=${{ matrix.execution }}
- uses: ./.github/actions/run-core-tests
with:
runner: python -m pytest --execution=${{ matrix.execution }}
- uses: ./.github/actions/upload-coverage
test-asv-benchmarks:
if: github.event_name == 'pull_request'
needs: [lint-flake8]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
env:
MODIN_ENGINE: ray
MODIN_MEMORY: 1000000000
MODIN_TEST_DATASET_SIZE: small
name: test-asv-benchmarks
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- uses: conda-incubator/setup-miniconda@v3
with:
auto-activate-base: true
activate-environment: ""
miniforge-variant: Miniforge3
miniforge-version: latest
use-mamba: true
- name: Running benchmarks
run: |
git remote add upstream https://github.com/modin-project/modin.git
git fetch upstream
if git diff upstream/main --name-only | grep -q "^asv_bench/"; then
cd asv_bench
mamba env create -f ../environment-dev.yml
conda activate modin
pip install ..
asv machine --yes
# check Modin on Ray
asv run --quick --dry-run --python=same --strict --show-stderr --launch-method=spawn \
-b ^benchmarks -b ^io -b ^scalability | tee benchmarks.log
# check pure pandas; append (`tee -a`) so the Modin on Ray output above is kept in the log
MODIN_ASV_USE_IMPL=pandas asv run --quick --dry-run --python=same --strict --show-stderr --launch-method=spawn \
-b ^benchmarks -b ^io | tee -a benchmarks.log
else
echo "Benchmarks did not run, no changes detected"
fi
if: always()
- name: Publish benchmarks artifact
uses: actions/upload-artifact@v4
with:
name: Benchmarks log
path: asv_bench/benchmarks.log
include-hidden-files: true
if: failure()
execution-filter:
# Choose which executions we want to run all tests for on a pull request.
# We always test 'native' and 'python' executions completely because they
# are fast, but we only test ray, dask, and unidist, if we think this pull
# request is affecting how we execute with those engines specifically.
runs-on: ubuntu-latest
outputs:
ray: ${{ steps.filter.outputs.ray }}
dask: ${{ steps.filter.outputs.dask }}
unidist: ${{ steps.filter.outputs.unidist }}
engines: ${{ steps.engines.outputs.engines }}
# `experimental` is one of the paths-filter filters, so it comes from the
# `filter` step; this job has no step with id `experimental`.
experimental: ${{ steps.filter.outputs.experimental }}
steps:
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
shared: &shared
- 'modin/core/execution/dispatching/**'
ray:
- *shared
- 'modin/core/execution/ray/**'
dask:
- *shared
- 'modin/core/execution/dask/**'
unidist:
- *shared
- 'modin/core/execution/unidist/**'
experimental:
- 'modin/experimental/**'
- uses: actions/setup-python@v5
- id: engines
run: |
python -c "import sys, json; print('engines=' + json.dumps(['python', 'native'] + (sys.argv[1] == 'true' and ['ray'] or []) + (sys.argv[2] == 'true' and ['dask'] or []) ))" \
"${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
test-all-unidist:
needs: [lint-flake8, execution-filter, python-filter]
if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
unidist-backend: ["mpi"]
env:
MODIN_ENGINE: "Unidist"
UNIDIST_BACKEND: ${{matrix.unidist-backend}}
# Only test reading from SQL server and postgres on ubuntu for now.
# Eventually, we should test on Windows, too, but we will have to set up
# the servers differently.
MODIN_TEST_READ_FROM_SQL_SERVER: true
MODIN_TEST_READ_FROM_POSTGRES: true
name: test-ubuntu (engine unidist ${{matrix.unidist-backend}}, python ${{matrix.python-version}})
services:
moto:
image: motoserver/moto:5.0.13
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_unidist_linux.yml
activate-environment: modin_on_unidist
python-version: ${{matrix.python-version}}
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- name: Set up postgres
# Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. This solution is from
# https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3
run: |
sudo docker pull postgres
sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres
- run: mpiexec -n 1 python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py
- run: mpiexec -n 1 python -m pytest modin/tests/test_partition_api.py
- uses: ./.github/actions/run-core-tests
with:
runner: mpiexec -n 1 python -m pytest
parallel: ""
- run: mpiexec -n 1 python -m pytest modin/tests/numpy
- run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
# need an extra argument "genv" to set environment variables for mpiexec. We need
# these variables to test writing to the mock s3 filesystem.
- uses: nick-fields/retry@v3
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: |
conda run --no-capture-output -n modin_on_unidist mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key \
-genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/tests/pandas/test_io.py --verbose
- run: |
mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \
python -m pytest modin/tests/experimental/test_io_exp.py
- run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py
- run: mpiexec -n 1 python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py
- run: |
python -m pip install lazy_import
mpiexec -n 1 python -m pytest modin/tests/pandas/integrations/
- uses: ./.github/actions/upload-coverage
test-all:
needs: [lint-flake8, execution-filter, python-filter]
strategy:
matrix:
os:
- ubuntu
- windows
python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
# On push, run the tests for all engines. Otherwise, for pull requests,
# only run tests for engines that depend on files changed in this PR.
engine: ${{ fromJSON( (github.event_name == 'push' && '["python", "ray", "dask", "native"]') || needs.execution-filter.outputs.engines ) }}
test_task:
- group_1
- group_2
- group_3
- group_4
exclude: # python and native engines only have one task group that contains all the tests
- engine: "python"
test_task: "group_2"
- engine: "native"
test_task: "group_2"
- engine: "python"
test_task: "group_3"
- engine: "native"
test_task: "group_3"
- engine: "python"
test_task: "group_4"
- engine: "native"
test_task: "group_4"
runs-on: ${{ matrix.os }}-latest
defaults:
run:
shell: bash -l {0}
env:
MODIN_ENGINE: ${{matrix.engine}}
# Only test reading from SQL server and postgres on ubuntu for now.
# Eventually, we should test on Windows, too, but we will have to set up
# the servers differently.
MODIN_TEST_READ_FROM_SQL_SERVER: ${{ matrix.os == 'ubuntu' }}
MODIN_TEST_READ_FROM_POSTGRES: ${{ matrix.os == 'ubuntu' }}
name: test-${{ matrix.os }} (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}})
services:
# Using workaround https://github.com/actions/runner/issues/822#issuecomment-1524826092
moto:
# we only need moto service on Ubuntu and for group_4 task, or for native or python engine.
image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4')) && 'motoserver/moto:5.0.13' || '' }}
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- name: Set native storage format
run: echo "MODIN_STORAGE_FORMAT=Native" >> $GITHUB_ENV
if: matrix.engine == 'native'
- name: Limit ray memory
run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV
if: matrix.os == 'ubuntu' && matrix.engine == 'ray'
- name: Tell Modin to use existing ray cluster
run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV
if: matrix.os == 'windows' && matrix.engine == 'ray'
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- name: Start local ray cluster
# Try a few times to start ray to work around
# https://github.com/modin-project/modin/issues/4562
uses: nick-fields/retry@v3
with:
timeout_minutes: 5
max_attempts: 5
command: ray start --head --port=6379 --object-store-memory=1000000000
if: matrix.os == 'windows' && matrix.engine == 'ray'
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
if: matrix.os == 'ubuntu'
- name: Set up postgres
# Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. This solution is from
# https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3
run: |
sudo docker pull postgres
sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres
if: matrix.os == 'ubuntu'
# BEGIN partitioned execution tests. We run these tests along with group 1,
# or if we are on the "python" engine, which only has a single group. We
# skip these tests on the "native" engine, which does not use partitions.
- run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py
if: matrix.engine != 'native' && (matrix.engine == 'python' || matrix.test_task == 'group_1')
- run: python -m pytest modin/tests/test_partition_api.py
# Skip this test for python because we do not define unwrap_partitions()
# for python execution.
if: matrix.engine != 'native' && matrix.engine != 'python' && matrix.test_task == 'group_1'
- name: xgboost tests
run: |
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge
python -m pytest -n 2 \
modin/tests/experimental/xgboost/test_default.py \
modin/tests/experimental/xgboost/test_xgboost.py \
modin/tests/experimental/xgboost/test_dmatrix.py
if: matrix.engine != 'native' && matrix.os != 'windows' && (matrix.engine == 'python' || matrix.test_task == 'group_1')
- run: python -m pytest -n 2 modin/tests/experimental/test_pipeline.py
if: matrix.engine != 'native' && (matrix.engine == 'python' || matrix.test_task == 'group_1')
# END partitioned execution tests.
# BEGIN test groups.
# Run all the tests in the corresponding group for this instance of the
# test matrix. For example, if we are in the matrix's 'group_4', run the
# tests for 'group_4'. For each of 'native' and 'python' engines, we run
# all tests in a single job, so we ignore the grouping.
- uses: ./.github/actions/run-core-tests/group_1
with:
# When running with Ray engine on Windows using 2 pytest workers tests are failing in CI.
# See https://github.com/modin-project/modin/issues/7387.
parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_1'
- uses: ./.github/actions/run-core-tests/group_2
with:
# When running with Ray engine on Windows using 2 pytest workers tests are failing in CI.
# See https://github.com/modin-project/modin/issues/7387.
parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_2'
- uses: ./.github/actions/run-core-tests/group_3
with:
# When running with Ray engine on Windows using 2 pytest workers tests are failing in CI.
# See https://github.com/modin-project/modin/issues/7387.
parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_3'
- uses: ./.github/actions/run-core-tests/group_4
with:
# When running with Ray engine on Windows using 2 pytest workers tests are failing in CI.
# See https://github.com/modin-project/modin/issues/7387.
parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
- run: python -m pytest -n 2 modin/tests/numpy
# Native execution does not support the modin Numpy API.
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
# END test groups.
# BEGIN some tests that we run along with group 4 for engines other than
# 'native' and 'python'. 'native' and 'python' jobs will run these tests
# along with all other tests in a single group.
- run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh
if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4')
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4')
# Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail.
- run: python -m pytest modin/tests/pandas/test_io.py --verbose
timeout-minutes: 60
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
- run: python -m pytest modin/tests/experimental/test_io_exp.py
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
- run: python -m pytest modin/tests/interchange/dataframe_protocol/test_general.py
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
- run: python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
- run: python -m pytest modin/tests/polars/test_dataframe.py
- run: |
python -m pip install lazy_import
python -m pytest modin/tests/pandas/integrations/
if: matrix.engine == 'python' || matrix.engine == 'native' || matrix.test_task == 'group_4'
# END tests that run on group 4, or in the single group for 'native' and
# 'python' engines.
- uses: ./.github/actions/upload-coverage
- name: Stop local ray cluster
run: ray stop
if: matrix.os == 'windows' && matrix.engine == 'ray'
test-sanity:
# The "sanity" tests run on each pull request to test that a subset of the
# full tests work with the slower engines (ray, dask, and unidist-MPI).
needs: [lint-flake8, execution-filter, python-filter]
# If we don't need to run any sanity tests, the job matrix that we generate
# here gives a single job with all the matrix fields empty (that is, os,
# execution, etc. are not set, so we treat them as "").
# So, if the matrix is going to be empty, we need to skip this job
# completely. This bizarre behavior is not in the official documentation
# of GitHub Actions matrices, but someone does mention it here:
# https://stackoverflow.com/a/77118991
#
# The condition below therefore runs the job only for pull requests, and only
# when at least one of the ray / dask / unidist outputs of `execution-filter`
# is not 'true' (i.e. at least one engine survives the `exclude` list below).
if: |
github.event_name == 'pull_request' &&
(
needs.execution-filter.outputs.ray != 'true' ||
needs.execution-filter.outputs.dask != 'true' ||
needs.execution-filter.outputs.unidist != 'true'
)
strategy:
matrix:
os:
- ubuntu
- windows
python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
# The three single-valued `running-all-*-tests` dimensions exist only so that
# the `exclude` entries below can match on them.
running-all-ray-tests: [ "${{ needs.execution-filter.outputs.ray }}" ]
running-all-dask-tests: [ "${{needs.execution-filter.outputs.dask}}" ]
running-all-unidist-tests: [ "${{needs.execution-filter.outputs.unidist}}" ]
execution: [ray, dask, unidist]
# If we're going to run all ray tests because we've detected a
# change to the ray engine, we don't need to run these sanity tests
# on ray. Likewise for dask and unidist.
exclude:
- running-all-ray-tests: 'true'
execution: ray
- running-all-dask-tests: 'true'
execution: dask
- running-all-unidist-tests: 'true'
execution: unidist
runs-on: ${{ matrix.os }}-latest
defaults:
run:
shell: bash -l {0}
env:
MODIN_ENGINE: ${{ matrix.execution }}
UNIDIST_BACKEND: "mpi"
# `-n 2` for every engine except unidist on non-Windows runners; empty otherwise.
PARALLEL: ${{ matrix.execution != 'unidist' && matrix.os != 'windows' && '-n 2' || '' }}
# ray and dask invoke pytest directly; unidist launches pytest through
# `mpiexec -n 1`, passing the dummy AWS credentials with `-genv`. Any other
# value of `matrix.execution` yields the sentinel 'UNKNOWN_PYTEST_COMMAND'.
PYTEST_COMMAND: >-
${{
(
(matrix.execution == 'ray' || matrix.execution == 'dask') &&
'python -m pytest'
) ||
(
matrix.execution == 'unidist' &&
'mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest'
) ||
'UNKNOWN_PYTEST_COMMAND'
}}
name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution }}, python ${{matrix.python-version}})
services:
moto:
# The image expression evaluates to the empty string on Windows.
# NOTE(review): presumably this is what keeps the service container from
# being started on Windows runners - confirm.
image: ${{ matrix.os != 'windows' && 'motoserver/moto:5.0.13' || '' }}
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
# unidist uses an OS-specific environment file and the `modin_on_unidist`
# environment; every other engine uses environment-dev.yml and `modin`.
environment-file: ${{ matrix.os == 'ubuntu' && matrix.execution == 'unidist' && 'requirements/env_unidist_linux.yml' || matrix.os == 'windows' && matrix.execution == 'unidist' && 'requirements/env_unidist_win.yml' || 'environment-dev.yml' }}
activate-environment: ${{ matrix.execution == 'unidist' && 'modin_on_unidist' || 'modin' }}
python-version: ${{matrix.python-version}}
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
if: matrix.os != 'windows'
- name: Limit ray memory
run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV
if: matrix.os != 'windows' && matrix.execution == 'ray'
# On Windows + ray, a local cluster is started explicitly (next two steps) and
# stopped again by the "Stop local ray cluster" step near the end of the job.
- name: Tell Modin to use existing ray cluster
run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV
if: matrix.os == 'windows' && matrix.execution == 'ray'
- name: Start local ray cluster
# Try a few times to start ray to work around
# https://github.com/modin-project/modin/issues/4562
uses: nick-fields/retry@v3
with:
timeout_minutes: 5
max_attempts: 5
command: ray start --head --port=6379 --object-store-memory=1000000000
if: matrix.os == 'windows' && matrix.execution == 'ray'
- run: MODIN_BENCHMARK_MODE=True $PYTEST_COMMAND modin/tests/pandas/internals/test_benchmark_mode.py
- run: $PYTEST_COMMAND $PARALLEL modin/tests/test_partition_api.py
- run: $PYTEST_COMMAND modin/tests/pandas/extensions
- name: xgboost tests
run: |
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge
$PYTEST_COMMAND $PARALLEL \
modin/tests/experimental/xgboost/test_default.py \
modin/tests/experimental/xgboost/test_xgboost.py \
modin/tests/experimental/xgboost/test_dmatrix.py
if: matrix.os != 'windows' && needs.execution-filter.outputs.experimental == 'true'
- run: $PYTEST_COMMAND $PARALLEL modin/tests/experimental/test_pipeline.py
if: matrix.os != 'windows' && matrix.execution != 'unidist' && needs.execution-filter.outputs.experimental == 'true'
- name: "test DF: binary, default, iter"
run: |
$PYTEST_COMMAND $PARALLEL \
modin/tests/pandas/dataframe/test_binary.py \
modin/tests/pandas/dataframe/test_default.py \
modin/tests/pandas/dataframe/test_iter.py
if: matrix.os != 'windows'
- name: "test DF: reduce, udf, window, pickle"
run: |
$PYTEST_COMMAND $PARALLEL \
modin/tests/pandas/dataframe/test_reduce.py \
modin/tests/pandas/dataframe/test_udf.py \
modin/tests/pandas/dataframe/test_window.py \
modin/tests/pandas/dataframe/test_pickle.py
if: matrix.os != 'windows'
# test_series.py and test_map_metadata.py: ray runs the whole module, while the
# other engines deselect tests carrying the `exclude_in_sanity` marker.
- run: $PYTEST_COMMAND modin/tests/pandas/test_series.py
if: matrix.execution == 'ray'
- run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_series.py
if: matrix.execution != 'ray'
- run: $PYTEST_COMMAND modin/tests/pandas/dataframe/test_map_metadata.py
if: matrix.execution == 'ray'
- run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/dataframe/test_map_metadata.py
if: matrix.execution != 'ray'
- name: "test rolling, expanding, reshape, general, concat"
run: |
$PYTEST_COMMAND $PARALLEL \
modin/tests/pandas/test_rolling.py \
modin/tests/pandas/test_expanding.py \
modin/tests/pandas/test_reshape.py \
modin/tests/pandas/test_general.py \
modin/tests/pandas/test_concat.py
if: matrix.os != 'windows'
- run: $PYTEST_COMMAND $PARALLEL modin/tests/numpy
# test_io.py: run directly for ray/dask; for unidist the same pytest invocation
# is wrapped in a retry action and executed through `conda run`.
- run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose
if: matrix.execution != 'unidist'
- uses: nick-fields/retry@v3
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: conda run --no-capture-output -n modin_on_unidist $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose
if: matrix.execution == 'unidist'
- run: $PYTEST_COMMAND modin/tests/experimental/test_io_exp.py
- run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/test_general.py
- run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py
- name: Stop local ray cluster
run: ray stop
if: matrix.os == 'windows' && matrix.execution == 'ray'
- uses: ./.github/actions/upload-coverage
test-experimental:
# Runs a few pandas test modules on the python engine with
# MODIN_EXPERIMENTAL set to "True" (see `env` below).
needs: [lint-flake8, python-filter]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
env:
MODIN_ENGINE: "python"
MODIN_EXPERIMENTAL: "True"
name: test experimental
services:
# moto server container published on port 5000 with dummy AWS credentials;
# presumably the mock S3 service referred to in the test_io.py note below.
moto:
image: motoserver/moto:5.0.13
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{ needs.python-filter.outputs.python-version }}
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest -n 2 modin/tests/pandas/dataframe/test_map_metadata.py
- run: python -m pytest -n 2 modin/tests/pandas/test_series.py
# Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail.
- run: python -m pytest modin/tests/pandas/test_io.py --verbose
- uses: ./.github/actions/upload-coverage
test-spreadsheet:
# Runs the experimental spreadsheet tests once per engine in the matrix
# (ray and dask) with MODIN_EXPERIMENTAL set to "True".
needs: [lint-flake8, python-filter]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
# Single Python version, taken from the `python-filter` job's output.
python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
engine: ["ray", "dask"]
env:
MODIN_EXPERIMENTAL: "True"
MODIN_ENGINE: ${{matrix.engine}}
name: test-spreadsheet (engine ${{matrix.engine}}, python ${{matrix.python-version}})
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
# Note: unlike most other test jobs in this workflow, this job has no
# upload-coverage step.
- run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py
test-native-dataframe-interoperability:
# Runs the native_df_interoperability test directory on the Python engine
# (see the MODIN_ENGINE note under `env`), for push and pull_request events.
needs: [lint-flake8]
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' }}
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.9"]
env:
# Test interoperability between PandasOnPython dataframes/series and
# native dataframes/series.
MODIN_ENGINE: "Python"
# Display name follows the "(… python X)" pattern used by the other test
# jobs; keep the parentheses balanced.
name: test-native-dataframe-interoperability (python ${{matrix.python-version}})
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- run: python -m pytest modin/tests/pandas/native_df_interoperability/ -n 2
- uses: ./.github/actions/upload-coverage
merge-coverage-artifacts:
# Merges the per-job coverage artifacts (names matching `coverage-data-*`)
# into a single artifact named `coverage-data`.
needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity, test-native-dataframe-interoperability]
if: always() # we need to run it regardless of some job being skipped, like in PR
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- name: Merge Artifacts
uses: actions/upload-artifact/merge@v4
with:
name: coverage-data
pattern: coverage-data-*
# Hidden files are included explicitly.
# NOTE(review): presumably because coverage data files are dot-files
# (`.coverage*`) - confirm against the upload-coverage action.
include-hidden-files: true
# Remove the individual artifacts once they have been merged.
delete-merged: true
upload-coverage:
# Downloads the merged `coverage-data` artifact, combines it with
# `coverage combine`, renders an XML report and uploads it with the
# Codecov action.
needs: [merge-coverage-artifacts, python-filter]
if: always() # we need to run it regardless of some job being skipped, like in PR
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/python-only
with:
python-version: ${{ needs.python-filter.outputs.python-version }}
- name: Download coverage data
uses: actions/download-artifact@v4
with:
# Must match the artifact name produced by `merge-coverage-artifacts`.
name: coverage-data
- run: pip install coverage
- name: Combine coverage
run: python -m coverage combine
- name: Generate coverage report in xml format
run: python -m coverage xml
- uses: codecov/codecov-action@v4
with:
# Upload errors fail the job only for `push` events.
fail_ci_if_error: ${{ github.event_name == 'push' }} # do not care about uploads in PR
token: ${{ secrets.CODECOV_TOKEN }} # this token is available at https://app.codecov.io/account/github/modin-project/
================================================
FILE: .github/workflows/codeql/codeql-config.yml
================================================
name: "Modin CodeQL config"
paths:
- modin/**
================================================
FILE: .github/workflows/codeql.yml
================================================
name: "CodeQL"
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
analyze:
# Runs CodeQL (init -> autobuild -> analyze) once per language in the matrix;
# currently only `python`.
name: Analyze
runs-on: ubuntu-latest
# Read access to actions and contents, write access to security events.
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: [ python ]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
# `+` adds the security-and-quality suite on top of the default queries.
queries: +security-and-quality
# Repository-local CodeQL configuration file.
config-file: ./.github/workflows/codeql/codeql-config.yml
- name: Autobuild
uses: github/codeql-action/autobuild@v3
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
with:
category: "/language:${{ matrix.language }}"
================================================
FILE: .github/workflows/fuzzydata-test.yml
================================================
name: fuzzy
on:
pull_request:
paths:
# NOTE: keep these paths in sync with the paths that trigger the CI Github
# Actions in .github/workflows/ci.yml
- .github/workflows/**
- '!.github/workflows/push-to-main.yml'
- asv_bench/**
- modin/**
- requirements/**
- scripts/**
- environment-dev.yml
- requirements-dev.txt
- setup.cfg
- setup.py
- versioneer.py
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
test-fuzzydata:
# Runs the fuzzydata test module once per engine (ray, dask) and uploads the
# contents of the per-engine /tmp output directory as an artifact.
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.9"]
engine: ["ray", "dask"]
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
# UserWarnings are ignored and the pytest log is written (at INFO level) into
# the same per-engine directory that is uploaded below.
- name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}})
run: python -m pytest modin/tests/experimental/test_fuzzydata.py -Wignore::UserWarning --log-file=/tmp/fuzzydata-test-wf-${{matrix.engine}}/run.log --log-file-level=INFO
env:
MODIN_ENGINE: ${{matrix.engine}}
- uses: actions/upload-artifact@v4
# Upload whether the test step passed or failed (but not if the run was cancelled).
if: success() || failure()
with:
name: fuzzydata-test-workflow-${{matrix.engine}}
path: /tmp/fuzzydata-test-wf-${{matrix.engine}}/* # Must match output dir in test_fuzzydata.py
# Fail the step if the directory produced no files.
if-no-files-found: error
include-hidden-files: true
================================================
FILE: .github/workflows/publish-to-pypi.yml
================================================
name: Publish Modin wheel to PyPI
on:
schedule:
- cron: "42 0 * * WED"
push:
tags:
- '*'
workflow_dispatch:
jobs:
build-n-publish:
# Builds the sdist and wheel on every trigger of this workflow and uploads
# them as a workflow artifact; the PyPI publish step runs only for `push`
# events.
name: Build and publish Modin wheel to PyPI
environment: release
runs-on: ubuntu-latest
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- uses: actions/checkout@v4
with:
# Full history and tags are fetched; the next step relies on the tags.
fetch-depth: 0
fetch-tags: true
# On push events, check out the tag described from the most recent tagged commit.
- name: Checkout latest git tag
run: git checkout $(git describe --tags "$(git rev-list --tags --max-count=1)")
if: github.event_name == 'push'
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.9.x"
- name: Install/update tools
run: python3 -m pip install --upgrade build wheel
- name: Build a pure Python wheel
run: python3 setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v4
with:
name: modin-wheel-and-source-tarball
path: ./dist/
include-hidden-files: true
# Not executed for scheduled or manually dispatched runs.
- name: Publish Modin wheel to PyPI
if: github.event_name == 'push'
uses: pypa/gh-action-pypi-publish@release/v1
================================================
FILE: .github/workflows/push-to-main.yml
================================================
name: push-to-main
on:
push:
branches:
- main
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
jobs:
test-ray-master:
# Installs a Ray nightly wheel over the dev environment and runs a selection
# of Modin tests against it: first the modules that are run with `-n 2`, then
# the IO modules without parallelism.
runs-on: ubuntu-latest
defaults:
run:
# `shell: bash -l {0}` - special way to activate modin environment
shell: bash -l {0}
services:
# moto server container published on port 5000 with dummy AWS credentials.
moto:
image: motoserver/moto:5.0.13
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
- name: install Ray nightly build
# Use --force-reinstall to always reinstall ray and its dependencies.
# botocore isn't compatible with urllib3>=2; see #6094 for details
# NOTE(review): the wheel URL is pinned to cp39 / manylinux x86_64, so the
# environment's Python must be 3.9 - confirm against the mamba-env default.
run: pip install --force-reinstall "urllib3<2" https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl
# Print environment details to the job log.
- name: Conda environment
run: |
conda info
conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- name: Run parallelizable Modin Tests
run: >
python -m pytest -n 2
modin/tests/pandas/dataframe/test_binary.py
modin/tests/pandas/dataframe/test_default.py
modin/tests/pandas/dataframe/test_indexing.py
modin/tests/pandas/dataframe/test_iter.py
modin/tests/pandas/dataframe/test_join_sort.py
modin/tests/pandas/dataframe/test_map_metadata.py
modin/tests/pandas/dataframe/test_reduce.py
modin/tests/pandas/dataframe/test_udf.py
modin/tests/pandas/dataframe/test_window.py
modin/tests/pandas/test_series.py
modin/tests/numpy/test_array.py
modin/tests/numpy/test_array_creation.py
modin/tests/numpy/test_array_arithmetic.py
modin/tests/numpy/test_array_axis_functions.py
modin/tests/numpy/test_array_logic.py
modin/tests/numpy/test_array_linalg.py
modin/tests/numpy/test_array_indexing.py
modin/tests/numpy/test_array_math.py
modin/tests/numpy/test_array_shaping.py
modin/tests/pandas/test_rolling.py
modin/tests/pandas/test_expanding.py
modin/tests/pandas/test_concat.py
modin/tests/pandas/test_groupby.py
modin/tests/pandas/test_reshape.py
modin/tests/pandas/test_general.py
- name: Run non-parallelizable Modin Tests
run: >
python -m pytest
modin/tests/pandas/test_io.py
modin/tests/experimental/test_io_exp.py
test-docs:
# Runs the docstring URL test module (modin/tests/test_docstring_urls.py)
# in the dev environment.
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
name: test docs
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
- run: sudo apt update && sudo apt install -y libhdf5-dev
- name: Docstring URL validity check
run: python -m pytest modin/tests/test_docstring_urls.py
================================================
FILE: .github/workflows/sql_server/set_up_sql_server.sh
================================================
#!/usr/bin/env bash
# This script sets up a SQL server listening at 0.0.0.0:1433 by running the
# SQL Server 2019 image in a detached docker container named
# `example_sql_server`, with container port 1433 published on host port 1433.
# If any step fails, we can't set up a valid SQL server for unit tests.
set -e
# Pull the 2019 SQL server docker container image by following:
# https://docs.microsoft.com/en-us/sql/linux/quickstart-install-connect-docker?view=sql-server-ver15&pivots=cs1-powershell#pullandrun2019
sudo docker pull mcr.microsoft.com/mssql/server:2019-latest
sudo docker run -d --name example_sql_server -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=Strong.Pwd-123' -p 1433:1433 mcr.microsoft.com/mssql/server:2019-latest
# Wait 10 seconds because if we don't the server typically will not be ready
# to accept connections by the time we want to make them.
sleep 10
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
scripts/gh-users-cache.json
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
docs/flow/modin/configs_help.csv
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
*.DS_Store
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# vscode settings
.vscode/
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Cscope and Tags
tags
cscope.files
cscope.out
# PYTest Benchmarks
.benchmarks/
# Dask workspace
dask-worker-space/
node_modules
# Asv stuff
asv_bench/.asv/
asv_bench/modin/
# Sublime stuff
*.sublime-workspace
*.sublime-project
================================================
FILE: .readthedocs.yaml
================================================
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-20.04
tools:
python: "3.9"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
formats: all
python:
install:
- requirements: docs/requirements-doc.txt
================================================
FILE: CODEOWNERS
================================================
# These owners will be the default owners for everything in
# the repo unless a later match takes precedence,
* @modin-project/modin-core @devin-petersohn @mvashishtha @RehanSD @YarShev @vnlitvinov @anmyachev @dchigarev
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at conduct@gr-oss.io. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Certain code used and distributed in this package is forked from pandas
# (https://github.com/pandas-dev/pandas). The pandas LICENSE
# below applies to those certain forked components in this project:
BSD 3-Clause License
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
Copyright (c) 2011-2025, Open source contributors.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: LICENSE_HEADER
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include versioneer.py
include modin/_version.py
include modin/tests/pandas/data/*.csv
================================================
FILE: NOTICE
================================================
Modin
Copyright (c) 2018-2024 Modin Developers.
================================================
FILE: README.md
================================================
<p align="center"><a href="https://modin.readthedocs.io"><img width=77% alt="" src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/MODIN_ver2_hrz.png?raw=true"></a></p>
<h2 align="center">Scale your pandas workflows by changing one line of code</h2>
<div align="center">
| <h3>Dev Community & Support</h3> | <h3>Forums</h3> | <h3>Socials</h3> | <h3>Docs</h3> |
|:---: | :---: | :---: | :---: |
| [Slack](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA) | [Stack Overflow](https://stackoverflow.com/questions/tagged/modin) | <a href="https://twitter.com/modin_project"><img alt="Twitter Follow" src="https://img.shields.io/twitter/follow/modin_project?style=social" height=28 align="center"></a> | <a href="https://modin.readthedocs.io/en/latest/?badge=latest"><img alt="" src="https://readthedocs.org/projects/modin/badge/?version=latest" height=28 align="center"></a> |
</div>
<p align="center">
<a href="https://pepy.tech/project/modin"><img src="https://static.pepy.tech/personalized-badge/modin?period=total&units=international_system&left_color=black&right_color=blue&left_text=Downloads" align="center"></a>
<a href="https://codecov.io/gh/modin-project/modin"><img src="https://codecov.io/gh/modin-project/modin/branch/main/graph/badge.svg" align="center"/></a>
<a href="https://github.com/modin-project/modin/actions/workflows/push-to-main.yml?query=event%3Apush"><img src="https://github.com/modin-project/modin/actions/workflows/push-to-main.yml/badge.svg?branch=main" align="center"></a>
<a href="https://github.com/modin-project/modin/actions/workflows/ci.yml?query=event%3Apush"><img src="https://github.com/modin-project/modin/actions/workflows/ci.yml/badge.svg?branch=main" align="center"></a>
<a href="https://pypi.org/project/modin/"><img src="https://badge.fury.io/py/modin.svg" alt="PyPI version" align="center"></a>
<a href="https://modin.org/modin-bench/#/"><img src="https://img.shields.io/badge/benchmarked%20by-asv-blue.svg" align="center"></a>
</p>
### What is Modin?
Modin is a drop-in replacement for [pandas](https://github.com/pandas-dev/pandas). While pandas is
single-threaded, Modin lets you instantly speed up your workflows by scaling pandas so it uses all of your
cores. Modin works especially well on larger datasets, where pandas becomes painfully slow or runs
[out of memory](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html).
Also, Modin comes with the [additional APIs](https://modin.readthedocs.io/en/latest/usage_guide/advanced_usage/index.html#additional-apis)
to improve user experience.
By simply replacing the import statement, Modin offers users effortless speed and scale for their pandas workflows:
<img src="https://github.com/modin-project/modin/raw/main/docs/img/Import.gif" style="display: block;margin-left: auto;margin-right: auto;" width="100%"></img>
In the GIFs below, Modin (left) and pandas (right) perform *the same pandas operations* on a 2GB dataset. The only difference between the two notebook examples is the import statement.
<table class="tg">
<thead>
<tr>
<th class="tg-0lax" style="text-align: center;"><img src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/MODIN_ver2_hrz.png?raw=True" height="35px"></th>
<th class="tg-0lax" style="text-align: center;"><img src="https://pandas.pydata.org/static/img/pandas.svg" height="50px"></img></th>
</tr>
</thead>
<tbody>
<tr>
<td class="tg-0lax"><img src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/Modin.gif"></img></td>
<td class="tg-0lax"><img src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/Pandas.gif"></img></td>
</tr>
</tbody>
</table>
The charts below show the speedup you get by replacing pandas with Modin based on the examples above. The example notebooks can be found [here](examples/jupyter). To learn more about the speedups you could get with Modin, check out our [10-minute quickstart guide](https://modin.readthedocs.io/en/latest/getting_started/quickstart.html) and try out some examples on your own!
<img src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/Modin_Speedup.svg" style="display: block;margin-left: auto;margin-right: auto;" width="100%"></img>
### Installation
#### From PyPI
Modin can be installed with `pip` on Linux, Windows and MacOS:
```bash
pip install "modin[all]" # (Recommended) Install Modin with Ray and Dask engines.
```
If you want to install Modin with a specific engine, we recommend:
```bash
pip install "modin[ray]" # Install Modin dependencies and Ray.
pip install "modin[dask]" # Install Modin dependencies and Dask.
pip install "modin[mpi]" # Install Modin dependencies and MPI through unidist.
```
To get Modin on MPI through unidist (as of unidist 0.5.0) fully working
it is required to have a working MPI implementation installed beforehand.
Otherwise, installation of `modin[mpi]` may fail. Refer to
[Installing with pip](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-pip)
section of the unidist documentation for more details about installation.
**Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: `ray` instead of `ray[default]`.
This means that the dashboard and cluster launcher are no longer installed by default.
If you need those, consider installing `ray[default]` along with `modin[ray]`.
Modin automatically detects which engine(s) you have installed and uses that for scheduling computation.
#### From conda-forge
Installing from [conda forge](https://github.com/conda-forge/modin-feedstock) using `modin-all`
will install Modin and three engines: [Ray](https://github.com/ray-project/ray), [Dask](https://github.com/dask/dask) and
[MPI through unidist](https://github.com/modin-project/unidist).
```bash
conda install -c conda-forge modin-all
```
Each engine can also be installed individually (and also as a combination of several engines):
```bash
conda install -c conda-forge modin-ray # Install Modin dependencies and Ray.
conda install -c conda-forge modin-dask # Install Modin dependencies and Dask.
conda install -c conda-forge modin-mpi # Install Modin dependencies and MPI through unidist.
```
**Note:** Since Modin 0.30.0 we use a reduced set of Ray dependencies: `ray-core` instead of `ray-default`.
This means that the dashboard and cluster launcher are no longer installed by default.
If you need those, consider installing `ray-default` along with `modin-ray`.
Refer to
[Installing with conda](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-conda)
section of the unidist documentation for more details on how to install a specific MPI implementation to run on.
To speed up conda installation we recommend using libmamba solver. To do this install it in a base environment:
```bash
conda install -n base conda-libmamba-solver
```
and then use it during installation either like:
```bash
conda install -c conda-forge modin-ray --experimental-solver=libmamba
```
or starting from conda 22.11 and libmamba solver 22.12 versions:
```bash
conda install -c conda-forge modin-ray --solver=libmamba
```
#### Choosing a Compute Engine
If you want to choose a specific compute engine to run on, you can set the environment
variable `MODIN_ENGINE` and Modin will do computation with that engine:
```bash
export MODIN_ENGINE=ray # Modin will use Ray
export MODIN_ENGINE=dask # Modin will use Dask
export MODIN_ENGINE=unidist # Modin will use Unidist
```
If you want to choose the Unidist engine, you should set the additional environment
variable ``UNIDIST_BACKEND``. Currently, Modin only supports MPI through unidist:
```bash
export UNIDIST_BACKEND=mpi # Unidist will use MPI backend
```
This can also be done within a notebook/interpreter before you import Modin:
```python
import modin.config as modin_cfg
import unidist.config as unidist_cfg
modin_cfg.Engine.put("ray") # Modin will use Ray
modin_cfg.Engine.put("dask") # Modin will use Dask
modin_cfg.Engine.put('unidist') # Modin will use Unidist
unidist_cfg.Backend.put('mpi') # Unidist will use MPI backend
```
_Note: You should not change the engine after your first operation with Modin as it will result in undefined behavior._
#### Which engine should I use?
On Linux, MacOS, and Windows you can install and use Ray, Dask, or MPI through unidist. No special knowledge is required
to use any of these engines, as Modin abstracts away all of the complexity, so feel
free to pick any of them!
### Pandas API Coverage
<p align="center">
| pandas Object | Modin's Ray Engine Coverage | Modin's Dask Engine Coverage | Modin's Unidist Engine Coverage |
|-------------------|:------------------------------------------------------------------------------------:|:---------------:|:---------------:|
| `pd.DataFrame` | <img src=https://img.shields.io/badge/api%20coverage-90.8%25-hunter.svg> | <img src=https://img.shields.io/badge/api%20coverage-90.8%25-hunter.svg> | <img src=https://img.shields.io/badge/api%20coverage-90.8%25-hunter.svg> |
| `pd.Series` | <img src=https://img.shields.io/badge/api%20coverage-88.05%25-green.svg> | <img src=https://img.shields.io/badge/api%20coverage-88.05%25-green.svg> | <img src=https://img.shields.io/badge/api%20coverage-88.05%25-green.svg> |
| `pd.read_csv` | ✅ | ✅ | ✅ |
| `pd.read_table` | ✅ | ✅ | ✅ |
| `pd.read_parquet` | ✅ | ✅ | ✅ |
| `pd.read_sql` | ✅ | ✅ | ✅ |
| `pd.read_feather` | ✅ | ✅ | ✅ |
| `pd.read_excel` | ✅ | ✅ | ✅ |
| `pd.read_json` | [✳️](https://github.com/modin-project/modin/issues/554) | [✳️](https://github.com/modin-project/modin/issues/554) | [✳️](https://github.com/modin-project/modin/issues/554) |
| `pd.read_<other>` | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) |
</p>
Some pandas APIs are easier to implement than others, so if something is missing feel
free to open an issue!
### More about Modin
For the complete documentation on Modin, visit our [ReadTheDocs](https://modin.readthedocs.io/en/latest/index.html) page.
#### Scale your pandas workflow by changing a single line of code.
_Note: In local mode (without a cluster), Modin will create and manage a local (Dask or Ray) cluster for the execution._
To use Modin, you do not need to specify how to distribute the data, or even know how many
cores your system has. In fact, you can continue using your previous
pandas notebooks while experiencing a considerable speedup from Modin, even on a single
machine. Once you've changed your import statement, you're ready to use Modin just like
you would with pandas!
#### Faster pandas, even on your laptop
<img align="right" style="display:inline;" height="350" width="300" src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/read_csv_benchmark.png?raw=true">
The `modin.pandas` DataFrame is an extremely light-weight parallel DataFrame.
Modin transparently distributes the data and computation so that you can continue using the same pandas API
while working with more data faster. Because it is so light-weight,
Modin provides speed-ups of up to 4x on a laptop with 4 physical cores.
In pandas, you are only able to use one core at a time when you are doing computation of
any kind. With Modin, you are able to use all of the CPU cores on your machine. Even with a
traditionally synchronous task like `read_csv`, we see large speedups by efficiently
distributing the work across your entire machine.
```python
import modin.pandas as pd
df = pd.read_csv("my_dataset.csv")
```
#### Modin can handle the datasets that pandas can't
Often data scientists have to switch between different tools
for operating on datasets of different sizes. Processing large dataframes with pandas
is slow, and pandas does not support working with dataframes that are too large to fit
into the available memory. As a result, pandas workflows that work well
for prototyping on a few MBs of data do not scale to tens or hundreds of GBs (depending on the size
of your machine). Modin supports operating on data that does not fit in memory, so that you can comfortably
work with hundreds of GBs without worrying about substantial slowdown or memory errors.
With [cluster](https://modin.readthedocs.io/en/latest/getting_started/using_modin/using_modin_cluster.html)
and [out of core](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html)
support, Modin is a DataFrame library with both great single-node performance and high
scalability in a cluster.
#### Modin Architecture
We designed [Modin's architecture](https://modin.readthedocs.io/en/latest/development/architecture.html)
to be modular so we can plug in different components as they develop and improve:
<img src="https://github.com/modin-project/modin/raw/7c009c747caa90554607e30b9ac2bd1b190b8c7d/docs/img/modin_architecture.png" alt="Modin's architecture" width="75%"></img>
### Other Resources
#### Getting Started with Modin
- [Documentation](https://modin.readthedocs.io/en/latest/)
- [10-min Quickstart Guide](https://modin.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Examples and Tutorials](https://modin.readthedocs.io/en/latest/getting_started/examples.html)
- [Videos and Blogposts](https://modin.readthedocs.io/en/latest/getting_started/examples.html#talks-podcasts)
- [Benchmarking Modin](https://modin.readthedocs.io/en/latest/usage_guide/benchmarking.html)
#### Modin Community
- [Slack](https://join.slack.com/t/modin-project/shared_invite/zt-yvk5hr3b-f08p_ulbuRWsAfg9rMY3uA)
- [Twitter](https://twitter.com/modin_project)
- [Mailing List](https://groups.google.com/g/modin-dev)
- [GitHub Issues](https://github.com/modin-project/modin/issues)
- [StackOverflow](https://stackoverflow.com/questions/tagged/modin)
#### Learn More about Modin
- [Frequently Asked Questions (FAQs)](https://modin.readthedocs.io/en/latest/getting_started/faq.html)
- [Troubleshooting Guide](https://modin.readthedocs.io/en/latest/getting_started/troubleshooting.html)
- [Development Guide](https://modin.readthedocs.io/en/latest/development/index.html)
- Modin is built on many years of research and development at UC Berkeley. Check out these selected papers to learn more about how Modin works:
- [Flexible Rule-Based Decomposition and Metadata Independence in Modin](https://people.eecs.berkeley.edu/~totemtang/paper/Modin.pdf) (VLDB 2021)
- [Dataframe Systems: Theory, Architecture, and Implementation](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2021/EECS-2021-193.pdf) (PhD Dissertation 2021)
- [Towards Scalable Dataframe Systems](https://arxiv.org/pdf/2001.00888.pdf) (VLDB 2020)
#### Getting Involved
***`modin.pandas` is currently under active development. Requests and contributions are welcome!***
For more information on how to contribute to Modin, check out the
[Modin Contribution Guide](https://modin.readthedocs.io/en/latest/development/contributing.html).
### License
[Apache License 2.0](LICENSE)
================================================
FILE: asv_bench/README.md
================================================
# Modin ASV benchmarks
## Here are some scenarios in which [ASV](https://asv.readthedocs.io/en/stable/index.html) can be used:
* Check the impact of the new patch on the performance of a certain set of operations:
`asv continuous -f 1.05 src/main HEAD -b TimeGroupBy --launch-method=spawn`
* Check for presence of errors inside of benchmarks after changing them or writing new ones:
`asv run --quick --show-stderr --python=same --launch-method=spawn`
* Run entire benchmark suite to get the current times:
`asv run --launch-method=spawn`
* Check the range of commits for performance degradation:
```
asv run [start_hash]..[end_hash] --launch-method=spawn
asv publish
asv preview
```
For more consistent results, you may need to use the following parameters, which
are described in the [ASV docs](https://asv.readthedocs.io/en/stable/benchmarks.html?highlight=sample_time#timing-benchmarks):
* `-a sample_time=1`
* `-a warmup_time=1`
### Notes about using Modin on Ray with Asv:
* `--launch-method=forkserver` is not working;
* Each set of parameters for each test is launched in its own process, which brings
a large overhead, since for each process redis server and other necessary processes
from ray initialization are started and destroyed.
## Adding new benchmark
Basic information on writing benchmarks is present [in ASV documentation](https://asv.readthedocs.io/en/stable/writing_benchmarks.html)
Benchmarks from `benchmarks/benchmarks.py`, `benchmarks/scalability/scalability_benchmarks.py` or `benchmarks/io/csv.py`
could be used as a starting point.
Requirements:
* the benchmark should be able to run both on Modin and on Pandas when the appropriate value
of the environment variable `MODIN_ASV_USE_IMPL` is selected.
* the size of the benchmark dataset should depend on the environment variable `MODIN_TEST_DATASET_SIZE`.
## Changing existing benchmark
It should be remembered that the hash calculated from the benchmark source code is used to display the results.
When changing the benchmark, the old results will no longer be displayed in the dashboard. In general, this is the correct
behavior so as not to get a situation when incomparable numbers are displayed in the dashboard.
But it should be noted that there could be changes in the source code when it is still correct to compare
the "before" and "after" versions, for example, name of a variable changed, comment added, etc.
In this case you must either run a new version of the benchmark for all the commits ever accounted for or manually change
the hash in the corresponding result files.
## Pipeline for displaying results in a dashboard
Step 1: checking benchmarks for validity, runs in PRs CI.
During the test, the benchmarks are run once on small data.
The implementation can be found in `test-asv-benchmarks` job of [ci.yml](https://github.com/modin-project/modin/blob/main/.github/workflows/ci.yml)
Step 2: running benchmarks with saving the results in [modin-bench@master](https://github.com/modin-project/modin-bench).
The launch takes place on internal server using specific TeamCity configuration.
The description of the server can be found in the ["Benchmark list"](https://modin.org/modin-bench/#summarylist?sort=0&dir=asc) tab,
on the left when you hover the mouse over the machine name.
This step starts as scheduled (now every half hour), subject to the presence of new commits in the Modin `main` branch.
Command to run benchmarks: `asv run HASHFILE:hashfile.txt --show-stderr --machine xeon-e5 --launch-method=spawn`.
The file `hashfile.txt` contains the latest Modin commit hash.
Writing to `modin-bench@master` triggers step 3 of the pipeline.
Step 3: converting the results to html representation, which is saved in [modin-bench@gh-pages](https://github.com/modin-project/modin-bench)
The implementation can be found in `deploy-gh-pages` job of [push.yml](https://github.com/modin-project/modin-bench/blob/master/.github/workflows/push.yml)
Basic actions for step 2:
* setup environment variable:
* export MODIN_TEST_DATASET_SIZE=Big
* export MODIN_CPUS=44
* setup git client
* prepare json file with machine description
* This file should be placed in the user's home directory.
* ASV does not always automatically create the file with the description of the machine correctly (e.g. due to being run in a container).
It is recommended to create a file using [asv machine](https://asv.readthedocs.io/en/stable/commands.html?highlight=machine%20description#asv-machine) command, and manually check the result.
[Example](https://github.com/modin-project/modin-bench/blob/master/results/xeon-e5/machine.json)
* copy old result to folder where new result will appear
(conflict resolution will be performed by ASV itself instead of git)
* push performance result to modin-bench repository
================================================
FILE: asv_bench/asv.conf.dask.json
================================================
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "modin",
// The project's homepage
"project_url": "https://modin.readthedocs.io/",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"],
// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
"install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[dask]"],
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
"install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
"pythons": ["3.9"],
// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",
}
================================================
FILE: asv_bench/asv.conf.json
================================================
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "modin",
// The project's homepage
"project_url": "https://modin.readthedocs.io/",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"],
// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
"install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[ray]"],
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
"install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
"pythons": ["3.9"],
// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",
}
================================================
FILE: asv_bench/asv.conf.unidist.json
================================================
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "modin",
// The project's homepage
"project_url": "https://modin.readthedocs.io/",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"],
// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
"install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[mpi]"],
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
"install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
"pythons": ["3.9"],
// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",
}
================================================
FILE: asv_bench/benchmarks/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Modin benchmarks."""
================================================
FILE: asv_bench/benchmarks/benchmarks.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""General Modin benchmarks."""
# Define the `MODIN_CPUS` env var to control the number of partitions;
# it must be set before `modin.pandas` is imported (when set via os.environ).
# Define the `MODIN_ASV_USE_IMPL` env var to choose the library used for the
# performance measurements.
import math
import numpy as np
from .utils import (
GROUPBY_NGROUPS,
IMPL,
RAND_HIGH,
RAND_LOW,
execute,
gen_nan_data,
generate_dataframe,
get_benchmark_shapes,
random_booleans,
random_columns,
random_string,
translator_groupby_ngroups,
)
class BaseTimeGroupBy:
    """Shared ``setup`` for the groupby benchmarks."""

    def setup(self, shape, ngroups=5, groupby_ncols=1):
        """Generate the integer frame and remember the columns to group by.

        ``ngroups`` is translated by ``translator_groupby_ngroups`` (together
        with ``shape``) and the result is forwarded to ``generate_dataframe``
        as ``count_groups``.
        """
        group_count = translator_groupby_ngroups(ngroups, shape)
        generated = generate_dataframe(
            "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=group_count
        )
        self.df, self.groupby_columns = generated
class TimeGroupByMultiColumn(BaseTimeGroupBy):
    """Groupby benchmarks where ``groupby_ncols`` is parametrized as 6."""

    params = [
        get_benchmark_shapes("TimeGroupByMultiColumn"),
        GROUPBY_NGROUPS,
        [6],
    ]
    param_names = ["shape", "ngroups", "groupby_ncols"]

    def time_groupby_agg_quan(self, *args, **kwargs):
        """Time ``agg("quantile")`` on the grouped frame."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.agg("quantile"))

    def time_groupby_agg_mean(self, *args, **kwargs):
        """Time a per-group mean computed through ``apply`` with a lambda."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.apply(lambda group: group.mean()))
class TimeGroupByDefaultAggregations(BaseTimeGroupBy):
    """Benchmarks for the groupby ``count``, ``size``, ``sum`` and ``mean`` methods."""

    params = [
        get_benchmark_shapes("TimeGroupByDefaultAggregations"),
        GROUPBY_NGROUPS,
    ]
    param_names = ["shape", "ngroups"]

    def time_groupby_count(self, *args, **kwargs):
        """Time ``groupby(...).count()``."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.count())

    def time_groupby_size(self, *args, **kwargs):
        """Time ``groupby(...).size()``."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.size())

    def time_groupby_sum(self, *args, **kwargs):
        """Time ``groupby(...).sum()``."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.sum())

    def time_groupby_mean(self, *args, **kwargs):
        """Time ``groupby(...).mean()``."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.mean())
class TimeGroupByDictionaryAggregation(BaseTimeGroupBy):
    """Benchmark ``groupby(...).agg`` called with a column -> function-name dict."""

    params = [
        get_benchmark_shapes("TimeGroupByDictionaryAggregation"),
        GROUPBY_NGROUPS,
        ["reduce", "aggregation"],
    ]
    param_names = ["shape", "ngroups", "operation_type"]

    # Function names available for each ``operation_type``.
    operations = {
        "reduce": ["sum", "count", "prod"],
        "aggregation": ["quantile", "std", "median"],
    }

    def setup(self, shape, ngroups, operation_type):
        """Build the frame and the aggregation dict for ``operation_type``.

        The columns at positions 1..3 are aggregated; function names are
        assigned to them by cycling through the selected operations list.
        """
        super().setup(shape, ngroups)
        self.cols_to_agg = self.df.columns[1:4]
        chosen = self.operations[operation_type]
        mapping = {}
        for position, column in enumerate(self.cols_to_agg):
            mapping[column] = chosen[position % len(chosen)]
        self.agg_dict = mapping

    def time_groupby_dict_agg(self, *args, **kwargs):
        """Time the dictionary aggregation."""
        grouped = self.df.groupby(by=self.groupby_columns)
        execute(grouped.agg(self.agg_dict))
class TimeJoin:
    """Benchmark ``DataFrame.join`` of two generated integer frames."""

    params = [
        get_benchmark_shapes("TimeJoin"),
        ["left", "inner"],
        [False],
    ]
    param_names = ["shapes", "how", "sort"]

    def setup(self, shapes, how, sort):
        """Generate both operands, one from each entry of ``shapes``."""
        left_shape = shapes[0]
        self.df1 = generate_dataframe("int", *left_shape, RAND_LOW, RAND_HIGH)
        right_shape = shapes[1]
        self.df2 = generate_dataframe("int", *right_shape, RAND_LOW, RAND_HIGH)

    def time_join(self, shapes, how, sort):
        """Time the join."""
        # join dataframes on index to get the predictable shape
        joined = self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort)
        execute(joined)
class TimeJoinStringIndex:
    """Benchmark joins of a frame with string key columns against string-indexed frames."""

    params = [
        get_benchmark_shapes("TimeJoinStringIndex"),
        [True, False],
    ]
    param_names = ["shapes", "sort"]

    def setup(self, shapes, sort):
        """Build the left frame with ``key1``/``key2`` columns and the right-hand frames.

        ``shapes[0]`` (the row count) has to be a multiple of 100.
        """
        assert shapes[0] % 100 == 0, "implementation restriction"
        per_group = shapes[0] // 100

        level1 = IMPL.Index([f"i-{i}" for i in range(10)], dtype=object).values
        level2 = IMPL.Index([f"i-{i}" for i in range(per_group)], dtype=object).values
        codes1 = np.arange(10).repeat(per_group)
        codes2 = np.tile(np.arange(per_group), 10)
        index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])

        multi_values = np.random.randn(len(index2), 4)
        self.df_multi = IMPL.DataFrame(
            multi_values, index=index2, columns=["A", "B", "C", "D"]
        )

        self.key1 = np.tile(level1.take(codes1), 10)
        self.key2 = np.tile(level2.take(codes2), 10)

        frame = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
        # just to keep source shape
        frame = frame.drop(columns=frame.columns[-2:])
        self.df = frame
        self.df["key1"] = self.key1
        self.df["key2"] = self.key2
        execute(self.df)

        key1_values = np.random.randn(len(level1), 4)
        self.df_key1 = IMPL.DataFrame(
            key1_values, index=level1, columns=["A", "B", "C", "D"]
        )
        key2_values = np.random.randn(len(level2), 4)
        self.df_key2 = IMPL.DataFrame(
            key2_values, index=level2, columns=["A", "B", "C", "D"]
        )

    def time_join_dataframe_index_multi(self, shapes, sort):
        """Time joining on both key columns against the MultiIndex frame."""
        joined = self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)
        execute(joined)

    def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
        """Time joining on ``key2`` against the frame indexed by the second level."""
        joined = self.df.join(self.df_key2, on="key2", sort=sort)
        execute(joined)

    def time_join_dataframe_index_single_key_small(self, shapes, sort):
        """Time joining on ``key1`` against the frame indexed by the first level."""
        joined = self.df.join(self.df_key1, on="key1", sort=sort)
        execute(joined)
class TimeMergeDefault:
    """Benchmark ``merge`` called without explicit key arguments."""

    params = [
        get_benchmark_shapes("TimeMergeDefault"),
        ["left", "inner"],
        [True, False],
    ]
    param_names = ["shapes", "how", "sort"]

    def setup(self, shapes, how, sort):
        """Generate both operands, one from each entry of ``shapes``."""
        left_shape = shapes[0]
        self.df1 = generate_dataframe("int", *left_shape, RAND_LOW, RAND_HIGH)
        right_shape = shapes[1]
        self.df2 = generate_dataframe("int", *right_shape, RAND_LOW, RAND_HIGH)

    def time_merge(self, shapes, how, sort):
        """Time the merge."""
        merged = IMPL.merge(self.df1, self.df2, how=how, sort=sort)
        execute(merged)
class TimeMerge:
    """Benchmark index-based merges and merges with an empty operand."""

    params = [
        get_benchmark_shapes("TimeMerge"),
        ["left", "inner"],
        [True, False],
    ]
    param_names = ["shapes", "how", "sort"]

    def setup(self, shapes, how, sort):
        """Generate both operands, one from each entry of ``shapes``."""
        left_shape = shapes[0]
        self.df1 = generate_dataframe("int", *left_shape, RAND_LOW, RAND_HIGH)
        right_shape = shapes[1]
        self.df2 = generate_dataframe("int", *right_shape, RAND_LOW, RAND_HIGH)

    def time_merge(self, shapes, how, sort):
        """Time ``DataFrame.merge`` on the indices of both frames."""
        # merge dataframes by index to get the predictable shape
        merged = self.df1.merge(
            self.df2, left_index=True, right_index=True, how=how, sort=sort
        )
        execute(merged)

    def time_merge_dataframe_empty_right(self, shapes, how, sort):
        """Time merging with a zero-row right operand."""
        # Getting an empty dataframe using `iloc` should be very fast,
        # so the impact on the time of the merge operation should be negligible.
        empty_right = self.df2.iloc[:0]
        execute(IMPL.merge(self.df1, empty_right, how=how, sort=sort))

    def time_merge_dataframe_empty_left(self, shapes, how, sort):
        """Time merging with a zero-row left operand."""
        # Getting an empty dataframe using `iloc` should be very fast,
        # so the impact on the time of the merge operation should be negligible.
        empty_left = self.df1.iloc[:0]
        execute(IMPL.merge(empty_left, self.df2, how=how, sort=sort))
class TimeMergeCategoricals:
    """Benchmark ``merge`` on ``"X"`` with object or categorical payload columns."""

    params = [
        get_benchmark_shapes("MergeCategoricals"),
        ["object", "category"],
    ]
    param_names = ["shapes", "data_type"]

    def setup(self, shapes, data_type):
        """Create two two-column frames; cast the string columns for ``"category"``.

        ``shapes`` must have exactly two entries and the second must be 2.
        """
        assert len(shapes) == 2
        assert shapes[1] == 2
        size = (shapes[0],)

        left_data = {
            "X": np.random.choice(range(0, 10), size=size),
            "Y": np.random.choice(["one", "two", "three"], size=size),
        }
        self.left = IMPL.DataFrame(left_data)

        right_data = {
            "X": np.random.choice(range(0, 10), size=size),
            "Z": np.random.choice(["jjj", "kkk", "sss"], size=size),
        }
        self.right = IMPL.DataFrame(right_data)

        if data_type != "category":
            return

        left_categorical = self.left["Y"].astype("category")
        self.left = self.left.assign(Y=left_categorical)
        execute(self.left)

        right_categorical = self.right["Z"].astype("category")
        self.right = self.right.assign(Z=right_categorical)
        execute(self.right)

    def time_merge_categoricals(self, shapes, data_type):
        """Time the merge on the shared ``"X"`` column."""
        merged = IMPL.merge(self.left, self.right, on="X")
        execute(merged)
class TimeConcat:
    """Benchmark ``concat`` of two generated integer frames."""

    params = [
        get_benchmark_shapes("TimeConcat"),
        ["inner", "outer"],
        [0, 1],
        [True, False],
    ]
    param_names = ["shapes", "how", "axis", "ignore_index"]

    def setup(self, shapes, how, axis, ignore_index):
        """Generate both operands, one from each entry of ``shapes``."""
        first_shape = shapes[0]
        self.df1 = generate_dataframe("int", *first_shape, RAND_LOW, RAND_HIGH)
        second_shape = shapes[1]
        self.df2 = generate_dataframe("int", *second_shape, RAND_LOW, RAND_HIGH)

    def time_concat(self, shapes, how, axis, ignore_index):
        """Time the concatenation; ``how`` is passed as the ``join`` argument."""
        frames = [self.df1, self.df2]
        combined = IMPL.concat(
            frames, axis=axis, join=how, ignore_index=ignore_index
        )
        execute(combined)
class TimeBinaryOp:
    """Benchmark a frame-with-frame binary operation looked up by name."""

    params = [
        get_benchmark_shapes("TimeBinaryOp"),
        ["mul"],
        [0, 1],
    ]
    param_names = ["shapes", "binary_op", "axis"]

    def setup(self, shapes, binary_op, axis):
        """Generate both operands and bind the method named ``binary_op`` on the first."""
        first_shape = shapes[0]
        self.df1 = generate_dataframe("int", *first_shape, RAND_LOW, RAND_HIGH)
        second_shape = shapes[1]
        self.df2 = generate_dataframe("int", *second_shape, RAND_LOW, RAND_HIGH)
        self.op = getattr(self.df1, binary_op)

    def time_binary_op(self, shapes, binary_op, axis):
        """Time the bound operation applied to the second frame."""
        outcome = self.op(self.df2, axis=axis)
        execute(outcome)
class TimeBinaryOpSeries:
    """Benchmark a series-with-series binary operation looked up by name."""

    params = [
        get_benchmark_shapes("TimeBinaryOpSeries"),
        ["mul"],
    ]
    param_names = ["shapes", "binary_op"]

    def setup(self, shapes, binary_op):
        """Take the first column of each generated frame as the operands."""
        left_frame = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
        right_frame = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
        first_left = left_frame.columns[0]
        self.series1 = left_frame[first_left]
        first_right = right_frame.columns[0]
        self.series2 = right_frame[first_right]
        self.op = getattr(self.series1, binary_op)
        execute(self.series1)
        execute(self.series2)

    def time_binary_op_series(self, shapes, binary_op):
        """Time the bound operation applied to the second series."""
        outcome = self.op(self.series2)
        execute(outcome)
class BaseTimeSetItem:
    """Shared ``setup`` and location helper for the setitem/insert benchmarks."""

    param_names = ["shape", "item_length", "loc", "is_equal_indices"]

    @staticmethod
    def get_loc(df, loc, axis, item_length):
        """Resolve a named location into ``(labels, positions)`` along ``axis``.

        Parameters
        ----------
        df : object exposing ``axes``
        loc : {"zero", "middle", "last"}
            Name of the starting position.
        axis : int
            Index into ``df.axes``.
        item_length : int
            Number of consecutive positions requested; clipped at the axis end.

        Returns
        -------
        tuple
            A scalar label and scalar position when exactly one position is
            selected, otherwise the selected labels and the positions array.
        """
        axis_labels = df.axes[axis]
        axis_size = len(axis_labels)
        start = {
            "zero": 0,
            "middle": axis_size // 2,
            "last": axis_size - 1,
        }[loc]
        stop = min(axis_size, start + item_length)
        positions = np.arange(start, stop)
        if len(positions) == 1:
            return axis_labels[start], start
        return axis_labels[positions], positions

    def setup(self, shape, item_length, loc, is_equal_indices):
        """Prepare the target frame and the item (and its NumPy form) to assign."""
        frame = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH).copy()
        self.df = frame
        self.loc, self.iloc = self.get_loc(
            frame, loc, item_length=item_length, axis=1
        )
        self.item = self.df[self.loc] + 1
        self.item_raw = self.item.to_numpy()
        if is_equal_indices:
            return
        # Reverse the item's index so that it differs from the frame's one.
        self.item.index = reversed(self.item.index)
class TimeSetItem(BaseTimeSetItem):
    """Benchmark ``df[label] = value`` with the prepared item and its NumPy form."""

    params = [
        get_benchmark_shapes("TimeSetItem"),
        [1],
        ["zero", "middle", "last"],
        [True, False],
    ]

    def time_setitem_qc(self, *args, **kwargs):
        """Time assigning ``self.item``."""
        target = self.loc
        self.df[target] = self.item
        execute(self.df)

    def time_setitem_raw(self, *args, **kwargs):
        """Time assigning ``self.item_raw`` (the ``to_numpy()`` form of the item)."""
        target = self.loc
        self.df[target] = self.item_raw
        execute(self.df)
class TimeInsert(BaseTimeSetItem):
    """Benchmark ``DataFrame.insert`` with the prepared item and its NumPy form."""

    params = [
        get_benchmark_shapes("TimeInsert"),
        [1],
        ["zero", "middle", "last"],
        [True, False],
    ]

    def time_insert_qc(self, *args, **kwargs):
        """Time inserting ``self.item`` under a ``random_string()`` column name."""
        new_column = random_string()
        self.df.insert(loc=self.iloc, column=new_column, value=self.item)
        execute(self.df)

    def time_insert_raw(self, *args, **kwargs):
        """Time inserting ``self.item_raw`` under a ``random_string()`` column name."""
        new_column = random_string()
        self.df.insert(loc=self.iloc, column=new_column, value=self.item_raw)
        execute(self.df)
class TimeArithmetic:
    """Benchmarks for reductions and element-wise operations on one integer frame."""

    params = [
        get_benchmark_shapes("TimeArithmetic"),
        [0, 1],
    ]
    param_names = ["shape", "axis"]

    def setup(self, shape, axis):
        """Generate the frame under test."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_sum(self, shape, axis):
        """Time ``sum`` along ``axis``."""
        outcome = self.df.sum(axis=axis)
        execute(outcome)

    def time_count(self, shape, axis):
        """Time ``count`` along ``axis``."""
        outcome = self.df.count(axis=axis)
        execute(outcome)

    def time_median(self, shape, axis):
        """Time ``median`` along ``axis``."""
        outcome = self.df.median(axis=axis)
        execute(outcome)

    def time_nunique(self, shape, axis):
        """Time ``nunique`` along ``axis``."""
        outcome = self.df.nunique(axis=axis)
        execute(outcome)

    def time_apply(self, shape, axis):
        """Time ``apply`` with a summing lambda along ``axis``."""
        outcome = self.df.apply(lambda part: part.sum(), axis=axis)
        execute(outcome)

    def time_mean(self, shape, axis):
        """Time ``mean`` along ``axis``."""
        outcome = self.df.mean(axis=axis)
        execute(outcome)

    def time_mode(self, shape, axis):
        """Time ``mode`` along ``axis``."""
        outcome = self.df.mode(axis=axis)
        execute(outcome)

    def time_add(self, shape, axis):
        """Time adding the scalar 2."""
        outcome = self.df.add(2, axis=axis)
        execute(outcome)

    def time_mul(self, shape, axis):
        """Time multiplying by the scalar 2."""
        outcome = self.df.mul(2, axis=axis)
        execute(outcome)

    def time_mod(self, shape, axis):
        """Time the modulo by the scalar 2."""
        outcome = self.df.mod(2, axis=axis)
        execute(outcome)

    def time_abs(self, shape, axis):
        """Time ``abs``; ``axis`` is not used."""
        outcome = self.df.abs()
        execute(outcome)

    def time_aggregate(self, shape, axis):
        """Time ``aggregate`` with a summing lambda along ``axis``."""
        outcome = self.df.aggregate(lambda part: part.sum(), axis=axis)
        execute(outcome)

    def time_is_in(self, shape, axis):
        """Time ``isin([0, 2])``; ``axis`` is not used."""
        outcome = self.df.isin([0, 2])
        execute(outcome)

    def time_transpose(self, shape, axis):
        """Time ``transpose``; ``axis`` is not used."""
        outcome = self.df.transpose()
        execute(outcome)
class TimeSortValues:
    """Benchmark ``sort_values`` over a varying number of randomly chosen columns."""

    params = [
        get_benchmark_shapes("TimeSortValues"),
        [1, 2, 10, 100],
        [False, True],
    ]
    param_names = ["shape", "columns_number", "ascending_list"]

    def setup(self, shape, columns_number, ascending_list):
        """Pick the sort columns and the ``ascending`` argument.

        With ``ascending_list`` set, ``ascending`` is the result of
        ``random_booleans(columns_number)``; otherwise it is a single ``bool``.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        self.columns = random_columns(self.df.columns, columns_number)
        if ascending_list:
            self.ascending = random_booleans(columns_number)
        else:
            self.ascending = bool(random_booleans(1)[0])

    def time_sort_values(self, shape, columns_number, ascending_list):
        """Time the sort."""
        ordered = self.df.sort_values(self.columns, ascending=self.ascending)
        execute(ordered)
class TimeDrop:
    """Benchmark ``drop`` of leading labels along either axis."""

    params = [
        get_benchmark_shapes("TimeDrop"),
        [0, 1],
        [1, 0.8],
    ]
    param_names = ["shape", "axis", "drop_ncols"]

    def setup(self, shape, axis, drop_ncols):
        """Select the labels to drop.

        A float ``drop_ncols`` is a fraction of the axis length; any other
        value is used directly as the number of labels.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(drop_ncols, float):
            drop_count = int(len(self.df.axes[axis]) * drop_ncols)
        else:
            drop_count = drop_ncols
        self.labels = self.df.axes[axis][:drop_count]

    def time_drop(self, shape, axis, drop_ncols):
        """Time dropping the selected labels."""
        remaining = self.df.drop(self.labels, axis=axis)
        execute(remaining)
class TimeHead:
    """Benchmark ``head`` with an absolute or a fractional row count."""

    params = [
        get_benchmark_shapes("TimeHead"),
        [5, 0.8],
    ]
    param_names = ["shape", "head_count"]

    def setup(self, shape, head_count):
        """Resolve ``head_count``; a float is a fraction of the number of rows."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(head_count, float):
            self.head_count = int(head_count * len(self.df.index))
        else:
            self.head_count = head_count

    def time_head(self, shape, head_count):
        """Time ``head`` with the resolved count."""
        top_rows = self.df.head(self.head_count)
        execute(top_rows)
class TimeTail:
    """Benchmark ``tail`` with an absolute or a fractional row count."""

    params = [
        get_benchmark_shapes("TimeTail"),
        [5, 0.8],
    ]
    param_names = ["shape", "tail_count"]

    def setup(self, shape, tail_count):
        """Resolve ``tail_count``; a float is a fraction of the number of rows."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if isinstance(tail_count, float):
            self.tail_count = int(tail_count * len(self.df.index))
        else:
            self.tail_count = tail_count

    def time_tail(self, shape, tail_count):
        """Time ``tail`` with the resolved count."""
        bottom_rows = self.df.tail(self.tail_count)
        execute(bottom_rows)
class TimeExplode:
    """Benchmark ``explode`` on the ``"col1"`` column."""

    params = [
        get_benchmark_shapes("TimeExplode"),
    ]
    param_names = ["shape"]

    def setup(self, shape):
        """Generate the frame with ``gen_unique_key=True``."""
        frame = generate_dataframe(
            "int", *shape, RAND_LOW, RAND_HIGH, gen_unique_key=True
        )
        self.df = frame

    def time_explode(self, shape):
        """Time the explode."""
        exploded = self.df.explode("col1")
        execute(exploded)
class TimeFillnaSeries:
    """Benchmark ``Series.fillna`` with scalar, dict and Series fill values."""

    param_names = ["value_type", "shape", "limit"]
    params = [
        ["scalar", "dict", "Series"],
        get_benchmark_shapes("TimeFillnaSeries"),
        [None, 0.8],
    ]

    def setup(self, value_type, shape, limit):
        """Create the NaN-containing data and the ``fillna`` keyword arguments.

        Parameters
        ----------
        value_type : {"scalar", "dict", "Series"}
            Kind of object passed as ``value``.
        shape : sequence
            Forwarded to ``gen_nan_data``; ``shape[0]`` sizes the fill value.
        limit : float or None
            Fraction of ``shape[0]`` used as ``limit``; a falsy value means ``None``.

        Raises
        ------
        ValueError
            If ``value_type`` is not one of the supported kinds.
        """
        self.series = gen_nan_data(*shape)

        if value_type == "scalar":
            self.value = 18.19
        elif value_type == "dict":
            self.value = {k: k * 1.23 for k in range(shape[0])}
        elif value_type == "Series":
            self.value = IMPL.Series(
                [k * 1.23 for k in range(shape[0])], index=IMPL.RangeIndex(shape[0])
            )
        else:
            # An explicit exception instead of `assert False`: asserts are
            # stripped under `python -O`, which would leave `self.value` unset.
            raise ValueError(f"value_type: {value_type} isn't supported")

        limit = int(limit * shape[0]) if limit else None
        self.kw = {"value": self.value, "limit": limit}

    def time_fillna(self, value_type, shape, limit):
        """Time ``fillna`` returning a new object."""
        execute(self.series.fillna(**self.kw))

    def time_fillna_inplace(self, value_type, shape, limit):
        """Time ``fillna`` with ``inplace=True``."""
        self.series.fillna(inplace=True, **self.kw)
        execute(self.series)
class TimeFillnaDataFrame:
    """Benchmark ``DataFrame.fillna`` with scalar, dict, Series and DataFrame fill values."""

    param_names = ["value_type", "shape", "limit"]
    params = [
        ["scalar", "dict", "DataFrame", "Series"],
        get_benchmark_shapes("TimeFillnaDataFrame"),
        [None, 0.8],
    ]

    def setup(self, value_type, shape, limit):
        """Create the NaN-containing frame and the ``fillna`` keyword arguments.

        Parameters
        ----------
        value_type : {"scalar", "dict", "Series", "DataFrame"}
            Kind of object passed as ``value``.
        shape : sequence
            Forwarded to ``gen_nan_data``; ``shape[0]`` sizes the DataFrame fill value.
        limit : float or None
            Fraction of ``shape[0]`` used as ``limit``; a falsy value means ``None``.

        Raises
        ------
        ValueError
            If ``value_type`` is not one of the supported kinds.
        """
        self.df = gen_nan_data(*shape)
        columns = self.df.columns

        if value_type == "scalar":
            self.value = 18.19
        elif value_type == "dict":
            self.value = {k: i * 1.23 for i, k in enumerate(columns)}
        elif value_type == "Series":
            self.value = IMPL.Series(
                [i * 1.23 for i in range(len(columns))], index=columns
            )
        elif value_type == "DataFrame":
            self.value = IMPL.DataFrame(
                {
                    k: [i + j * 1.23 for j in range(shape[0])]
                    for i, k in enumerate(columns)
                },
                index=IMPL.RangeIndex(shape[0]),
                columns=columns,
            )
        else:
            # An explicit exception instead of `assert False`: asserts are
            # stripped under `python -O`, which would leave `self.value` unset.
            raise ValueError(f"value_type: {value_type} isn't supported")

        limit = int(limit * shape[0]) if limit else None
        self.kw = {"value": self.value, "limit": limit}

    def time_fillna(self, value_type, shape, limit):
        """Time ``fillna`` returning a new object."""
        execute(self.df.fillna(**self.kw))

    def time_fillna_inplace(self, value_type, shape, limit):
        """Time ``fillna`` with ``inplace=True``."""
        self.df.fillna(inplace=True, **self.kw)
        execute(self.df)
class BaseTimeValueCounts:
    """Shared ``setup`` for the ``value_counts`` benchmarks."""

    def setup(self, shape, ngroups=5, subset=1):
        """Generate the frame and remember the subset columns.

        ``ngroups`` is translated by ``translator_groupby_ngroups`` and passed
        as ``count_groups``; ``subset`` is passed as ``groupby_ncols``.
        """
        group_count = translator_groupby_ngroups(ngroups, shape)
        generated = generate_dataframe(
            "int",
            *shape,
            RAND_LOW,
            RAND_HIGH,
            groupby_ncols=subset,
            count_groups=group_count,
        )
        self.df, self.subset = generated
class TimeValueCountsFrame(BaseTimeValueCounts):
    """Benchmark ``DataFrame.value_counts`` over the generated subset columns."""

    params = [
        get_benchmark_shapes("TimeValueCountsFrame"),
        GROUPBY_NGROUPS,
        [2, 10],
    ]
    param_names = ["shape", "ngroups", "subset"]

    def time_value_counts(self, *args, **kwargs):
        """Time ``value_counts(subset=...)``."""
        counts = self.df.value_counts(subset=self.subset)
        execute(counts)
class TimeValueCountsSeries(BaseTimeValueCounts):
    """Benchmark ``value_counts`` on a single column, with and without ``bins``."""

    params = [
        get_benchmark_shapes("TimeValueCountsSeries"),
        GROUPBY_NGROUPS,
        [None, 3],
    ]
    param_names = ["shape", "ngroups", "bins"]

    def setup(self, shape, ngroups, bins):
        """Run the base setup, then keep only the first subset column in ``self.df``."""
        super().setup(ngroups=ngroups, shape=shape)
        first_key = self.subset[0]
        self.df = self.df[first_key]

    def time_value_counts(self, shape, ngroups, bins):
        """Time ``value_counts(bins=bins)``."""
        counts = self.df.value_counts(bins=bins)
        execute(counts)
class TimeIndexing:
    """Benchmark row selection through ``iloc``/``loc`` with several indexer kinds."""

    param_names = ["shape", "indexer_type"]
    params = [
        get_benchmark_shapes("TimeIndexing"),
        [
            "bool_array",
            "bool_series",
            "scalar",
            "slice",
            "continuous_slice",
            "numpy_array_take_all_values",
            "python_list_take_10_values",
            "function",
        ],
    ]

    # Maps every ``indexer_type`` to a factory building the indexer from the frame.
    indexer_getters = {
        # Alternating False/True mask with exactly ``len(df)`` entries. Built from
        # the row positions (rather than by repeating ``[False, True]``
        # ``len(df) // 2`` times) so that the mask also matches frames with an
        # odd number of rows; for even lengths the values are the same.
        "bool_array": lambda df: np.arange(len(df)) % 2 == 1,
        # This boolean-Series is a projection of the source frame, it shouldn't
        # be reimported or triggered to execute:
        "bool_series": lambda df: df.iloc[:, 0] > 50,
        "scalar": lambda df: len(df) // 2,
        "slice": lambda df: slice(0, len(df), 2),
        "continuous_slice": lambda df: slice(len(df) // 2),
        "numpy_array_take_all_values": lambda df: np.arange(len(df)),
        "python_list_take_10_values": lambda df: list(range(min(10, len(df)))),
        "function": lambda df: (lambda df: df.index[::-2]),
    }

    def setup(self, shape, indexer_type):
        """Generate the frame and build the indexer selected by ``indexer_type``."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        self.indexer = self.indexer_getters[indexer_type](self.df)
        if isinstance(self.indexer, (IMPL.Series, IMPL.DataFrame)):
            # HACK: Triggering `dtypes` meta-data computation in advance,
            # so it won't affect the `loc/iloc` time:
            self.indexer.dtypes

    def time_iloc(self, shape, indexer_type):
        """Time positional selection with the prepared indexer."""
        # Pandas doesn't implement `df.iloc[series boolean_mask]` and raises an exception on it.
        # Replacing this with the semantically equivalent construction:
        if indexer_type != "bool_series":
            execute(self.df.iloc[self.indexer])
        else:
            execute(self.df[self.indexer])

    def time_loc(self, shape, indexer_type):
        """Time label-based selection with the prepared indexer."""
        execute(self.df.loc[self.indexer])
class TimeIndexingColumns:
    """Benchmark selecting the first two columns via ``iloc``, ``loc`` and ``[]``."""

    params = [get_benchmark_shapes("TimeIndexing")]
    param_names = ["shape"]

    def setup(self, shape):
        """Prepare the positional indexer and the matching list of labels."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        self.numeric_indexer = [0, 1]
        selected = self.df.columns[self.numeric_indexer]
        self.labels_indexer = selected.tolist()

    def time_iloc(self, shape):
        """Time positional column selection."""
        picked = self.df.iloc[:, self.numeric_indexer]
        execute(picked)

    def time_loc(self, shape):
        """Time label-based column selection."""
        picked = self.df.loc[:, self.labels_indexer]
        execute(picked)

    def time___getitem__(self, shape):
        """Time column selection through ``[]`` with the list of labels."""
        picked = self.df[self.labels_indexer]
        execute(picked)
class TimeMultiIndexing:
    """Benchmark ``loc`` slicing on a frame with MultiIndex rows and columns."""

    params = [get_benchmark_shapes("TimeMultiIndexing")]
    param_names = ["shape"]

    def setup(self, shape):
        """Replace both axes with two-level MultiIndexes and sort the columns.

        Each MultiIndex is the product of the first half of the original axis
        labels with two string labels.
        """
        frame = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        half_rows = frame.index[: shape[0] // 2]
        row_index = IMPL.MultiIndex.from_product([half_rows, ["bar", "foo"]])
        half_cols = frame.columns[: shape[1] // 2]
        col_index = IMPL.MultiIndex.from_product([half_cols, ["buz", "fuz"]])
        frame.index = row_index
        frame.columns = col_index
        self.df = frame.sort_index(axis=1)

    def time_multiindex_loc(self, shape):
        """Time slicing from the third to the second-to-last label on both axes."""
        row_labels = self.df.index
        col_labels = self.df.columns
        selection = self.df.loc[
            row_labels[2] : row_labels[-2],
            col_labels[2] : col_labels[-2],
        ]
        execute(selection)
class TimeResetIndex:
    """Benchmark ``reset_index`` on a default index and on a named MultiIndex level."""

    params = [
        get_benchmark_shapes("TimeResetIndex"),
        [False, True],
        [None, "level_1"],
    ]
    param_names = ["shape", "drop", "level"]

    def setup(self, shape, drop, level):
        """Generate the frame; install a two-level named MultiIndex when ``level`` is set."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if not level:
            return
        half_rows = self.df.index[: shape[0] // 2]
        self.df.index = IMPL.MultiIndex.from_product(
            [half_rows, ["bar", "foo"]],
            names=["level_1", "level_2"],
        )

    def time_reset_index(self, shape, drop, level):
        """Time the reset."""
        flattened = self.df.reset_index(drop=drop, level=level)
        execute(flattened)
class TimeAstype:
    """Benchmark ``astype`` on one column or on the whole frame."""

    params = [
        get_benchmark_shapes("TimeAstype"),
        ["float64", "category"],
        ["one", "all"],
    ]
    param_names = ["shape", "dtype", "astype_ncolumns"]

    def setup(self, shape, dtype, astype_ncolumns):
        """Build the ``astype`` argument.

        ``"one"`` yields a ``{"col1": dtype}`` mapping and ``"all"`` the bare
        dtype; any other value raises ``ValueError``.
        """
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
        if astype_ncolumns == "one":
            self.astype_arg = {"col1": dtype}
        elif astype_ncolumns == "all":
            self.astype_arg = dtype
        else:
            raise ValueError(f"astype_ncolumns: {astype_ncolumns} isn't supported")

    def time_astype(self, shape, dtype, astype_ncolumns):
        """Time the conversion."""
        converted = self.df.astype(self.astype_arg)
        execute(converted)
class TimeDescribe:
    """Benchmark ``DataFrame.describe``."""

    params = [
        get_benchmark_shapes("TimeDescribe"),
    ]
    param_names = ["shape"]

    def setup(self, shape):
        """Generate the frame under test."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_describe(self, shape):
        """Time ``describe``."""
        summary = self.df.describe()
        execute(summary)
class TimeProperties:
    """Benchmark access to the ``shape``, ``columns`` and ``index`` properties."""

    params = [
        get_benchmark_shapes("TimeProperties"),
    ]
    param_names = ["shape"]

    def setup(self, shape):
        """Generate the frame under test."""
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_shape(self, shape):
        """Time reading ``shape``; the value is returned."""
        frame_shape = self.df.shape
        return frame_shape

    def time_columns(self, shape):
        """Time reading ``columns``; the value is returned."""
        frame_columns = self.df.columns
        return frame_columns

    def time_index(self, shape):
        """Time reading ``index``; the value is returned."""
        frame_index = self.df.index
        return frame_index
class TimeIndexingNumericSeries:
    """Benchmark ``[]``, ``iloc`` and ``loc`` on a Series with a numeric index.

    NOTE(review): the ``time_*`` methods name their second parameter ``index``
    while ``param_names`` calls it ``dtype``; presumably the values are passed
    positionally, so the mismatch is harmless - confirm.
    """

    params = [
        get_benchmark_shapes("TimeIndexingNumericSeries"),
        (np.int64, np.uint64, np.float64),
        ("unique_monotonic_inc", "nonunique_monotonic_inc"),
    ]
    param_names = ["shape", "dtype", "index_structure"]

    def setup(self, shape, dtype, index_structure):
        """Build the Series and the scalar/array/list indexers used by the benchmarks."""
        N = shape[0]
        unique_index = IMPL.Index(range(N), dtype=dtype)
        # Same length as the unique index, but the value ``N // 100 - 1``
        # appears twice.
        repeated_labels = (
            list(range(N // 100)) + [(N // 100) - 1] + list(range(N // 100, N - 1))
        )
        nonunique_index = IMPL.Index(repeated_labels, dtype=dtype)
        indices = {
            "unique_monotonic_inc": unique_index,
            "nonunique_monotonic_inc": nonunique_index,
        }
        self.data = IMPL.Series(np.random.rand(N), index=indices[index_structure])
        self.array = np.arange(N // 2)
        self.index_to_query = N // 2
        self.array_list = self.array.tolist()
        execute(self.data)

    def time_getitem_scalar(self, shape, index, index_structure):
        """Time ``[]`` with a scalar."""
        # not calling execute as execute function fails for scalar
        self.data[self.index_to_query]

    def time_getitem_slice(self, shape, index, index_structure):
        """Time ``[]`` with a slice."""
        selection = self.data[: self.index_to_query]
        execute(selection)

    def time_getitem_list_like(self, shape, index, index_structure):
        """Time ``[]`` with a one-element list."""
        selection = self.data[[self.index_to_query]]
        execute(selection)

    def time_getitem_array(self, shape, index, index_structure):
        """Time ``[]`` with a NumPy array."""
        selection = self.data[self.array]
        execute(selection)

    def time_getitem_lists(self, shape, index, index_structure):
        """Time ``[]`` with a Python list."""
        selection = self.data[self.array_list]
        execute(selection)

    def time_iloc_array(self, shape, index, index_structure):
        """Time ``iloc`` with a NumPy array."""
        selection = self.data.iloc[self.array]
        execute(selection)

    def time_iloc_list_like(self, shape, index, index_structure):
        """Time ``iloc`` with a one-element list."""
        selection = self.data.iloc[[self.index_to_query]]
        execute(selection)

    def time_iloc_scalar(self, shape, index, index_structure):
        """Time ``iloc`` with a scalar."""
        # not calling execute as execute function fails for scalar
        self.data.iloc[self.index_to_query]

    def time_iloc_slice(self, shape, index, index_structure):
        """Time ``iloc`` with a slice."""
        selection = self.data.iloc[: self.index_to_query]
        execute(selection)

    def time_loc_array(self, shape, index, index_structure):
        """Time ``loc`` with a NumPy array."""
        selection = self.data.loc[self.array]
        execute(selection)

    def time_loc_list_like(self, shape, index, index_structure):
        """Time ``loc`` with a one-element list."""
        selection = self.data.loc[[self.index_to_query]]
        execute(selection)

    def time_loc_scalar(self, shape, index, index_structure):
        """Time ``loc`` with a scalar; ``execute`` is not called on the result."""
        self.data.loc[self.index_to_query]

    def time_loc_slice(self, shape, index, index_structure):
        """Time ``loc`` with a slice."""
        selection = self.data.loc[: self.index_to_query]
        execute(selection)
class TimeReindex:
    """Benchmark ``reindex`` on date-indexed frames and MultiIndex-ed series."""

    params = [get_benchmark_shapes("TimeReindex")]
    param_names = ["shape"]

    def setup(self, shape):
        """Create the frames, series and target indexes, then execute them all."""
        rows, cols = shape

        # Frame indexed by minutely timestamps plus its every-other-label subset.
        rng = IMPL.date_range(start="1/1/1970", periods=rows, freq="1min")
        date_values = np.random.rand(rows, cols)
        self.df = IMPL.DataFrame(date_values, index=rng, columns=range(cols))
        self.df["foo"] = "bar"
        self.rng_subset = IMPL.Index(rng[::2])

        # Frame with integer row and column labels.
        self.df2 = IMPL.DataFrame(
            index=range(rows), data=np.random.rand(rows, cols), columns=range(cols)
        )

        # Series indexed by a two-level string MultiIndex.
        outer_labels = IMPL.Index(
            [f"i-{i}" for i in range(rows // 10)], dtype=object
        ).values
        level1 = outer_labels.repeat(10)
        inner_labels = IMPL.Index([f"i-{i}" for i in range(10)], dtype=object).values
        level2 = np.tile(inner_labels, rows // 10)
        index = IMPL.MultiIndex.from_arrays([level1, level2])
        self.s = IMPL.Series(np.random.randn(rows), index=index)
        self.s_subset = self.s[::2]
        self.s_subset_no_cache = self.s[::2].copy()

        # Series indexed by a (timestamp, int) MultiIndex.
        mi = IMPL.MultiIndex.from_product([rng[: len(rng) // 10], range(10)])
        self.s2 = IMPL.Series(np.random.randn(len(mi)), index=mi)
        self.s2_subset = self.s2[::2].copy()

        execute(self.df)
        execute(self.df2)
        execute(self.s)
        execute(self.s_subset)
        execute(self.s2)
        execute(self.s2_subset)
        execute(self.s_subset_no_cache)

    def time_reindex_dates(self, shape):
        """Time reindexing the date-indexed frame to every other timestamp."""
        reindexed = self.df.reindex(self.rng_subset)
        execute(reindexed)

    def time_reindex_columns(self, shape):
        """Time reindexing ``df2`` to the columns ``self.df.columns[1:5]``."""
        wanted = self.df.columns[1:5]
        execute(self.df2.reindex(columns=wanted))

    def time_reindex_multiindex_with_cache(self, shape):
        """Time reindexing to the subset's own index object."""
        # MultiIndex._values gets cached (pandas specific)
        target = self.s_subset.index
        execute(self.s.reindex(target))

    def time_reindex_multiindex_no_cache(self, shape):
        """Time reindexing to a fresh copy of the subset's index."""
        # Copy to avoid MultiIndex._values getting cached (pandas specific)
        target = self.s_subset_no_cache.index.copy()
        execute(self.s.reindex(target))

    def time_reindex_multiindex_no_cache_dates(self, shape):
        """Time reindexing the date-based subset to a copy of the full index."""
        # Copy to avoid MultiIndex._values getting cached (pandas specific)
        target = self.s2.index.copy()
        execute(self.s2_subset.reindex(target))
class TimeReindexMethod:
    """Benchmark ``Series.reindex`` with a fill ``method`` on date/period indexes."""

    param_names = ["shape", "method", "constructor"]
    params = [
        get_benchmark_shapes("TimeReindexMethod"),
        ["pad", "backfill"],
        [IMPL.date_range, IMPL.period_range],
    ]

    def setup(self, shape, method, constructor):
        """Build the full index and a series holding every other entry of it."""
        N = shape[0]
        self.idx = constructor("1/1/2000", periods=N, freq="1min")
        full_series = IMPL.Series(np.random.randn(N), index=self.idx)
        self.ts = full_series[::2]
        execute(self.ts)

    def time_reindex_method(self, shape, method, constructor):
        """Time reindexing back to the full index with ``method``."""
        restored = self.ts.reindex(self.idx, method=method)
        execute(restored)
class TimeFillnaMethodSeries:
    """Benchmark ``Series.fillna(method=...)`` on float64 and float32 data."""

    param_names = ["shape", "method"]
    params = [get_benchmark_shapes("TimeFillnaMethodSeries"), ["pad", "backfill"]]

    def setup(self, shape, method):
        """Create a series with gaps by reindexing an every-other-row subset."""
        N = shape[0]
        self.idx = IMPL.date_range("1/1/2000", periods=N, freq="1min")
        sparse = IMPL.Series(np.random.randn(N), index=self.idx)[::2]
        self.ts_reindexed = sparse.reindex(self.idx)
        self.ts_float32 = self.ts_reindexed.astype("float32")
        execute(self.ts_reindexed)
        execute(self.ts_float32)

    def time_reindexed(self, shape, method):
        """Time the fill on the reindexed series."""
        filled = self.ts_reindexed.fillna(method=method)
        execute(filled)

    def time_float_32(self, shape, method):
        """Time the fill on the float32 copy."""
        filled = self.ts_float32.fillna(method=method)
        execute(filled)
class TimeFillnaMethodDataframe:
    """Benchmark ``DataFrame.fillna(method=...)`` on float64 and float32 data."""

    param_names = ["shape", "method"]
    params = [get_benchmark_shapes("TimeFillnaMethodDataframe"), ["pad", "backfill"]]

    def setup(self, shape, method):
        """Create a frame with gaps by reindexing an every-other-row subset."""
        self.idx = IMPL.date_range("1/1/2000", periods=shape[0], freq="1min")
        sparse = IMPL.DataFrame(np.random.randn(*shape), index=self.idx)[::2]
        self.df_ts_reindexed = sparse.reindex(self.idx)
        self.df_ts_float32 = self.df_ts_reindexed.astype("float32")
        execute(self.df_ts_reindexed)
        execute(self.df_ts_float32)

    def time_reindexed(self, shape, method):
        """Time the fill on the reindexed frame."""
        filled = self.df_ts_reindexed.fillna(method=method)
        execute(filled)

    def time_float_32(self, shape, method):
        """Time the fill on the float32 copy."""
        filled = self.df_ts_float32.fillna(method=method)
        execute(filled)
class TimeLevelAlign:
    """Benchmark ``align``/``reindex`` on one level of a three-level MultiIndex."""

    param_names = ["shapes"]
    params = [get_benchmark_shapes("TimeLevelAlign")]

    def setup(self, shapes):
        """Build the MultiIndex-ed frame (``df1``) and the plain frame (``df2``)."""
        rows, cols = shapes[0]
        rows_sqrt = round(math.sqrt(rows))
        # the new number of rows may differ from the requested (slightly, so ok)
        rows = rows_sqrt * rows_sqrt

        side = np.arange(rows_sqrt)
        outer_codes = np.arange(10).repeat(rows)
        middle_codes = np.tile(np.arange(rows_sqrt).repeat(rows_sqrt), 10)
        inner_codes = np.tile(np.tile(np.arange(rows_sqrt), rows_sqrt), 10)
        self.index = IMPL.MultiIndex(
            levels=[np.arange(10), side, np.arange(rows_sqrt)],
            codes=[outer_codes, middle_codes, inner_codes],
        )

        left_values = np.random.randn(len(self.index), cols)
        self.df1 = IMPL.DataFrame(left_values, index=self.index)
        self.df2 = IMPL.DataFrame(np.random.randn(*shapes[1]))
        execute(self.df1)
        execute(self.df2)

    def time_align_level(self, shapes):
        """Time ``align`` on level 1 and execute both results."""
        left, right = self.df1.align(self.df2, level=1, copy=False)
        execute(left)
        execute(right)

    def time_reindex_level(self, shapes):
        """Time ``reindex`` of ``df2`` to the MultiIndex on level 1."""
        # `reindex` returns the same result here as `align`.
        # Approximately the same performance is expected.
        reindexed = self.df2.reindex(self.index, level=1)
        execute(reindexed)
class TimeDropDuplicatesDataframe:
    """Benchmark ``DataFrame.drop_duplicates`` over the string key columns."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDropDuplicatesDataframe")]

    def setup(self, shape):
        """Build a frame in which every key value is repeated 10 times."""
        rows, cols = shape
        N = rows // 10
        K = 10
        data = {}
        # dataframe would have cols-1 keys(strings) and one value(int) column
        for ordinal in range(1, cols):
            unique_keys = IMPL.Index([f"i-{i}" for i in range(N)], dtype=object)
            data["key" + str(ordinal)] = unique_keys.values.repeat(K)
        data["value"] = np.random.randn(N * K)
        self.df = IMPL.DataFrame(data)
        execute(self.df)

    def time_drop_dups(self, shape):
        """Time dropping duplicates over all columns but the last."""
        key_columns = self.df.columns[:-1]
        execute(self.df.drop_duplicates(key_columns))

    def time_drop_dups_inplace(self, shape):
        """Time the same operation with ``inplace=True``."""
        key_columns = self.df.columns[:-1]
        self.df.drop_duplicates(key_columns, inplace=True)
        execute(self.df)
class TimeDropDuplicatesSeries:
    """Benchmark ``Series.drop_duplicates`` on a series of strings."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDropDuplicatesSeries")]

    def setup(self, shape):
        """Build a series made of ``rows // 10`` distinct strings tiled 10 times."""
        rows = shape[0]
        distinct = IMPL.Index(
            [f"i-{i}" for i in range(rows // 10)], dtype=object
        ).values
        self.series = IMPL.Series(np.tile(distinct, 10))
        execute(self.series)

    def time_drop_dups(self, shape):
        """Time ``drop_duplicates`` returning a new series."""
        deduplicated = self.series.drop_duplicates()
        execute(deduplicated)

    def time_drop_dups_string(self, shape):
        """Time ``drop_duplicates`` with ``inplace=True``."""
        self.series.drop_duplicates(inplace=True)
        execute(self.series)
class TimeDatetimeAccessor:
    """Benchmark the ``.dt`` accessor on a series built from ``timedelta_range``."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeDatetimeAccessor")]

    def setup(self, shape):
        """Create a series of ``shape[0]`` hourly-spaced timedeltas starting at 1 day."""
        deltas = IMPL.timedelta_range("1 days", periods=shape[0], freq="h")
        self.series = IMPL.Series(deltas)
        execute(self.series)

    def time_dt_accessor(self, shape):
        """Time obtaining the accessor itself."""
        accessor = self.series.dt
        execute(accessor)

    def time_timedelta_days(self, shape):
        """Time ``dt.days``."""
        days = self.series.dt.days
        execute(days)

    def time_timedelta_seconds(self, shape):
        """Time ``dt.seconds``."""
        seconds = self.series.dt.seconds
        execute(seconds)
class BaseCategories:
    """Shared ``setup`` for the categorical accessor benchmarks."""

    def setup(self, shape):
        """Create a categorical series of random ``"sNNNN"`` strings."""
        rows = shape[0]
        codes = np.random.randint(0, rows // 10, size=rows)
        labels = [f"s{i:04d}" for i in codes]
        self.ts = IMPL.Series(labels).astype("category")
        execute(self.ts)
class TimeSetCategories(BaseCategories):
    """Benchmark ``cat.set_categories`` with every other existing category."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeSetCategories")]

    def time_set_categories(self, shape):
        """Time the category replacement."""
        kept = self.ts.cat.categories[::2]
        execute(self.ts.cat.set_categories(kept))
class TimeRemoveCategories(BaseCategories):
    """Benchmark ``cat.remove_categories`` with every other existing category."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeRemoveCategories")]

    def time_remove_categories(self, shape):
        """Time the category removal."""
        removed = self.ts.cat.categories[::2]
        execute(self.ts.cat.remove_categories(removed))
class BaseReshape:
    """Shared ``setup`` for the stack/unstack benchmarks."""

    def setup(self, shape):
        """Create a random frame indexed by a two-level integer MultiIndex."""
        rows, cols = shape
        k = 10
        groups = np.arange(rows // k)
        first_level = groups.repeat(k)
        second_level = np.roll(np.tile(np.arange(rows // k), k), 25)
        index = IMPL.MultiIndex.from_arrays([first_level, second_level])
        values = np.random.randn(rows, cols)
        self.df = IMPL.DataFrame(values, index=index)
        execute(self.df)
class TimeStack(BaseReshape):
    """Benchmark ``DataFrame.stack`` on a previously unstacked frame."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeStack")]

    def setup(self, shape):
        """Build the base frame, then keep its level-1 unstacked form in ``self.udf``."""
        super().setup(shape)
        unstacked = self.df.unstack(1)
        self.udf = unstacked
        execute(self.udf)

    def time_stack(self, shape):
        """Time stacking the unstacked frame."""
        stacked = self.udf.stack()
        execute(stacked)
class TimeUnstack(BaseReshape):
    """Benchmark ``DataFrame.unstack`` on index level 1."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeUnstack")]

    def time_unstack(self, shape):
        """Time unstacking level 1 of the frame's index."""
        unstacked = self.df.unstack(1)
        execute(unstacked)
class TimeReplace:
    """Benchmark ``DataFrame.replace`` with a dictionary mapping integers to timestamps."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeReplace")]

    def setup(self, shape):
        """Prepare the replacement mapping and a frame of random integers below ``rows``."""
        n_rows, n_cols = shape
        mapping = {}
        for value in range(n_rows):
            mapping[value] = IMPL.Timestamp(value)
        self.to_replace = mapping
        self.df = IMPL.DataFrame(np.random.randint(n_rows, size=(n_rows, n_cols)))
        execute(self.df)

    def time_replace(self, shape):
        """Time replacing values according to ``self.to_replace``."""
        replaced = self.df.replace(self.to_replace)
        execute(replaced)
class TimeGroups:
    """Benchmark the ``groups`` and ``indices`` attributes of a series groupby."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeGroups")]

    def setup(self, shape):
        """Create a series of ``shape[0]`` random integers in ``[0, 100)``."""
        values = np.random.randint(0, 100, size=shape[0])
        self.series = IMPL.Series(values)
        execute(self.series)

    def time_series_groups(self, shape):
        """Time ``groupby(...).groups``; the result is a dict-like, so ``execute`` is not called."""
        grouped = self.series.groupby(self.series)
        grouped.groups

    def time_series_indices(self, shape):
        """Time ``groupby(...).indices``; the result is a dict, so ``execute`` is not called."""
        grouped = self.series.groupby(self.series)
        grouped.indices
class TimeRepr:
    """Benchmark ``repr`` of a frame filled with random normal values."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeRepr")]

    def setup(self, shape):
        """Create a random frame of the requested shape."""
        values = np.random.randn(*shape)
        self.df = IMPL.DataFrame(values)
        execute(self.df)

    def time_repr(self, shape):
        """Time building the string representation; a string is returned, so no ``execute``."""
        repr(self.df)
class TimeMaskBool:
    """Benchmark ``DataFrame.mask`` with a boolean frame as the condition."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeMaskBool")]

    def setup(self, shape):
        """Create a random frame and the boolean mask of its negative entries."""
        self.df = IMPL.DataFrame(np.random.randn(*shape))
        self.mask = self.df < 0
        # materialise both operands before timing starts
        execute(self.df)
        execute(self.mask)

    def time_frame_mask(self, shape):
        """Time masking the frame with the precomputed condition."""
        masked = self.df.mask(self.mask)
        execute(masked)
class TimeIsnull:
    """Benchmark the top-level ``isnull`` function on a frame mixing NaN and 1.0."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeIsnull")]

    def setup(self, shape):
        """Fill a frame by sampling each cell from ``{NaN, 1.0}``."""
        candidates = np.array([np.nan, 1.0])
        cells = np.random.choice(candidates, (shape[0], shape[1]))
        self.df = IMPL.DataFrame(cells)
        execute(self.df)

    def time_isnull(self, shape):
        """Time computing the null mask of the frame."""
        null_mask = IMPL.isnull(self.df)
        execute(null_mask)
class TimeDropna:
    """Benchmark ``DataFrame.dropna`` for each combination of ``how`` and ``axis``."""

    param_names = ["how", "axis", "shape"]
    params = (["all", "any"], [0, 1], get_benchmark_shapes("TimeDropna"))

    def setup(self, how, axis, shape):
        """Create a random frame with a rectangular NaN region and a string column."""
        n_rows, n_cols = shape
        frame = IMPL.DataFrame(np.random.randn(n_rows, n_cols))
        # blank out a block of rows/columns so that there is something to drop
        frame.iloc[n_rows // 20 : n_rows // 10, n_cols // 3 : n_cols // 2] = np.nan
        frame["foo"] = "bar"
        self.df = frame
        execute(self.df)

    def time_dropna(self, how, axis, shape):
        """Time dropping missing data along ``axis`` using the ``how`` rule."""
        cleaned = self.df.dropna(how=how, axis=axis)
        execute(cleaned)
class TimeEquals:
    """Benchmark ``DataFrame.equals`` of a float frame compared with itself."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeEquals")]

    def setup(self, shape):
        """Create a random frame whose last cell is NaN."""
        frame = IMPL.DataFrame(np.random.randn(*shape))
        frame.iloc[-1, -1] = np.nan
        self.df = frame
        execute(self.df)

    def time_frame_float_equal(self, shape):
        """Time the comparison; a boolean is returned, so ``execute`` is not called."""
        frame = self.df
        frame.equals(frame)
from .utils import setup # noqa: E402, F401
================================================
FILE: asv_bench/benchmarks/io/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""IO Modin benchmarks."""
================================================
FILE: asv_bench/benchmarks/io/csv.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import numpy as np
from ..utils import (
ASV_USE_IMPL,
IMPL,
RAND_HIGH,
RAND_LOW,
execute,
generate_dataframe,
get_benchmark_shapes,
get_shape_id,
prepare_io_data,
)
class BaseReadCsv:
    """
    Common fixture for the ``read_csv`` benchmarks.

    Subclasses are expected to provide a ``data_type`` attribute, which is passed
    to ``prepare_io_data``.
    """

    def setup_cache(self, test_filename="io_test_file"):
        """Write the test CSV files (done only once) and return their names keyed by shape id."""
        shapes = get_benchmark_shapes(type(self).__name__)
        return prepare_io_data(test_filename, self.data_type, shapes)

    def setup(self, test_filenames, shape, *args, **kwargs):
        """Initialise the engine for Modin runs and remember the id of ``shape``."""
        if ASV_USE_IMPL == "modin":
            # ray init
            IMPL.DataFrame([])
        self.shape_id = get_shape_id(shape)
class TimeReadCsvSkiprows(BaseReadCsv):
    """Benchmark ``read_csv`` with different kinds of ``skiprows`` arguments."""

    shapes = get_benchmark_shapes("TimeReadCsvSkiprows")
    data_type = "str_int"

    # Named ``skiprows`` values; the array variants are sized from the row count
    # of the first benchmark shape.
    skiprows_mapping = {
        "lambda_even_rows": lambda x: x % 2,
        "range_uniform": np.arange(1, shapes[0][0] // 10),
        "range_step2": np.arange(1, shapes[0][0], 2),
    }

    param_names = ["shape", "skiprows"]
    params = [
        shapes,
        [None, "lambda_even_rows", "range_uniform", "range_step2"],
    ]

    def setup(self, test_filenames, shape, skiprows):
        """Run the base setup and resolve the ``skiprows`` name to its actual value."""
        super().setup(test_filenames, shape, skiprows)
        if skiprows:
            self.skiprows = self.skiprows_mapping[skiprows]
        else:
            self.skiprows = None

    def time_skiprows(self, test_filenames, shape, skiprows):
        """Time reading the CSV file with the resolved ``skiprows`` value."""
        path = test_filenames[self.shape_id]
        frame = IMPL.read_csv(path, skiprows=self.skiprows)
        execute(frame)
class TimeReadCsvTrueFalseValues(BaseReadCsv):
    """Benchmark ``read_csv`` with custom ``true_values`` / ``false_values``."""

    data_type = "true_false_int"
    params = [get_benchmark_shapes("TimeReadCsvTrueFalseValues")]
    param_names = ["shape"]

    def time_true_false_values(self, test_filenames, shape):
        """Time reading a file whose string columns are parsed as booleans."""
        path = test_filenames[self.shape_id]
        frame = IMPL.read_csv(
            path,
            true_values=["Yes", "true"],
            false_values=["No", "false"],
        )
        execute(frame)
class TimeReadCsvNamesDtype:
    """Benchmark ``read_csv`` when ``names``, ``dtype`` and ``parse_dates`` are supplied."""

    shapes = get_benchmark_shapes("TimeReadCsvNamesDtype")
    _dtypes_params = ["Int64", "Int64_Timestamp"]
    _timestamp_columns = ["col1", "col2"]

    param_names = ["shape", "names", "dtype"]
    params = [
        shapes,
        ["array-like"],
        _dtypes_params,
    ]

    def _get_file_id(self, shape, dtype):
        """Return the cache key for a ``(shape, dtype)`` pair."""
        shape_id = get_shape_id(shape)
        return shape_id + dtype

    def _add_timestamp_columns(self, df):
        """Return a copy of ``df`` whose timestamp columns hold a millisecond date range."""
        result = df.copy()
        date_column = IMPL.date_range("2000", periods=result.shape[0], freq="ms")
        for name in self._timestamp_columns:
            result[name] = date_column
        return result

    def setup_cache(self, test_filename="io_test_file_csv_names_dtype"):
        """
        Write one CSV file per ``(shape, dtype)`` combination.

        Returns
        -------
        dict
            Maps a file id to ``(filename, column names, column dtypes)`` of the
            dataframe saved under that filename.
        """
        cache = {}
        for shape in self.shapes:
            for dtype in self._dtypes_params:
                df = generate_dataframe(
                    "int", *shape, RAND_LOW, RAND_HIGH, impl="pandas"
                )
                if dtype == "Int64_Timestamp":
                    df = self._add_timestamp_columns(df)

                file_id = self._get_file_id(shape, dtype)
                filename = f"{test_filename}_{file_id}.csv"
                cache[file_id] = (
                    filename,
                    df.columns.to_list(),
                    df.dtypes.to_dict(),
                )
                df.to_csv(filename, index=False)
        return cache

    def setup(self, cache, shape, names, dtype):
        """Initialise the engine for Modin runs and load the arguments for ``read_csv``."""
        if ASV_USE_IMPL == "modin":
            # ray init
            IMPL.DataFrame([])

        self.filename, self.names, self.dtype = cache[self._get_file_id(shape, dtype)]
        self.parse_dates = None
        if dtype != "Int64_Timestamp":
            return

        # work on a copy: the cached version of dtype should not change
        dtype_without_dates = self.dtype.copy()
        for name in self._timestamp_columns:
            dtype_without_dates.pop(name)
        self.dtype = dtype_without_dates
        self.parse_dates = self._timestamp_columns

    def time_read_csv_names_dtype(self, cache, shape, names, dtype):
        """Time reading the file with explicit names, dtypes and date columns."""
        frame = IMPL.read_csv(
            self.filename,
            names=self.names,
            header=0,
            dtype=self.dtype,
            parse_dates=self.parse_dates,
        )
        execute(frame)
from ..utils import setup # noqa: E402, F401
================================================
FILE: asv_bench/benchmarks/io/parquet.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
from ..utils import (
ASV_USE_IMPL,
IMPL,
execute,
get_benchmark_shapes,
get_shape_id,
prepare_io_data_parquet,
)
class TimeReadParquet:
    """Benchmark ``read_parquet`` on pre-generated files."""

    data_type = "str_int"
    shapes = get_benchmark_shapes("TimeReadParquet")

    param_names = ["shape"]
    params = [
        shapes,
    ]

    def setup_cache(self, test_filename="io_test_file"):
        """Write the test parquet files (done only once) and return their names keyed by shape id."""
        shapes = get_benchmark_shapes(type(self).__name__)
        return prepare_io_data_parquet(test_filename, self.data_type, shapes)

    def setup(self, test_filenames, shape):
        """Initialise the engine for Modin runs and remember the id of ``shape``."""
        if ASV_USE_IMPL == "modin":
            # ray init
            IMPL.DataFrame([])
        self.shape_id = get_shape_id(shape)

    def time_read_parquet(self, test_filenames, shape):
        """Time reading the parquet file that corresponds to ``shape``."""
        path = test_filenames[self.shape_id]
        frame = IMPL.read_parquet(path)
        execute(frame)
from ..utils import setup # noqa: E402, F401
================================================
FILE: asv_bench/benchmarks/scalability/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Benchmarks measuring how Modin performance scales when MODIN_CPUS are changed."""
================================================
FILE: asv_bench/benchmarks/scalability/scalability_benchmarks.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""These benchmarks are supposed to be run only for modin, since they do not make sense for pandas."""
import modin.pandas as pd
try:
from modin.pandas.io import from_pandas
except ImportError:
from modin.pandas.utils import from_pandas
try:
from modin.pandas.io import to_numpy, to_pandas
except ImportError:
try:
from modin.utils import to_numpy, to_pandas
except ImportError:
# This provides compatibility with older versions of Modin, allowing us to test old commits.
from modin.pandas.utils import to_pandas
import pandas
from ..utils import (
RAND_HIGH,
RAND_LOW,
execute,
gen_data,
generate_dataframe,
get_benchmark_shapes,
)
class TimeFromPandas:
    """Benchmark ``from_pandas`` for several values reported by ``NPartitions.get``."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeFromPandas"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """Create the pandas input and make ``NPartitions.get`` return ``cpus``."""
        raw = gen_data("int", *shape, RAND_LOW, RAND_HIGH)
        self.data = pandas.DataFrame(raw)

        from modin.config import NPartitions

        def fixed_npartitions():
            return cpus

        NPartitions.get = fixed_npartitions
        # trigger ray init
        pd.DataFrame([])

    def time_from_pandas(self, shape, cpus):
        """Time converting the pandas frame into a Modin frame."""
        converted = from_pandas(self.data)
        execute(converted)
class TimeToPandas:
    """Benchmark ``to_pandas`` for several values reported by ``NPartitions.get``."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeToPandas"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """Make ``NPartitions.get`` return ``cpus`` and create the Modin input frame."""
        from modin.config import NPartitions

        def fixed_npartitions():
            return cpus

        NPartitions.get = fixed_npartitions
        self.data = generate_dataframe(
            "int", *shape, RAND_LOW, RAND_HIGH, impl="modin"
        )

    def time_to_pandas(self, shape, cpus):
        """Time the conversion; ``to_pandas`` is already synchronous, so no ``execute``."""
        to_pandas(self.data)
class TimeToNumPy:
    """Benchmark ``to_numpy`` for several values reported by ``NPartitions.get``."""

    param_names = ["shape", "cpus"]
    params = [
        get_benchmark_shapes("TimeToNumPy"),
        [4, 16, 32],
    ]

    def setup(self, shape, cpus):
        """Make ``NPartitions.get`` return ``cpus`` and create the Modin input frame."""
        from modin.config import NPartitions

        def fixed_npartitions():
            return cpus

        NPartitions.get = fixed_npartitions
        self.data = generate_dataframe(
            "int", *shape, RAND_LOW, RAND_HIGH, impl="modin"
        )

    def time_to_numpy(self, shape, cpus):
        """Time the conversion; ``to_numpy`` is already synchronous, so no ``execute``."""
        to_numpy(self.data)
from ..utils import setup # noqa: E402, F401
================================================
FILE: asv_bench/benchmarks/utils/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Modin benchmarks utils."""
from .common import (
IMPL,
execute,
gen_data,
gen_nan_data,
generate_dataframe,
get_shape_id,
prepare_io_data,
prepare_io_data_parquet,
random_booleans,
random_columns,
random_string,
setup,
translator_groupby_ngroups,
)
from .compatibility import ASV_USE_IMPL, ASV_USE_STORAGE_FORMAT
from .data_shapes import GROUPBY_NGROUPS, RAND_HIGH, RAND_LOW, get_benchmark_shapes
__all__ = [
"ASV_USE_IMPL",
"ASV_USE_STORAGE_FORMAT",
"RAND_LOW",
"RAND_HIGH",
"GROUPBY_NGROUPS",
"get_benchmark_shapes",
"IMPL",
"execute",
"get_shape_id",
"gen_data",
"gen_nan_data",
"generate_dataframe",
"prepare_io_data",
"prepare_io_data_parquet",
"random_string",
"random_columns",
"random_booleans",
"translator_groupby_ngroups",
"setup",
]
================================================
FILE: asv_bench/benchmarks/utils/common.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""
The module contains the functionality that is used when benchmarking Modin commits.
In the case of using utilities from the main Modin code, there is a chance that when
benchmarking old commits, the utilities changed, which in turn can unexpectedly affect
the performance results, hence some utility functions are duplicated here.
"""
import logging
import uuid
from typing import Optional, Union
import numpy as np
import pandas
import modin.pandas
from .compatibility import ASV_DATASET_SIZE, ASV_USE_ENGINE, ASV_USE_IMPL
from .data_shapes import RAND_HIGH, RAND_LOW
# Maps an implementation name to the module that provides its DataFrame API.
POSSIBLE_IMPL = {
    "modin": modin.pandas,
    "pandas": pandas,
}
# Module of the implementation selected through ``ASV_USE_IMPL``; the benchmarks
# build their objects from it (``IMPL.DataFrame``, ``IMPL.Series``, ...).
IMPL = POSSIBLE_IMPL[ASV_USE_IMPL]
def translator_groupby_ngroups(groupby_ngroups: Union[str, int], shape: tuple) -> int:
    """
    Resolve a symbolic number of groups into a concrete value.

    Only the ``"huge_amount_groups"`` placeholder is translated, and only when the
    "big" dataset size is in use; any other value is handed back untouched.

    Parameters
    ----------
    groupby_ngroups : str or int
        Number of groups that will be used in `groupby` operation.
    shape : tuple
        Same as pandas.Dataframe.shape.

    Returns
    -------
    int
    """
    is_placeholder = groupby_ngroups == "huge_amount_groups"
    if ASV_DATASET_SIZE == "big" and is_placeholder:
        # half of the rows, capped at 5000 groups
        return min(shape[0] // 2, 5000)
    return groupby_ngroups
class weakdict(dict):  # noqa: GL08
    # ``dict`` subclass that declares a ``__weakref__`` slot, so its instances can
    # be the target of weak references.
    __slots__ = ("__weakref__",)


# Cache of generated raw data; written by ``gen_nan_data`` and ``gen_data``.
data_cache = dict()
# Cache of generated dataframes; written by ``generate_dataframe``.
dataframes_cache = dict()
def gen_nan_data(
    nrows: int, ncols: int
) -> Union[
    modin.pandas.DataFrame, pandas.DataFrame, modin.pandas.Series, pandas.Series
]:
    """
    Generate nan data with caching.

    The generated data are saved in the dictionary and on a subsequent call,
    if the keys match, saved data will be returned. Therefore, we need
    to carefully monitor the changing of saved data and make its copy if needed.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns; must be >= 1.

    Returns
    -------
    modin.pandas.DataFrame or pandas.DataFrame or modin.pandas.Series or pandas.Series
        DataFrame or Series with shape (nrows, ncols) or (nrows,), respectively.

    Raises
    ------
    AssertionError
        If `ncols` is less than 1.
    """
    cache_key = (ASV_USE_IMPL, nrows, ncols)
    if cache_key in data_cache:
        # the very same object is returned on every hit - callers must not mutate it
        return data_cache[cache_key]

    logging.info("Generating nan data {} rows and {} columns".format(nrows, ncols))

    if ncols > 1:
        columns = [f"col{x}" for x in range(ncols)]
        data = IMPL.DataFrame(np.nan, index=IMPL.RangeIndex(nrows), columns=columns)
    elif ncols == 1:
        data = IMPL.Series(np.nan, index=IMPL.RangeIndex(nrows))
    else:
        # Raised explicitly instead of ``assert False`` so the check is not stripped
        # when Python runs with -O (which would leave ``data`` unbound below).
        raise AssertionError("Number of columns (ncols) should be >= 1")

    data_cache[cache_key] = data
    return data
def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
    """
    Build a dictionary of random integer columns.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
    """
    columns = {}
    # columns are drawn one after another, in order of their index
    for col_idx in range(ncols):
        column_name = "col{}".format(col_idx)
        columns[column_name] = np.random.randint(rand_low, rand_high, size=nrows)
    return columns
def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
    """
    Build random integer columns and turn the first of them into strings.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
        One of the columns with string values.
    """
    columns = dict(gen_int_data(nrows, ncols, rand_low, rand_high))
    # the first generated column is the one converted to string type
    string_key = list(columns)[0]
    int_values = columns[string_key]
    columns[string_key] = [f"str_{x}" for x in int_values]
    return columns
def gen_true_false_int_data(
    nrows: int, ncols: int, rand_low: int, rand_high: int
) -> dict:
    """
    Generate int data and string data "true" and "false" values.

    Parameters
    ----------
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
        One half of the columns with integer values, another half - with "true" and
        "false" string values.
    """
    # Only the *columns* are split in halves. Every column holds `nrows` values;
    # previously the row count was halved too, which produced columns shorter than
    # requested and of unequal length whenever `nrows` was odd.
    n_int_cols = ncols // 2
    data = gen_int_data(nrows, n_int_cols, rand_low, rand_high)

    data_true_false = {
        "tf_col{}".format(i): np.random.choice(
            ["Yes", "true", "No", "false"], size=nrows
        )
        for i in range(ncols - n_int_cols)
    }
    data.update(data_true_false)
    return data
def gen_data(
    data_type: str,
    nrows: int,
    ncols: int,
    rand_low: int,
    rand_high: int,
) -> dict:
    """
    Produce raw column data of the requested kind, memoising the result.

    The first call for a given set of arguments returns the freshly generated
    dictionary and stores a ``weakdict`` copy of it in ``data_cache``; later calls
    with the same arguments return that stored copy. Since the stored object is
    shared between callers, take a copy before modifying the result.

    Parameters
    ----------
    data_type : {"int", "str_int", "true_false_int"}
        Type of data generation.
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.

    Returns
    -------
    dict
        Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.

    Notes
    -----
    Returned data type depends on the `data_type` parameter in the next way:
    - `data_type`=="int" - all columns will be contain only integer values;
    - `data_type`=="str_int" some of the columns will be of string type;
    - `data_type`=="true_false_int" half of the columns will be filled with
      string values representing "true" and "false" values and another half - with
      integers.
    """
    generators = {
        "int": gen_int_data,
        "str_int": gen_str_int_data,
        "true_false_int": gen_true_false_int_data,
    }

    key = (data_type, nrows, ncols, rand_low, rand_high)
    cached = data_cache.get(key)
    if key in data_cache:
        return cached

    message = "Generating {} data {} rows and {} columns [{}-{}]".format(
        data_type, nrows, ncols, rand_low, rand_high
    )
    logging.info(message)

    assert data_type in generators
    generate = generators[data_type]
    data = generate(nrows, ncols, rand_low, rand_high)

    # the cache keeps its own (shallow) copy of the generated dictionary
    data_cache[key] = weakdict(data)
    return data
def generate_dataframe(
    data_type: str,
    nrows: int,
    ncols: int,
    rand_low: int,
    rand_high: int,
    groupby_ncols: Optional[int] = None,
    count_groups: Optional[int] = None,
    gen_unique_key: bool = False,
    cache_prefix: Optional[str] = None,
    impl: Optional[str] = None,
) -> Union[modin.pandas.DataFrame, pandas.DataFrame]:
    """
    Generate DataFrame with caching.

    The generated dataframes are saved in the dictionary and on a subsequent call,
    if the keys match, one of the saved dataframes will be returned. Therefore, we need
    to carefully monitor that operations that change the dataframe work with its copy.

    Parameters
    ----------
    data_type : str
        Type of data generation;
        supported types: {"int", "str_int"}.
    nrows : int
        Number of rows.
    ncols : int
        Number of columns.
    rand_low : int
        Low bound for random generator.
    rand_high : int
        High bound for random generator.
    groupby_ncols : int, default: None
        Number of columns for which `groupby` will be called in the future;
        to get more stable performance results, we need to have the same number of values
        in each group every benchmarking time.
    count_groups : int, default: None
        Count of groups in groupby columns.
    gen_unique_key : bool, default: False
        Generate `col1` column where all elements are unique.
    cache_prefix : str, optional
        Prefix to add to the cache key of the requested frame.
    impl : str, optional
        Implementation used to create the dataframe;
        supported implementations: {"modin", "pandas"}.

    Returns
    -------
    modin.pandas.DataFrame or pandas.DataFrame [and list]

    Notes
    -----
    The list of groupby columns names returns when groupby columns are generated.
    """
    assert not (
        (groupby_ncols is None) ^ (count_groups is None)
    ), "You must either specify both parameters 'groupby_ncols' and 'count_groups' or none of them."

    with_groupby = bool(groupby_ncols and count_groups)
    if with_groupby:
        # the groupby columns are counted as part of the requested `ncols`
        ncols -= groupby_ncols

    if impl is None:
        impl = ASV_USE_IMPL

    cache_key = (
        impl,
        data_type,
        nrows,
        ncols,
        rand_low,
        rand_high,
        groupby_ncols,
        count_groups,
        gen_unique_key,
    )
    if cache_prefix is not None:
        cache_key = (cache_prefix, *cache_key)

    if cache_key in dataframes_cache:
        return dataframes_cache[cache_key]

    logging.info(
        "Allocating {} DataFrame {}: {} rows and {} columns [{}-{}]".format(
            impl, data_type, nrows, ncols, rand_low, rand_high
        )
    )
    # `gen_data` may hand back its cached dictionary, which is shared between
    # callers. Work on a shallow copy so the extra columns added below never leak
    # into frames generated later from the same raw data.
    data = dict(gen_data(data_type, nrows, ncols, rand_low, rand_high))

    if with_groupby:
        groupby_columns = [f"groupby_col{x}" for x in range(groupby_ncols)]
        for groupby_col in groupby_columns:
            # Cycle 0..count_groups-1 until exactly `nrows` values are produced;
            # equal to tiling when `nrows` is a multiple of `count_groups`, and
            # still of the right length when it is not.
            data[groupby_col] = np.resize(np.arange(count_groups), nrows)

    if gen_unique_key:
        data["col1"] = np.arange(nrows)

    df = POSSIBLE_IMPL[impl].DataFrame(data)

    if with_groupby:
        dataframes_cache[cache_key] = df, groupby_columns
        return df, groupby_columns

    dataframes_cache[cache_key] = df
    return df
def random_string() -> str:
    """
    Return the text form of a freshly generated random UUID (36 characters).

    Returns
    -------
    str
    """
    token = uuid.uuid4()
    return str(token)
def random_columns(df_columns: list, columns_number: int) -> list:
    """
    Randomly draw `columns_number` entries from `df_columns`.

    The draw uses the default settings of ``np.random.choice``.

    Parameters
    ----------
    df_columns : list
        Columns to choose from.
    columns_number : int
        How many columns to pick.

    Returns
    -------
    list
    """
    picked = np.random.choice(df_columns, size=columns_number)
    return [*picked]
def random_booleans(number: int) -> list:
    """
    Draw `number` random boolean values.

    Parameters
    ----------
    number : int
        Count of booleans in result list.

    Returns
    -------
    list
    """
    flags = np.random.choice([True, False], size=number)
    return [*flags]
def execute(df: Union[modin.pandas.DataFrame, pandas.DataFrame]):
    """
    Make sure the calculations are finished.

    For pandas nothing has to be done. For Modin the partitions of the underlying
    frame are waited for: through the partition manager's ``wait_partitions`` when
    it is available, otherwise through an engine-specific fallback.

    Parameters
    ----------
    df : modin.pandas.DataFrame or pandas.DataFrame
        DataFrame to be executed.
    """
    if ASV_USE_IMPL != "modin":
        # nothing to wait for with the pandas implementation
        return

    modin_frame = df._query_compiler._modin_frame
    partitions = modin_frame._partitions.flatten()
    mgr_cls = modin_frame._partition_mgr_cls
    if len(partitions) and hasattr(mgr_cls, "wait_partitions"):
        mgr_cls.wait_partitions(partitions)
        return

    # compatibility with old Modin versions
    for partition in partitions:
        partition.drain_call_queue()

    # Plain loops are used on purpose: every partition must be waited for, no
    # matter what the individual ``wait`` calls return.
    if ASV_USE_ENGINE == "ray":
        from ray import wait

        for partition in partitions:
            wait([partition._data])
    elif ASV_USE_ENGINE == "dask":
        from dask.distributed import wait

        for partition in partitions:
            wait(partition._data)
    # no additional waiting is performed for any other engine
def get_shape_id(shape: tuple) -> str:
    """
    Build a string identifier from shape numbers, separated by `_`.

    Parameters
    ----------
    shape : tuple
        Same as pandas.Dataframe.shape.

    Returns
    -------
    str
    """
    parts = map(str, shape)
    return "_".join(parts)
def prepare_io_data(test_filename: str, data_type: str, shapes: list):
    """
    Write one CSV file per requested shape for the IO benchmarks.

    The dataframes are obtained from ``generate_dataframe`` (which caches them)
    using the pandas implementation.

    Parameters
    ----------
    test_filename : str
        Unique file identifier that is used to distinguish data
        for different tests.
    data_type : {"int", "str_int", "true_false_int"}
        Type of data generation.
    shapes : list
        Data shapes to prepare.

    Returns
    -------
    test_filenames : dict
        Dictionary that maps dataset shape to the file on disk.
    """
    test_filenames = {}
    for shape in shapes:
        shape_id = get_shape_id(shape)
        path = f"{test_filename}_{shape_id}_{data_type}.csv"
        test_filenames[shape_id] = path
        frame = generate_dataframe(
            data_type, *shape, RAND_LOW, RAND_HIGH, impl="pandas"
        )
        frame.to_csv(path, index=False)
    return test_filenames
def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
    """
    Write one parquet file per requested shape for the IO benchmarks.

    The dataframes are obtained from ``generate_dataframe`` (which caches them)
    using the pandas implementation.

    Parameters
    ----------
    test_filename : str
        Unique file identifier that is used to distinguish data
        for different tests.
    data_type : "str_int"
        Type of data generation.
    shapes : list
        Data shapes to prepare.

    Returns
    -------
    test_filenames : dict
        Dictionary that maps dataset shape to the file on disk.
    """
    test_filenames = {}
    for shape in shapes:
        shape_id = get_shape_id(shape)
        path = f"{test_filename}_{shape_id}_{data_type}.parquet"
        test_filenames[shape_id] = path
        frame = generate_dataframe(
            data_type, *shape, RAND_LOW, RAND_HIGH, impl="pandas"
        )
        frame.to_parquet(path, index=False)
    return test_filenames
def setup(*args, **kwargs):  # noqa: GL08
    # Importing this function into a benchmark file is enough: ASV runs it
    # automatically before each benchmark function, which resets the NumPy
    # random seed. All arguments are ignored.
    # https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    seed = 42
    np.random.seed(seed)
================================================
FILE: asv_bench/benchmarks/utils/compatibility.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Compatibility layer for parameters used by ASV."""
import os
import modin.pandas as pd
# Number of partitions: taken from ``modin.config`` when that import works,
# otherwise from the ``DEFAULT_NPARTITIONS`` attribute of ``modin.pandas``.
try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

# Benchmark settings: read from ``modin.config`` when all of the listed parameters
# can be imported, otherwise read directly from the environment variables below.
try:
    from modin.config import AsvImplementation, Engine, StorageFormat, TestDatasetSize

    ASV_USE_IMPL = AsvImplementation.get()
    # an unset (falsy) dataset size falls back to "Small"
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
    ASV_USE_ENGINE = Engine.get()
    ASV_USE_STORAGE_FORMAT = StorageFormat.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
    ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray")
    ASV_USE_STORAGE_FORMAT = os.environ.get("MODIN_STORAGE_FORMAT", "Pandas")
# Normalise the settings to lower case so that the comparisons made by the
# benchmarks are case-insensitive with respect to the configured values.
ASV_USE_IMPL = ASV_USE_IMPL.lower()
ASV_DATASET_SIZE = ASV_DATASET_SIZE.lower()
ASV_USE_ENGINE = ASV_USE_ENGINE.lower()
ASV_USE_STORAGE_FORMAT = ASV_USE_STORAGE_FORMAT.lower()

# Fail early on unsupported values.
assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
assert ASV_USE_ENGINE in ("ray", "dask", "python", "unidist")
# One-element tuple (note the trailing comma): with a bare string on the right-hand
# side, ``in`` performs a substring test and would accept values such as "" or "pan".
assert ASV_USE_STORAGE_FORMAT in ("pandas",)
================================================
FILE: asv_bench/benchmarks/utils/data_shapes.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Define data shapes."""
import json
import os
from .compatibility import ASV_DATASET_SIZE
# Numeric range constants (presumably the bounds for randomly generated
# benchmark data -- confirm against the modules that import these names).
RAND_LOW, RAND_HIGH = 0, 100

# Every table below is keyed by the dataset size ("big" / "small") and holds a
# list of shape sets. NOTE(review): a shape looks like [nrows, ncols] -- confirm.

# Shape pairs for benchmarks that take two frames.
BINARY_OP_DATA_SIZE = dict(
    big=[
        [[5000, 5000], [5000, 5000]],
        # disabled: this case is extremely inefficient
        # [[20, 500_000], [10, 1_000_000]],
        [[500_000, 20], [1_000_000, 10]],
    ],
    small=[
        [[250, 250], [250, 250]],
        [[10_000, 20], [25_000, 10]],
    ],
)

# Single shapes for benchmarks that take one frame.
UNARY_OP_DATA_SIZE = dict(
    big=[
        [5000, 5000],
        # disabled: this case is extremely inefficient
        # [10, 1_000_000],
        [1_000_000, 10],
    ],
    small=[
        [250, 250],
        [10_000, 10],
    ],
)

# Single-column shapes.
SERIES_DATA_SIZE = dict(
    big=[[100_000, 1]],
    small=[[10_000, 1]],
)

# Pairs of single-column shapes.
BINARY_OP_SERIES_DATA_SIZE = dict(
    big=[
        [[500_000, 1], [1_000_000, 1]],
        [[500_000, 1], [500_000, 1]],
    ],
    small=[
        [[5_000, 1], [10_000, 1]],
    ],
)

# Number of groups to use in groupby benchmarks; "huge_amount_groups" is a
# symbolic marker whose meaning is defined by whoever consumes this table.
DEFAULT_GROUPBY_NGROUPS = dict(
    big=[100, "huge_amount_groups"],
    small=[5],
)
# Group counts for the currently selected dataset size.
GROUPBY_NGROUPS = DEFAULT_GROUPBY_NGROUPS[ASV_DATASET_SIZE]

# (shapes, benchmark names) pairs: every benchmark listed in the second element
# receives the shapes from the first element as its default.
_DEFAULT_CONFIG_T = [
    (
        UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
        [
            # Pandas storage format benchmarks
            "TimeGroupByMultiColumn",
            "TimeGroupByDefaultAggregations",
            "TimeGroupByDictionaryAggregation",
            "TimeSetItem",
            "TimeInsert",
            "TimeArithmetic",
            "TimeSortValues",
            "TimeDrop",
            "TimeHead",
            "TimeTail",
            "TimeExplode",
            "TimeFillna",
            "TimeFillnaDataFrame",
            "TimeValueCountsFrame",
            "TimeValueCountsSeries",
            "TimeIndexing",
            "TimeMultiIndexing",
            "TimeResetIndex",
            "TimeAstype",
            "TimeDescribe",
            "TimeProperties",
            "TimeReindex",
            "TimeReindexMethod",
            "TimeFillnaMethodDataframe",
            "TimeDropDuplicatesDataframe",
            "TimeStack",
            "TimeUnstack",
            "TimeRepr",
            "TimeMaskBool",
            "TimeIsnull",
            "TimeDropna",
            "TimeEquals",
            # IO benchmarks
            "TimeReadCsvSkiprows",
            "TimeReadCsvTrueFalseValues",
            "TimeReadCsvNamesDtype",
            "TimeReadParquet",
            # Scalability benchmarks
            "TimeFromPandas",
            "TimeToPandas",
            "TimeToNumPy",
        ],
    ),
    (
        BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
        [
            # Pandas storage format benchmarks
            "TimeJoin",
            "TimeMerge",
            "TimeMergeDefault",
            "TimeConcat",
            "TimeAppend",
            "TimeBinaryOp",
            "TimeLevelAlign",
        ],
    ),
    (
        SERIES_DATA_SIZE[ASV_DATASET_SIZE],
        [
            # Pandas storage format benchmarks
            "TimeFillnaSeries",
            "TimeGroups",
            "TimeIndexingNumericSeries",
            "TimeFillnaMethodSeries",
            "TimeDatetimeAccessor",
            "TimeSetCategories",
            "TimeRemoveCategories",
            "TimeDropDuplicatesSeries",
        ],
    ),
    (
        BINARY_OP_SERIES_DATA_SIZE[ASV_DATASET_SIZE],
        [
            # Pandas storage format benchmarks
            "TimeBinaryOpSeries",
        ],
    ),
]

# Start with the benchmarks whose shapes are not taken from the shared tables.
DEFAULT_CONFIG = {
    "MergeCategoricals": (
        [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
    ),
    "TimeJoinStringIndex": (
        [[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]]
    ),
    "TimeReplace": (
        [[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
    ),
}

# Then give each listed benchmark the shapes of the group it belongs to; all
# names of one group share the very same shapes object.
for config in (_DEFAULT_CONFIG_T,):
    for _shape, _names in config:
        DEFAULT_CONFIG.update(dict.fromkeys(_names, _shape))

# Shrink the shapes of operations that ended with a timeout error on "big" data.
if ASV_DATASET_SIZE == "big":
    DEFAULT_CONFIG.update(
        {
            "TimeMergeDefault": [
                [[1000, 1000], [1000, 1000]],
                [[500_000, 20], [1_000_000, 10]],
            ],
            "TimeLevelAlign": [
                [[2500, 2500], [2500, 2500]],
                [[250_000, 20], [500_000, 10]],
            ],
            "TimeStack": [
                [1500, 1500],
                [100_000, 10],
            ],
        }
    )
    # Unstack reuses exactly the (reduced) shapes of stack.
    DEFAULT_CONFIG["TimeUnstack"] = DEFAULT_CONFIG["TimeStack"]
# Cache for the custom shapes config; `None` means "not loaded yet".
CONFIG_FROM_FILE = None


def get_benchmark_shapes(bench_id: str):
    """
    Get custom benchmark shapes from a json file stored in MODIN_ASV_DATASIZE_CONFIG.

    If `bench_id` benchmark is not found in the file (or no file is configured),
    then the default value from ``DEFAULT_CONFIG`` will be used.

    Parameters
    ----------
    bench_id : str
        Unique benchmark identifier that is used to get shapes.

    Returns
    -------
    list
        Benchmark shapes.

    Raises
    ------
    KeyError
        If `bench_id` is found neither in the custom config nor in ``DEFAULT_CONFIG``.
    """
    global CONFIG_FROM_FILE
    # Compare by identity rather than truthiness: an empty but successfully
    # loaded config (`{}`) must be cached as well, otherwise the file would be
    # re-opened and re-parsed on every single call.
    if CONFIG_FROM_FILE is None:
        try:
            from modin.config import AsvDataSizeConfig

            filename = AsvDataSizeConfig.get()
        except ImportError:
            # The config variable could not be imported from the installed Modin,
            # so fall back to reading the environment variable directly.
            filename = os.environ.get("MODIN_ASV_DATASIZE_CONFIG", None)
        if filename:
            # should be json
            with open(filename) as _f:
                CONFIG_FROM_FILE = json.load(_f)
    if CONFIG_FROM_FILE and bench_id in CONFIG_FROM_FILE:
        return CONFIG_FROM_FILE[bench_id]
    return DEFAULT_CONFIG[bench_id]
================================================
FILE: asv_bench/test/__init__.py
================================================
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the
gitextract_eudtie4f/ ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.yaml │ │ ├── feature_request.md │ │ └── question.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── actions/ │ │ ├── mamba-env/ │ │ │ └── action.yml │ │ ├── python-only/ │ │ │ └── action.yml │ │ ├── run-core-tests/ │ │ │ ├── action.yml │ │ │ ├── group_1/ │ │ │ │ └── action.yml │ │ │ ├── group_2/ │ │ │ │ └── action.yml │ │ │ ├── group_3/ │ │ │ │ └── action.yml │ │ │ └── group_4/ │ │ │ └── action.yml │ │ └── upload-coverage/ │ │ └── action.yml │ ├── dependabot.yaml │ ├── stale.yml │ └── workflows/ │ ├── ci-notebooks.yml │ ├── ci-required.yml │ ├── ci.yml │ ├── codeql/ │ │ └── codeql-config.yml │ ├── codeql.yml │ ├── fuzzydata-test.yml │ ├── publish-to-pypi.yml │ ├── push-to-main.yml │ └── sql_server/ │ └── set_up_sql_server.sh ├── .gitignore ├── .readthedocs.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── LICENSE_HEADER ├── MANIFEST.in ├── NOTICE ├── README.md ├── asv_bench/ │ ├── README.md │ ├── asv.conf.dask.json │ ├── asv.conf.json │ ├── asv.conf.unidist.json │ ├── benchmarks/ │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── csv.py │ │ │ └── parquet.py │ │ ├── scalability/ │ │ │ ├── __init__.py │ │ │ └── scalability_benchmarks.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── compatibility.py │ │ └── data_shapes.py │ └── test/ │ ├── __init__.py │ └── test_utils.py ├── ci/ │ └── teamcity/ │ ├── Dockerfile.teamcity-ci │ ├── build-docker.py │ └── comment_on_pr.py ├── codecov.yml ├── contributing/ │ ├── contributing.md │ └── pre-commit ├── docker/ │ └── Dockerfile ├── docs/ │ ├── _static/ │ │ └── custom.js │ ├── _templates/ │ │ └── layout.html │ ├── conf.py │ ├── contact.rst │ ├── development/ │ │ ├── architecture.rst │ │ ├── contributing.rst │ │ ├── index.rst │ │ ├── partition_api.rst │ │ ├── using_pandas_on_dask.rst │ │ ├── using_pandas_on_mpi.rst │ │ ├── using_pandas_on_python.rst │ │ └── using_pandas_on_ray.rst │ ├── 
ecosystem.rst │ ├── flow/ │ │ └── modin/ │ │ ├── config.rst │ │ ├── core/ │ │ │ ├── dataframe/ │ │ │ │ ├── algebra.rst │ │ │ │ ├── base/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ └── axis_partition.rst │ │ │ │ ├── index.rst │ │ │ │ └── pandas/ │ │ │ │ ├── dataframe.rst │ │ │ │ ├── index.rst │ │ │ │ ├── metadata/ │ │ │ │ │ ├── dtypes.rst │ │ │ │ │ └── index.rst │ │ │ │ └── partitioning/ │ │ │ │ ├── axis_partition.rst │ │ │ │ ├── partition.rst │ │ │ │ └── partition_manager.rst │ │ │ ├── execution/ │ │ │ │ ├── dask/ │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_dask/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── partition.rst │ │ │ │ │ ├── partition_manager.rst │ │ │ │ │ └── virtual_partition.rst │ │ │ │ ├── dispatching.rst │ │ │ │ ├── python/ │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_python/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── axis_partition.rst │ │ │ │ │ ├── partition.rst │ │ │ │ │ └── partition_manager.rst │ │ │ │ ├── ray/ │ │ │ │ │ ├── generic.rst │ │ │ │ │ └── implementations/ │ │ │ │ │ └── pandas_on_ray/ │ │ │ │ │ ├── dataframe.rst │ │ │ │ │ ├── index.rst │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── axis_partition.rst │ │ │ │ │ ├── partition.rst │ │ │ │ │ └── partition_manager.rst │ │ │ │ └── unidist/ │ │ │ │ ├── generic.rst │ │ │ │ └── implementations/ │ │ │ │ └── pandas_on_unidist/ │ │ │ │ ├── dataframe.rst │ │ │ │ ├── index.rst │ │ │ │ └── partitioning/ │ │ │ │ ├── axis_partition.rst │ │ │ │ ├── partition.rst │ │ │ │ └── partition_manager.rst │ │ │ ├── io/ │ │ │ │ └── index.rst │ │ │ └── storage_formats/ │ │ │ ├── base/ │ │ │ │ └── query_compiler.rst │ │ │ ├── index.rst │ │ │ └── pandas/ │ │ │ ├── index.rst │ │ │ ├── parsers.rst │ │ │ └── query_compiler.rst │ │ ├── distributed/ │ │ │ └── dataframe/ │ │ │ └── pandas.rst │ │ ├── experimental/ │ │ │ ├── batch.rst │ │ │ ├── core/ │ │ │ │ └── 
io/ │ │ │ │ └── index.rst │ │ │ ├── index.rst │ │ │ ├── pandas.rst │ │ │ ├── range_partitioning_groupby.rst │ │ │ ├── reshuffling_groupby.rst │ │ │ ├── sklearn.rst │ │ │ └── xgboost.rst │ │ ├── pandas/ │ │ │ ├── base.rst │ │ │ ├── dataframe.rst │ │ │ └── series.rst │ │ └── utils.rst │ ├── getting_started/ │ │ ├── examples.rst │ │ ├── faq.rst │ │ ├── installation.rst │ │ ├── quickstart.rst │ │ ├── troubleshooting.rst │ │ ├── using_modin/ │ │ │ ├── using_modin.rst │ │ │ ├── using_modin_cluster.rst │ │ │ └── using_modin_locally.rst │ │ └── why_modin/ │ │ ├── modin_vs_dask_vs_koalas.rst │ │ ├── out_of_core.rst │ │ ├── pandas.rst │ │ └── why_modin.rst │ ├── index.rst │ ├── release-procedure.md │ ├── release_notes/ │ │ ├── release_notes-0.14.0.rst │ │ ├── release_notes-0.15.0.rst │ │ ├── release_notes-0.16.0.rst │ │ └── release_notes-template.rst │ ├── requirements-doc.txt │ ├── supported_apis/ │ │ ├── dataframe_supported.rst │ │ ├── defaulting_to_pandas.rst │ │ ├── index.rst │ │ ├── io_supported.rst │ │ ├── older_pandas_compat.rst │ │ ├── series_supported.rst │ │ └── utilities_supported.rst │ └── usage_guide/ │ ├── advanced_usage/ │ │ ├── batch.rst │ │ ├── index.rst │ │ ├── modin_engines.rst │ │ ├── modin_logging.rst │ │ ├── modin_metrics.rst │ │ ├── modin_xgboost.rst │ │ ├── progress_bar.rst │ │ └── spreadsheets_api.rst │ ├── benchmarking.rst │ ├── examples/ │ │ └── index.rst │ ├── index.rst │ ├── integrations.rst │ └── optimization_notes/ │ ├── index.rst │ └── range_partitioning_ops.rst ├── environment-dev.yml ├── examples/ │ ├── data/ │ │ ├── boston_housing.csv │ │ ├── census_1k.csv │ │ ├── nyc-taxi_1k.csv │ │ ├── plasticc_test_set_1k.csv │ │ ├── plasticc_test_set_metadata_1k.csv │ │ ├── plasticc_training_set_1k.csv │ │ └── plasticc_training_set_metadata_1k.csv │ ├── docker/ │ │ └── modin-ray/ │ │ ├── Dockerfile │ │ ├── build-docker-image.sh │ │ ├── census.py │ │ ├── nyc-taxi.py │ │ ├── plasticc.py │ │ └── taxi.pstat │ ├── jupyter/ │ │ ├── Modin_Taxi.ipynb │ │ ├── 
Pandas_Taxi.ipynb │ │ └── integrations/ │ │ ├── NLTK.ipynb │ │ ├── altair.ipynb │ │ ├── bokeh.ipynb │ │ ├── huggingface.ipynb │ │ ├── matplotlib.ipynb │ │ ├── plotly.ipynb │ │ ├── seaborn.ipynb │ │ ├── sklearn.ipynb │ │ ├── statsmodels.ipynb │ │ ├── tensorflow.ipynb │ │ └── xgboost.ipynb │ ├── modin-scikit-learn-example.ipynb │ ├── quickstart.ipynb │ ├── spreadsheet/ │ │ ├── requirements.txt │ │ └── tutorial.ipynb │ └── tutorial/ │ ├── README.md │ └── jupyter/ │ ├── README.md │ └── execution/ │ ├── pandas_on_dask/ │ │ ├── Dockerfile │ │ ├── cluster/ │ │ │ └── exercise_5.ipynb │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── requirements.txt │ │ └── test/ │ │ └── test_notebooks.py │ ├── pandas_on_ray/ │ │ ├── Dockerfile │ │ ├── cluster/ │ │ │ ├── README.md │ │ │ ├── exercise_5.py │ │ │ └── modin-cluster.yaml │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── requirements.txt │ │ └── test/ │ │ └── test_notebooks.py │ ├── pandas_on_unidist/ │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── jupyter_unidist_env.yml │ │ ├── local/ │ │ │ ├── exercise_1.ipynb │ │ │ ├── exercise_2.ipynb │ │ │ ├── exercise_3.ipynb │ │ │ └── exercise_4.ipynb │ │ ├── setup_kernel.py │ │ └── test/ │ │ └── test_notebooks.py │ └── test/ │ └── utils.py ├── modin/ │ ├── __init__.py │ ├── __main__.py │ ├── _version.py │ ├── config/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── envvars.py │ │ └── pubsub.py │ ├── conftest.py │ ├── core/ │ │ ├── __init__.py │ │ ├── computation/ │ │ │ ├── __init__.py │ │ │ ├── align.py │ │ │ ├── check.py │ │ │ ├── common.py │ │ │ ├── engines.py │ │ │ ├── eval.py │ │ │ ├── expr.py │ │ │ ├── ops.py │ │ │ ├── parsing.py │ │ │ └── scope.py │ │ ├── dataframe/ │ │ │ ├── __init__.py │ │ │ ├── algebra/ │ │ │ │ ├── __init__.py │ │ │ │ ├── binary.py │ │ │ │ ├── default2pandas/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── 
binary.py │ │ │ │ │ ├── cat.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ ├── datetime.py │ │ │ │ │ ├── default.py │ │ │ │ │ ├── groupby.py │ │ │ │ │ ├── list.py │ │ │ │ │ ├── resample.py │ │ │ │ │ ├── rolling.py │ │ │ │ │ ├── series.py │ │ │ │ │ ├── str.py │ │ │ │ │ └── struct.py │ │ │ │ ├── fold.py │ │ │ │ ├── groupby.py │ │ │ │ ├── map.py │ │ │ │ ├── operator.py │ │ │ │ ├── reduce.py │ │ │ │ └── tree_reduce.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ └── utils.py │ │ │ │ ├── interchange/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe_protocol/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── dataframe.py │ │ │ │ │ └── utils.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ └── axis_partition.py │ │ │ └── pandas/ │ │ │ ├── __init__.py │ │ │ ├── dataframe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe.py │ │ │ │ └── utils.py │ │ │ ├── interchange/ │ │ │ │ ├── __init__.py │ │ │ │ └── dataframe_protocol/ │ │ │ │ ├── __init__.py │ │ │ │ ├── buffer.py │ │ │ │ ├── column.py │ │ │ │ ├── dataframe.py │ │ │ │ ├── exception.py │ │ │ │ └── from_dataframe.py │ │ │ ├── metadata/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dtypes.py │ │ │ │ └── index.py │ │ │ ├── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── axis_partition.py │ │ │ │ ├── partition.py │ │ │ │ └── partition_manager.py │ │ │ └── utils.py │ │ ├── execution/ │ │ │ ├── __init__.py │ │ │ ├── dask/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_dask/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── dispatching/ │ │ │ │ ├── 
__init__.py │ │ │ │ └── factories/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dispatcher.py │ │ │ │ └── factories.py │ │ │ ├── modin_aqp.py │ │ │ ├── python/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── engine_wrapper.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_python/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── ray/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── deferred_execution.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ ├── generic/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── io.py │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── partition_manager.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_ray/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ │ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ ├── unidist/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── engine_wrapper.py │ │ │ │ │ └── utils.py │ │ │ │ ├── generic/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── io.py │ │ │ │ │ └── partitioning/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── partition_manager.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_unidist/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataframe/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataframe.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── io.py │ │ 
│ │ └── partitioning/ │ │ │ │ ├── __init__.py │ │ │ │ ├── partition.py │ │ │ │ ├── partition_manager.py │ │ │ │ └── virtual_partition.py │ │ │ └── utils.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── column_stores/ │ │ │ │ ├── __init__.py │ │ │ │ ├── column_store_dispatcher.py │ │ │ │ ├── feather_dispatcher.py │ │ │ │ ├── hdf_dispatcher.py │ │ │ │ └── parquet_dispatcher.py │ │ │ ├── file_dispatcher.py │ │ │ ├── io.py │ │ │ ├── sql/ │ │ │ │ ├── __init__.py │ │ │ │ └── sql_dispatcher.py │ │ │ └── text/ │ │ │ ├── __init__.py │ │ │ ├── csv_dispatcher.py │ │ │ ├── excel_dispatcher.py │ │ │ ├── fwf_dispatcher.py │ │ │ ├── json_dispatcher.py │ │ │ ├── text_file_dispatcher.py │ │ │ └── utils.py │ │ └── storage_formats/ │ │ ├── __init__.py │ │ ├── base/ │ │ │ ├── __init__.py │ │ │ ├── doc_utils.py │ │ │ ├── query_compiler.py │ │ │ └── query_compiler_calculator.py │ │ └── pandas/ │ │ ├── __init__.py │ │ ├── aggregations.py │ │ ├── groupby.py │ │ ├── merge.py │ │ ├── native_query_compiler.py │ │ ├── parsers.py │ │ ├── query_compiler.py │ │ ├── query_compiler_caster.py │ │ └── utils.py │ ├── db_conn.py │ ├── distributed/ │ │ ├── __init__.py │ │ └── dataframe/ │ │ ├── __init__.py │ │ └── pandas/ │ │ ├── __init__.py │ │ └── partitions.py │ ├── error_message.py │ ├── experimental/ │ │ ├── __init__.py │ │ ├── batch/ │ │ │ ├── __init__.py │ │ │ └── pipeline.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── execution/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dask/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── implementations/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── pandas_on_dask/ │ │ │ │ │ └── __init__.py │ │ │ │ ├── ray/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── implementations/ │ │ │ │ │ └── __init__.py │ │ │ │ └── unidist/ │ │ │ │ ├── __init__.py │ │ │ │ └── implementations/ │ │ │ │ ├── __init__.py │ │ │ │ └── pandas_on_unidist/ │ │ │ │ └── __init__.py │ │ │ ├── io/ │ │ │ │ ├── __init__.py │ │ │ │ ├── glob/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── glob_dispatcher.py │ │ │ │ ├── sql/ │ │ │ │ │ 
├── __init__.py │ │ │ │ │ ├── sql_dispatcher.py │ │ │ │ │ └── utils.py │ │ │ │ └── text/ │ │ │ │ ├── __init__.py │ │ │ │ ├── csv_glob_dispatcher.py │ │ │ │ └── custom_text_dispatcher.py │ │ │ └── storage_formats/ │ │ │ ├── __init__.py │ │ │ └── pandas/ │ │ │ ├── __init__.py │ │ │ └── parsers.py │ │ ├── fuzzydata/ │ │ │ └── __init__.py │ │ ├── pandas/ │ │ │ ├── __init__.py │ │ │ └── io.py │ │ ├── sklearn/ │ │ │ ├── __init__.py │ │ │ └── model_selection/ │ │ │ ├── __init__.py │ │ │ └── train_test_split.py │ │ ├── spreadsheet/ │ │ │ ├── __init__.py │ │ │ └── general.py │ │ ├── torch/ │ │ │ ├── __init__.py │ │ │ └── datasets.py │ │ └── xgboost/ │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── xgboost.py │ │ └── xgboost_ray.py │ ├── logging/ │ │ ├── __init__.py │ │ ├── class_logger.py │ │ ├── config.py │ │ ├── logger_decorator.py │ │ └── metrics.py │ ├── numpy/ │ │ ├── __init__.py │ │ ├── arr.py │ │ ├── array_creation.py │ │ ├── array_shaping.py │ │ ├── constants.py │ │ ├── indexing.py │ │ ├── linalg.py │ │ ├── logic.py │ │ ├── math.py │ │ ├── trigonometry.py │ │ └── utils.py │ ├── pandas/ │ │ ├── __init__.py │ │ ├── accessor.py │ │ ├── api/ │ │ │ ├── __init__.py │ │ │ └── extensions/ │ │ │ ├── __init__.py │ │ │ └── extensions.py │ │ ├── arrays/ │ │ │ └── __init__.py │ │ ├── base.py │ │ ├── dataframe.py │ │ ├── errors/ │ │ │ └── __init__.py │ │ ├── general.py │ │ ├── groupby.py │ │ ├── indexing.py │ │ ├── io.py │ │ ├── iterator.py │ │ ├── plotting.py │ │ ├── resample.py │ │ ├── series.py │ │ ├── series_utils.py │ │ ├── testing/ │ │ │ └── __init__.py │ │ ├── utils.py │ │ └── window.py │ ├── polars/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dataframe.py │ │ ├── groupby.py │ │ ├── lazyframe.py │ │ └── series.py │ ├── tests/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── docs_module/ │ │ │ │ ├── __init__.py │ │ │ │ ├── classes.py │ │ │ │ └── functions.py │ │ │ ├── docs_module_with_just_base/ │ │ │ │ ├── __init__.py │ │ │ │ └── classes.py │ │ │ ├── 
test_envvars.py │ │ │ └── test_parameter.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── storage_formats/ │ │ │ │ ├── base/ │ │ │ │ │ └── test_internals.py │ │ │ │ ├── cudf/ │ │ │ │ │ ├── test_gpu_managers.py │ │ │ │ │ └── test_internals.py │ │ │ │ └── pandas/ │ │ │ │ └── test_internals.py │ │ │ └── test_dispatcher.py │ │ ├── experimental/ │ │ │ ├── __init__.py │ │ │ ├── spreadsheet/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_general.py │ │ │ ├── test_fuzzydata.py │ │ │ ├── test_io_exp.py │ │ │ ├── test_pipeline.py │ │ │ ├── torch/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_dataloader.py │ │ │ └── xgboost/ │ │ │ ├── __init__.py │ │ │ ├── test_default.py │ │ │ ├── test_dmatrix.py │ │ │ └── test_xgboost.py │ │ ├── interchange/ │ │ │ ├── __init__.py │ │ │ └── dataframe_protocol/ │ │ │ ├── __init__.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_sanity.py │ │ │ │ └── test_utils.py │ │ │ ├── pandas/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_protocol.py │ │ │ └── test_general.py │ │ ├── numpy/ │ │ │ ├── __init__.py │ │ │ ├── test_array.py │ │ │ ├── test_array_arithmetic.py │ │ │ ├── test_array_axis_functions.py │ │ │ ├── test_array_creation.py │ │ │ ├── test_array_indexing.py │ │ │ ├── test_array_linalg.py │ │ │ ├── test_array_logic.py │ │ │ ├── test_array_math.py │ │ │ ├── test_array_shaping.py │ │ │ └── utils.py │ │ ├── pandas/ │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── airline.sas7bdat │ │ │ │ ├── blah.csv │ │ │ │ ├── every_other_row_nan.xlsx │ │ │ │ ├── excel_sheetname_title.xlsx │ │ │ │ ├── hdfs.parquet/ │ │ │ │ │ ├── part-00000-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ │ ├── part-00001-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ │ └── part-00002-a7bff54c-2ff4-4654-9783-626542bd3a90-c000.snappy.parquet │ │ │ │ ├── issue5159.parquet/ │ │ │ │ │ └── part-0000.snappy.parquet/ │ │ │ │ │ ├── par=a/ │ │ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet │ │ │ │ │ 
└── par=b/ │ │ │ │ │ └── 44c5b23d806c4dc8a97d70c4fb2219f5-0.parquet │ │ │ │ ├── issue_1930.csv │ │ │ │ ├── issue_2074.csv │ │ │ │ ├── issue_2239.csv │ │ │ │ ├── issue_3119.csv │ │ │ │ ├── issue_4543.csv │ │ │ │ ├── issue_976.csv │ │ │ │ ├── modin_error_book.xlsx │ │ │ │ ├── multiple_csv/ │ │ │ │ │ ├── test_data0.csv │ │ │ │ │ └── test_data1.csv │ │ │ │ ├── newlines.csv │ │ │ │ ├── test_border_rows.xlsx │ │ │ │ ├── test_categories.csv │ │ │ │ ├── test_categories.json │ │ │ │ ├── test_data.feather │ │ │ │ ├── test_data.fwf │ │ │ │ ├── test_data.json │ │ │ │ ├── test_data.parquet │ │ │ │ ├── test_data_dir.parquet/ │ │ │ │ │ ├── part_0.parquet │ │ │ │ │ ├── part_1.parquet │ │ │ │ │ ├── part_10.parquet │ │ │ │ │ ├── part_11.parquet │ │ │ │ │ ├── part_12.parquet │ │ │ │ │ ├── part_13.parquet │ │ │ │ │ ├── part_14.parquet │ │ │ │ │ ├── part_15.parquet │ │ │ │ │ ├── part_2.parquet │ │ │ │ │ ├── part_3.parquet │ │ │ │ │ ├── part_4.parquet │ │ │ │ │ ├── part_5.parquet │ │ │ │ │ ├── part_6.parquet │ │ │ │ │ ├── part_7.parquet │ │ │ │ │ ├── part_8.parquet │ │ │ │ │ └── part_9.parquet │ │ │ │ ├── test_delim.csv │ │ │ │ ├── test_different_columns_in_rows.json │ │ │ │ ├── test_empty_rows.xlsx │ │ │ │ ├── test_emptyline.xlsx │ │ │ │ ├── test_null_col.csv │ │ │ │ ├── test_time_parsing.csv │ │ │ │ └── test_usecols.csv │ │ │ ├── dataframe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_binary.py │ │ │ │ ├── test_default.py │ │ │ │ ├── test_indexing.py │ │ │ │ ├── test_iter.py │ │ │ │ ├── test_join_sort.py │ │ │ │ ├── test_map_metadata.py │ │ │ │ ├── test_pickle.py │ │ │ │ ├── test_reduce.py │ │ │ │ ├── test_udf.py │ │ │ │ └── test_window.py │ │ │ ├── extensions/ │ │ │ │ ├── __init__.py │ │ │ │ ├── conftest.py │ │ │ │ ├── test_api_reexport.py │ │ │ │ ├── test_base_extensions.py │ │ │ │ ├── test_dataframe_extensions.py │ │ │ │ ├── test_groupby_extensions.py │ │ │ │ ├── test_pd_extensions.py │ │ │ │ └── test_series_extensions.py │ │ │ ├── integrations/ │ │ │ │ ├── __init__.py │ │ │ │ └── 
test_lazy_import.py │ │ │ ├── internals/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_benchmark_mode.py │ │ │ ├── native_df_interoperability/ │ │ │ │ ├── __init__.py │ │ │ │ ├── conftest.py │ │ │ │ ├── test_binary.py │ │ │ │ ├── test_compiler_caster.py │ │ │ │ ├── test_copy_on_write.py │ │ │ │ ├── test_default.py │ │ │ │ ├── test_default_to_pandas_without_warnings.py │ │ │ │ ├── test_general.py │ │ │ │ ├── test_indexing.py │ │ │ │ ├── test_iter.py │ │ │ │ ├── test_join_sort.py │ │ │ │ ├── test_map_metadata.py │ │ │ │ ├── test_pickle.py │ │ │ │ ├── test_window.py │ │ │ │ └── utils.py │ │ │ ├── test_api.py │ │ │ ├── test_backend.py │ │ │ ├── test_concat.py │ │ │ ├── test_expanding.py │ │ │ ├── test_general.py │ │ │ ├── test_groupby.py │ │ │ ├── test_io.py │ │ │ ├── test_repartition.py │ │ │ ├── test_reshape.py │ │ │ ├── test_rolling.py │ │ │ ├── test_series.py │ │ │ └── utils.py │ │ ├── polars/ │ │ │ └── test_dataframe.py │ │ ├── test_dataframe_api_standard.py │ │ ├── test_docstring_urls.py │ │ ├── test_envvar_catcher.py │ │ ├── test_envvar_npartitions.py │ │ ├── test_executions_api.py │ │ ├── test_headers.py │ │ ├── test_logging.py │ │ ├── test_metrics.py │ │ ├── test_partition_api.py │ │ └── test_utils.py │ └── utils.py ├── modin-autoimport-pandas.pth ├── mypy.ini ├── requirements/ │ ├── env_unidist_linux.yml │ ├── env_unidist_win.yml │ └── requirements-no-engine.yml ├── requirements-dev.txt ├── scripts/ │ ├── __init__.py │ ├── doc_checker.py │ ├── release.py │ └── test/ │ ├── __init__.py │ ├── examples.py │ └── test_doc_checker.py ├── setup.cfg ├── setup.py ├── stress_tests/ │ ├── kaggle/ │ │ ├── kaggle10.py │ │ ├── kaggle12.py │ │ ├── kaggle13.py │ │ ├── kaggle14.py │ │ ├── kaggle17.py │ │ ├── kaggle18.py │ │ ├── kaggle19.py │ │ ├── kaggle20.py │ │ ├── kaggle22.py │ │ ├── kaggle3.py │ │ ├── kaggle4.py │ │ ├── kaggle5.py │ │ ├── kaggle6.py │ │ ├── kaggle7.py │ │ ├── kaggle8.py │ │ └── kaggle9.py │ ├── run_stress_tests.sh │ └── test_kaggle_ipynb.py └── versioneer.py
Showing preview only (484K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (6064 symbols across 286 files)
FILE: asv_bench/benchmarks/benchmarks.py
class BaseTimeGroupBy (line 42) | class BaseTimeGroupBy:
method setup (line 43) | def setup(self, shape, ngroups=5, groupby_ncols=1):
class TimeGroupByMultiColumn (line 55) | class TimeGroupByMultiColumn(BaseTimeGroupBy):
method time_groupby_agg_quan (line 63) | def time_groupby_agg_quan(self, *args, **kwargs):
method time_groupby_agg_mean (line 66) | def time_groupby_agg_mean(self, *args, **kwargs):
class TimeGroupByDefaultAggregations (line 70) | class TimeGroupByDefaultAggregations(BaseTimeGroupBy):
method time_groupby_count (line 77) | def time_groupby_count(self, *args, **kwargs):
method time_groupby_size (line 80) | def time_groupby_size(self, *args, **kwargs):
method time_groupby_sum (line 83) | def time_groupby_sum(self, *args, **kwargs):
method time_groupby_mean (line 86) | def time_groupby_mean(self, *args, **kwargs):
class TimeGroupByDictionaryAggregation (line 90) | class TimeGroupByDictionaryAggregation(BaseTimeGroupBy):
method setup (line 102) | def setup(self, shape, ngroups, operation_type):
method time_groupby_dict_agg (line 110) | def time_groupby_dict_agg(self, *args, **kwargs):
class TimeJoin (line 114) | class TimeJoin:
method setup (line 122) | def setup(self, shapes, how, sort):
method time_join (line 126) | def time_join(self, shapes, how, sort):
class TimeJoinStringIndex (line 131) | class TimeJoinStringIndex:
method setup (line 138) | def setup(self, shapes, sort):
method time_join_dataframe_index_multi (line 167) | def time_join_dataframe_index_multi(self, shapes, sort):
method time_join_dataframe_index_single_key_bigger (line 170) | def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
method time_join_dataframe_index_single_key_small (line 173) | def time_join_dataframe_index_single_key_small(self, shapes, sort):
class TimeMergeDefault (line 177) | class TimeMergeDefault:
method setup (line 185) | def setup(self, shapes, how, sort):
method time_merge (line 189) | def time_merge(self, shapes, how, sort):
class TimeMerge (line 193) | class TimeMerge:
method setup (line 201) | def setup(self, shapes, how, sort):
method time_merge (line 205) | def time_merge(self, shapes, how, sort):
method time_merge_dataframe_empty_right (line 213) | def time_merge_dataframe_empty_right(self, shapes, how, sort):
method time_merge_dataframe_empty_left (line 218) | def time_merge_dataframe_empty_left(self, shapes, how, sort):
class TimeMergeCategoricals (line 224) | class TimeMergeCategoricals:
method setup (line 231) | def setup(self, shapes, data_type):
method time_merge_categoricals (line 255) | def time_merge_categoricals(self, shapes, data_type):
class TimeConcat (line 259) | class TimeConcat:
method setup (line 268) | def setup(self, shapes, how, axis, ignore_index):
method time_concat (line 272) | def time_concat(self, shapes, how, axis, ignore_index):
class TimeBinaryOp (line 280) | class TimeBinaryOp:
method setup (line 288) | def setup(self, shapes, binary_op, axis):
method time_binary_op (line 293) | def time_binary_op(self, shapes, binary_op, axis):
class TimeBinaryOpSeries (line 297) | class TimeBinaryOpSeries:
method setup (line 304) | def setup(self, shapes, binary_op):
method time_binary_op_series (line 313) | def time_binary_op_series(self, shapes, binary_op):
class BaseTimeSetItem (line 317) | class BaseTimeSetItem:
method get_loc (line 321) | def get_loc(df, loc, axis, item_length):
method setup (line 337) | def setup(self, shape, item_length, loc, is_equal_indices):
class TimeSetItem (line 349) | class TimeSetItem(BaseTimeSetItem):
method time_setitem_qc (line 357) | def time_setitem_qc(self, *args, **kwargs):
method time_setitem_raw (line 361) | def time_setitem_raw(self, *args, **kwargs):
class TimeInsert (line 366) | class TimeInsert(BaseTimeSetItem):
method time_insert_qc (line 374) | def time_insert_qc(self, *args, **kwargs):
method time_insert_raw (line 378) | def time_insert_raw(self, *args, **kwargs):
class TimeArithmetic (line 383) | class TimeArithmetic:
method setup (line 390) | def setup(self, shape, axis):
method time_sum (line 393) | def time_sum(self, shape, axis):
method time_count (line 396) | def time_count(self, shape, axis):
method time_median (line 399) | def time_median(self, shape, axis):
method time_nunique (line 402) | def time_nunique(self, shape, axis):
method time_apply (line 405) | def time_apply(self, shape, axis):
method time_mean (line 408) | def time_mean(self, shape, axis):
method time_mode (line 411) | def time_mode(self, shape, axis):
method time_add (line 414) | def time_add(self, shape, axis):
method time_mul (line 417) | def time_mul(self, shape, axis):
method time_mod (line 420) | def time_mod(self, shape, axis):
method time_abs (line 423) | def time_abs(self, shape, axis):
method time_aggregate (line 426) | def time_aggregate(self, shape, axis):
method time_is_in (line 429) | def time_is_in(self, shape, axis):
method time_transpose (line 432) | def time_transpose(self, shape, axis):
class TimeSortValues (line 436) | class TimeSortValues:
method setup (line 444) | def setup(self, shape, columns_number, ascending_list):
method time_sort_values (line 453) | def time_sort_values(self, shape, columns_number, ascending_list):
class TimeDrop (line 457) | class TimeDrop:
method setup (line 465) | def setup(self, shape, axis, drop_ncols):
method time_drop (line 474) | def time_drop(self, shape, axis, drop_ncols):
class TimeHead (line 478) | class TimeHead:
method setup (line 485) | def setup(self, shape, head_count):
method time_head (line 493) | def time_head(self, shape, head_count):
class TimeTail (line 497) | class TimeTail:
method setup (line 504) | def setup(self, shape, tail_count):
method time_tail (line 512) | def time_tail(self, shape, tail_count):
class TimeExplode (line 516) | class TimeExplode:
method setup (line 522) | def setup(self, shape):
method time_explode (line 527) | def time_explode(self, shape):
class TimeFillnaSeries (line 531) | class TimeFillnaSeries:
method setup (line 539) | def setup(self, value_type, shape, limit):
method time_fillna (line 555) | def time_fillna(self, value_type, shape, limit):
method time_fillna_inplace (line 558) | def time_fillna_inplace(self, value_type, shape, limit):
class TimeFillnaDataFrame (line 563) | class TimeFillnaDataFrame:
method setup (line 571) | def setup(self, value_type, shape, limit):
method time_fillna (line 597) | def time_fillna(self, value_type, shape, limit):
method time_fillna_inplace (line 600) | def time_fillna_inplace(self, value_type, shape, limit):
class BaseTimeValueCounts (line 605) | class BaseTimeValueCounts:
method setup (line 606) | def setup(self, shape, ngroups=5, subset=1):
class TimeValueCountsFrame (line 618) | class TimeValueCountsFrame(BaseTimeValueCounts):
method time_value_counts (line 626) | def time_value_counts(self, *args, **kwargs):
class TimeValueCountsSeries (line 630) | class TimeValueCountsSeries(BaseTimeValueCounts):
method setup (line 638) | def setup(self, shape, ngroups, bins):
method time_value_counts (line 642) | def time_value_counts(self, shape, ngroups, bins):
class TimeIndexing (line 646) | class TimeIndexing:
method setup (line 675) | def setup(self, shape, indexer_type):
method time_iloc (line 684) | def time_iloc(self, shape, indexer_type):
method time_loc (line 692) | def time_loc(self, shape, indexer_type):
class TimeIndexingColumns (line 696) | class TimeIndexingColumns:
method setup (line 700) | def setup(self, shape):
method time_iloc (line 705) | def time_iloc(self, shape):
method time_loc (line 708) | def time_loc(self, shape):
method time___getitem__ (line 711) | def time___getitem__(self, shape):
class TimeMultiIndexing (line 715) | class TimeMultiIndexing:
method setup (line 719) | def setup(self, shape):
method time_multiindex_loc (line 734) | def time_multiindex_loc(self, shape):
class TimeResetIndex (line 743) | class TimeResetIndex:
method setup (line 751) | def setup(self, shape, drop, level):
method time_reset_index (line 761) | def time_reset_index(self, shape, drop, level):
class TimeAstype (line 765) | class TimeAstype:
method setup (line 773) | def setup(self, shape, dtype, astype_ncolumns):
method time_astype (line 782) | def time_astype(self, shape, dtype, astype_ncolumns):
class TimeDescribe (line 786) | class TimeDescribe:
method setup (line 792) | def setup(self, shape):
method time_describe (line 795) | def time_describe(self, shape):
class TimeProperties (line 799) | class TimeProperties:
method setup (line 805) | def setup(self, shape):
method time_shape (line 808) | def time_shape(self, shape):
method time_columns (line 811) | def time_columns(self, shape):
method time_index (line 814) | def time_index(self, shape):
class TimeIndexingNumericSeries (line 818) | class TimeIndexingNumericSeries:
method setup (line 826) | def setup(self, shape, dtype, index_structure):
method time_getitem_scalar (line 841) | def time_getitem_scalar(self, shape, index, index_structure):
method time_getitem_slice (line 845) | def time_getitem_slice(self, shape, index, index_structure):
method time_getitem_list_like (line 848) | def time_getitem_list_like(self, shape, index, index_structure):
method time_getitem_array (line 851) | def time_getitem_array(self, shape, index, index_structure):
method time_getitem_lists (line 854) | def time_getitem_lists(self, shape, index, index_structure):
method time_iloc_array (line 857) | def time_iloc_array(self, shape, index, index_structure):
method time_iloc_list_like (line 860) | def time_iloc_list_like(self, shape, index, index_structure):
method time_iloc_scalar (line 863) | def time_iloc_scalar(self, shape, index, index_structure):
method time_iloc_slice (line 867) | def time_iloc_slice(self, shape, index, index_structure):
method time_loc_array (line 870) | def time_loc_array(self, shape, index, index_structure):
method time_loc_list_like (line 873) | def time_loc_list_like(self, shape, index, index_structure):
method time_loc_scalar (line 876) | def time_loc_scalar(self, shape, index, index_structure):
method time_loc_slice (line 879) | def time_loc_slice(self, shape, index, index_structure):
class TimeReindex (line 883) | class TimeReindex:
method setup (line 887) | def setup(self, shape):
method time_reindex_dates (line 917) | def time_reindex_dates(self, shape):
method time_reindex_columns (line 920) | def time_reindex_columns(self, shape):
method time_reindex_multiindex_with_cache (line 923) | def time_reindex_multiindex_with_cache(self, shape):
method time_reindex_multiindex_no_cache (line 927) | def time_reindex_multiindex_no_cache(self, shape):
method time_reindex_multiindex_no_cache_dates (line 931) | def time_reindex_multiindex_no_cache_dates(self, shape):
class TimeReindexMethod (line 936) | class TimeReindexMethod:
method setup (line 944) | def setup(self, shape, method, constructor):
method time_reindex_method (line 950) | def time_reindex_method(self, shape, method, constructor):
class TimeFillnaMethodSeries (line 954) | class TimeFillnaMethodSeries:
method setup (line 958) | def setup(self, shape, method):
method time_reindexed (line 966) | def time_reindexed(self, shape, method):
method time_float_32 (line 969) | def time_float_32(self, shape, method):
class TimeFillnaMethodDataframe (line 973) | class TimeFillnaMethodDataframe:
method setup (line 977) | def setup(self, shape, method):
method time_reindexed (line 984) | def time_reindexed(self, shape, method):
method time_float_32 (line 987) | def time_float_32(self, shape, method):
class TimeLevelAlign (line 991) | class TimeLevelAlign:
method setup (line 995) | def setup(self, shapes):
method time_align_level (line 1014) | def time_align_level(self, shapes):
method time_reindex_level (line 1018) | def time_reindex_level(self, shapes):
class TimeDropDuplicatesDataframe (line 1024) | class TimeDropDuplicatesDataframe:
method setup (line 1028) | def setup(self, shape):
method time_drop_dups (line 1042) | def time_drop_dups(self, shape):
method time_drop_dups_inplace (line 1045) | def time_drop_dups_inplace(self, shape):
class TimeDropDuplicatesSeries (line 1050) | class TimeDropDuplicatesSeries:
method setup (line 1054) | def setup(self, shape):
method time_drop_dups (line 1064) | def time_drop_dups(self, shape):
method time_drop_dups_string (line 1067) | def time_drop_dups_string(self, shape):
class TimeDatetimeAccessor (line 1072) | class TimeDatetimeAccessor:
method setup (line 1076) | def setup(self, shape):
method time_dt_accessor (line 1082) | def time_dt_accessor(self, shape):
method time_timedelta_days (line 1085) | def time_timedelta_days(self, shape):
method time_timedelta_seconds (line 1088) | def time_timedelta_seconds(self, shape):
class BaseCategories (line 1092) | class BaseCategories:
method setup (line 1093) | def setup(self, shape):
class TimeSetCategories (line 1100) | class TimeSetCategories(BaseCategories):
method time_set_categories (line 1104) | def time_set_categories(self, shape):
class TimeRemoveCategories (line 1108) | class TimeRemoveCategories(BaseCategories):
method time_remove_categories (line 1112) | def time_remove_categories(self, shape):
class BaseReshape (line 1116) | class BaseReshape:
method setup (line 1117) | def setup(self, shape):
class TimeStack (line 1129) | class TimeStack(BaseReshape):
method setup (line 1133) | def setup(self, shape):
method time_stack (line 1138) | def time_stack(self, shape):
class TimeUnstack (line 1142) | class TimeUnstack(BaseReshape):
method time_unstack (line 1146) | def time_unstack(self, shape):
class TimeReplace (line 1150) | class TimeReplace:
method setup (line 1154) | def setup(self, shape):
method time_replace (line 1160) | def time_replace(self, shape):
class TimeGroups (line 1164) | class TimeGroups:
method setup (line 1168) | def setup(self, shape):
method time_series_groups (line 1173) | def time_series_groups(self, shape):
method time_series_indices (line 1177) | def time_series_indices(self, shape):
class TimeRepr (line 1181) | class TimeRepr:
method setup (line 1185) | def setup(self, shape):
method time_repr (line 1190) | def time_repr(self, shape):
class TimeMaskBool (line 1194) | class TimeMaskBool:
method setup (line 1198) | def setup(self, shape):
method time_frame_mask (line 1203) | def time_frame_mask(self, shape):
class TimeIsnull (line 1207) | class TimeIsnull:
method setup (line 1211) | def setup(self, shape):
method time_isnull (line 1217) | def time_isnull(self, shape):
class TimeDropna (line 1221) | class TimeDropna:
method setup (line 1225) | def setup(self, how, axis, shape):
method time_dropna (line 1232) | def time_dropna(self, how, axis, shape):
class TimeEquals (line 1236) | class TimeEquals:
method setup (line 1240) | def setup(self, shape):
method time_frame_float_equal (line 1246) | def time_frame_float_equal(self, shape):
FILE: asv_bench/benchmarks/io/csv.py
class BaseReadCsv (line 29) | class BaseReadCsv:
method setup_cache (line 31) | def setup_cache(self, test_filename="io_test_file"):
method setup (line 37) | def setup(self, test_filenames, shape, *args, **kwargs):
class TimeReadCsvSkiprows (line 44) | class TimeReadCsvSkiprows(BaseReadCsv):
method setup (line 59) | def setup(self, test_filenames, shape, skiprows):
method time_skiprows (line 63) | def time_skiprows(self, test_filenames, shape, skiprows):
class TimeReadCsvTrueFalseValues (line 67) | class TimeReadCsvTrueFalseValues(BaseReadCsv):
method time_true_false_values (line 73) | def time_true_false_values(self, test_filenames, shape):
class TimeReadCsvNamesDtype (line 83) | class TimeReadCsvNamesDtype:
method _get_file_id (line 95) | def _get_file_id(self, shape, dtype):
method _add_timestamp_columns (line 98) | def _add_timestamp_columns(self, df):
method setup_cache (line 105) | def setup_cache(self, test_filename="io_test_file_csv_names_dtype"):
method setup (line 125) | def setup(self, cache, shape, names, dtype):
method time_read_csv_names_dtype (line 140) | def time_read_csv_names_dtype(self, cache, shape, names, dtype):
FILE: asv_bench/benchmarks/io/parquet.py
class TimeReadParquet (line 24) | class TimeReadParquet:
method setup_cache (line 34) | def setup_cache(self, test_filename="io_test_file"):
method setup (line 40) | def setup(self, test_filenames, shape):
method time_read_parquet (line 46) | def time_read_parquet(self, test_filenames, shape):
FILE: asv_bench/benchmarks/scalability/scalability_benchmarks.py
class TimeFromPandas (line 44) | class TimeFromPandas:
method setup (line 51) | def setup(self, shape, cpus):
method time_from_pandas (line 59) | def time_from_pandas(self, shape, cpus):
class TimeToPandas (line 63) | class TimeToPandas:
method setup (line 70) | def setup(self, shape, cpus):
method time_to_pandas (line 76) | def time_to_pandas(self, shape, cpus):
class TimeToNumPy (line 81) | class TimeToNumPy:
method setup (line 88) | def setup(self, shape, cpus):
method time_to_numpy (line 94) | def time_to_numpy(self, shape, cpus):
FILE: asv_bench/benchmarks/utils/common.py
function translator_groupby_ngroups (line 41) | def translator_groupby_ngroups(groupby_ngroups: Union[str, int], shape: ...
class weakdict (line 64) | class weakdict(dict): # noqa: GL08
function gen_nan_data (line 72) | def gen_nan_data(nrows: int, ncols: int) -> dict:
function gen_int_data (line 110) | def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) ...
function gen_str_int_data (line 137) | def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: i...
function gen_true_false_int_data (line 165) | def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
function gen_data (line 199) | def gen_data(
function generate_dataframe (line 263) | def generate_dataframe(
function random_string (line 370) | def random_string() -> str:
function random_columns (line 381) | def random_columns(df_columns: list, columns_number: int) -> list:
function random_booleans (line 399) | def random_booleans(number: int) -> list:
function execute (line 415) | def execute(df: Union[modin.pandas.DataFrame, pandas.DataFrame]):
function get_shape_id (line 453) | def get_shape_id(shape: tuple) -> str:
function prepare_io_data (line 469) | def prepare_io_data(test_filename: str, data_type: str, shapes: list):
function prepare_io_data_parquet (line 498) | def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: ...
function setup (line 527) | def setup(*args, **kwargs): # noqa: GL08
FILE: asv_bench/benchmarks/utils/data_shapes.py
function get_benchmark_shapes (line 178) | def get_benchmark_shapes(bench_id: str):
FILE: asv_bench/test/test_utils.py
function test_get_benchmark_shapes (line 46) | def test_get_benchmark_shapes(asv_config_content, result):
function test_get_benchmark_shapes_default (line 63) | def test_get_benchmark_shapes_default(asv_config_content, result):
function test_execute (line 69) | def test_execute():
FILE: ci/teamcity/build-docker.py
function execute_command (line 5) | def execute_command(cmd):
FILE: docs/conf.py
function noop_decorator (line 19) | def noop_decorator(*args, **kwargs):
FILE: examples/docker/modin-ray/census.py
function read (line 28) | def read(filename):
function etl (line 135) | def etl(df):
function mse (line 181) | def mse(y_test, y_pred):
function cod (line 185) | def cod(y_test, y_pred):
function ml (line 192) | def ml(X, y, random_state, n_runs, test_size):
function measure (line 232) | def measure(name, func, *args, **kw):
function main (line 240) | def main():
FILE: examples/docker/modin-ray/nyc-taxi.py
function read (line 20) | def read(filename):
function q1 (line 80) | def q1(df):
function q2 (line 84) | def q2(df):
function q3 (line 90) | def q3(df):
function q4 (line 102) | def q4(df):
function measure (line 119) | def measure(name, func, *args, **kw):
function main (line 127) | def main():
FILE: examples/docker/modin-ray/plasticc.py
function create_dtypes (line 30) | def create_dtypes():
function ravel_column_names (line 64) | def ravel_column_names(cols):
function measure (line 70) | def measure(name, func, *args, **kw):
function all_etl (line 78) | def all_etl(train, train_meta, test, test_meta):
function split_step (line 84) | def split_step(train_final, test_final):
function multi_weighted_logloss (line 105) | def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
function xgb_multi_weighted_logloss (line 124) | def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weigh...
function read (line 134) | def read(
function etl (line 159) | def etl(df, df_meta):
function ml (line 198) | def ml(train_final, test_final):
function main (line 242) | def main():
FILE: examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py
function test_exercise_1 (line 35) | def test_exercise_1():
function test_exercise_2 (line 49) | def test_exercise_2():
function test_exercise_3 (line 70) | def test_exercise_3():
function test_exercise_4 (line 103) | def test_exercise_4():
FILE: examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py
function test_exercise_1 (line 36) | def test_exercise_1():
function test_exercise_2 (line 50) | def test_exercise_2():
function test_exercise_3 (line 74) | def test_exercise_3():
function test_exercise_4 (line 107) | def test_exercise_4():
FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/setup_kernel.py
function custom_make_ipkernel_cmd (line 21) | def custom_make_ipkernel_cmd(*args, **kwargs):
FILE: examples/tutorial/jupyter/execution/pandas_on_unidist/test/test_notebooks.py
function test_exercise_1 (line 41) | def test_exercise_1():
function test_exercise_2 (line 55) | def test_exercise_2():
function test_exercise_3 (line 76) | def test_exercise_3():
function test_exercise_4 (line 109) | def test_exercise_4():
FILE: examples/tutorial/jupyter/execution/test/utils.py
function set_kernel (line 30) | def set_kernel(kernel_name):
function make_execute_preprocessor (line 43) | def make_execute_preprocessor():
function _execute_notebook (line 60) | def _execute_notebook(notebook):
function _find_code_cell_idx (line 74) | def _find_code_cell_idx(nb, identifier):
function _replace_str (line 104) | def _replace_str(nb, original_str, str_to_replace):
FILE: modin/__init__.py
function custom_formatwarning (line 20) | def custom_formatwarning(
function set_execution (line 37) | def set_execution(engine: Any = None, storage_format: Any = None) -> Tup...
FILE: modin/__main__.py
function main (line 19) | def main() -> None:
FILE: modin/_version.py
function get_keywords (line 22) | def get_keywords() -> Dict[str, str]:
class VersioneerConfig (line 35) | class VersioneerConfig:
function get_config (line 46) | def get_config() -> VersioneerConfig:
class NotThisMethod (line 60) | class NotThisMethod(Exception):
function register_vcs_handler (line 68) | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
function run_command (line 81) | def run_command(
function versions_from_parentdir (line 133) | def versions_from_parentdir(
function git_get_keywords (line 168) | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
function git_versions_from_keywords (line 196) | def git_versions_from_keywords(
function git_pieces_from_vcs (line 271) | def git_pieces_from_vcs(
function plus_or_dot (line 413) | def plus_or_dot(pieces: Dict[str, Any]) -> str:
function render_pep440 (line 420) | def render_pep440(pieces: Dict[str, Any]) -> str:
function render_pep440_branch (line 444) | def render_pep440_branch(pieces: Dict[str, Any]) -> str:
function pep440_split_post (line 473) | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
function render_pep440_pre (line 483) | def render_pep440_pre(pieces: Dict[str, Any]) -> str:
function render_pep440_post (line 507) | def render_pep440_post(pieces: Dict[str, Any]) -> str:
function render_pep440_post_branch (line 534) | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
function render_pep440_old (line 563) | def render_pep440_old(pieces: Dict[str, Any]) -> str:
function render_git_describe (line 585) | def render_git_describe(pieces: Dict[str, Any]) -> str:
function render_git_describe_long (line 605) | def render_git_describe_long(pieces: Dict[str, Any]) -> str:
function render (line 625) | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
function get_versions (line 667) | def get_versions() -> Dict[str, Any]:
FILE: modin/config/__main__.py
function print_config_help (line 30) | def print_config_help() -> None:
function export_config_help (line 42) | def export_config_help(filename: str) -> None:
FILE: modin/config/envvars.py
class EnvironmentVariable (line 38) | class EnvironmentVariable(Parameter, type=str, abstract=True):
method _get_value_from_config (line 44) | def _get_value_from_config(cls) -> Any:
method get_help (line 65) | def get_help(cls) -> str:
class EnvWithSibilings (line 79) | class EnvWithSibilings(
method _sibling (line 90) | def _sibling(cls) -> type["EnvWithSibilings"]:
method get (line 95) | def get(cls) -> Any:
method put (line 148) | def put(cls, value: Any) -> None:
class EnvironmentVariableDisallowingExecutionAndBackendBothSet (line 169) | class EnvironmentVariableDisallowingExecutionAndBackendBothSet(
method _get_value_from_config (line 178) | def _get_value_from_config(cls) -> str:
class IsDebug (line 189) | class IsDebug(EnvironmentVariable, type=bool):
class Engine (line 195) | class Engine(
method _get_default (line 212) | def _get_default(cls) -> str:
method add_option (line 273) | def add_option(cls, choice: Any) -> Any:
method put (line 280) | def put(cls, value: str) -> None:
method get (line 298) | def get(cls) -> str:
class StorageFormat (line 335) | class StorageFormat(EnvironmentVariableDisallowingExecutionAndBackendBot...
method put (line 339) | def put(cls, value: str) -> None:
method get (line 357) | def get(cls) -> str:
class Backend (line 401) | class Backend(EnvironmentVariableDisallowingExecutionAndBackendBothSet, ...
method put (line 423) | def put(cls, value: str) -> None:
method _get_default (line 436) | def _get_default(cls) -> str:
method register_backend (line 450) | def register_backend(cls: type["Backend"], name: str, execution: Execu...
method add_option (line 475) | def add_option(cls, choice: str) -> NoReturn:
method set_active_backends (line 494) | def set_active_backends(cls, new_choices: tuple) -> None:
method activate (line 520) | def activate(cls, backend: str) -> None:
method get_active_backends (line 541) | def get_active_backends(cls) -> tuple[str, ...]:
method get_backend_for_execution (line 553) | def get_backend_for_execution(cls, execution: Execution) -> str:
method get_execution_for_backend (line 575) | def get_execution_for_backend(cls, backend: str) -> Execution:
method get (line 613) | def get(cls) -> str:
class AutoSwitchBackend (line 654) | class AutoSwitchBackend(EnvironmentVariable, type=bool):
method enable (line 667) | def enable(cls) -> None:
method disable (line 672) | def disable(cls) -> None:
class ShowBackendSwitchProgress (line 677) | class ShowBackendSwitchProgress(EnvironmentVariable, type=bool):
method enable (line 689) | def enable(cls) -> None:
method disable (line 694) | def disable(cls) -> None:
class IsExperimental (line 699) | class IsExperimental(EnvironmentVariable, type=bool):
class IsRayCluster (line 705) | class IsRayCluster(EnvironmentVariable, type=bool):
class RayRedisAddress (line 711) | class RayRedisAddress(EnvironmentVariable, type=ExactStr):
class RayRedisPassword (line 717) | class RayRedisPassword(EnvironmentVariable, type=ExactStr):
class RayInitCustomResources (line 724) | class RayInitCustomResources(EnvironmentVariable, type=dict):
class RayTaskCustomResources (line 741) | class RayTaskCustomResources(EnvironmentVariable, type=dict):
class CpuCount (line 767) | class CpuCount(EnvironmentVariable, type=int):
method _put (line 773) | def _put(cls, value: int) -> None:
method _get_default (line 791) | def _get_default(cls) -> int:
method get (line 804) | def get(cls) -> int:
class GpuCount (line 818) | class GpuCount(EnvironmentVariable, type=int):
class Memory (line 824) | class Memory(EnvironmentVariable, type=int):
class NPartitions (line 837) | class NPartitions(EnvironmentVariable, type=int):
method _put (line 843) | def _put(cls, value: int) -> None:
method _get_default (line 861) | def _get_default(cls) -> int:
method get (line 872) | def get(cls) -> int:
class TestDatasetSize (line 886) | class TestDatasetSize(EnvironmentVariable, type=str):
class TrackFileLeaks (line 893) | class TrackFileLeaks(EnvironmentVariable, type=bool):
class AsvImplementation (line 903) | class AsvImplementation(EnvironmentVariable, type=ExactStr):
class AsvDataSizeConfig (line 912) | class AsvDataSizeConfig(EnvironmentVariable, type=ExactStr):
class ProgressBar (line 919) | class ProgressBar(EnvironmentVariable, type=bool):
method enable (line 926) | def enable(cls) -> None:
method disable (line 931) | def disable(cls) -> None:
method put (line 936) | def put(cls, value: bool) -> None:
class BenchmarkMode (line 950) | class BenchmarkMode(EnvironmentVariable, type=bool):
method put (line 957) | def put(cls, value: bool) -> None:
class LogMode (line 971) | class LogMode(EnvironmentVariable, type=ExactStr):
method enable (line 979) | def enable(cls) -> None:
method disable (line 984) | def disable(cls) -> None:
class LogMemoryInterval (line 989) | class LogMemoryInterval(EnvironmentVariable, type=int):
method put (line 996) | def put(cls, value: int) -> None:
method get (line 1010) | def get(cls) -> int:
class LogFileSize (line 1026) | class LogFileSize(EnvironmentVariable, type=int):
method put (line 1033) | def put(cls, value: int) -> None:
method get (line 1047) | def get(cls) -> int:
class MetricsMode (line 1063) | class MetricsMode(EnvironmentVariable, type=ExactStr):
method enable (line 1078) | def enable(cls) -> None:
method disable (line 1083) | def disable(cls) -> None:
class PersistentPickle (line 1088) | class PersistentPickle(EnvironmentVariable, type=bool):
class MinPartitionSize (line 1099) | class MinPartitionSize(EnvironmentVariable, type=int):
method put (line 1111) | def put(cls, value: int) -> None:
method get (line 1125) | def get(cls) -> int:
class MinRowPartitionSize (line 1149) | class MinRowPartitionSize(EnvironmentVariable, type=int):
method put (line 1161) | def put(cls, value: int) -> None:
method get (line 1177) | def get(cls) -> int:
class MinColumnPartitionSize (line 1193) | class MinColumnPartitionSize(EnvironmentVariable, type=int):
method put (line 1205) | def put(cls, value: int) -> None:
method get (line 1221) | def get(cls) -> int:
class TestReadFromSqlServer (line 1237) | class TestReadFromSqlServer(EnvironmentVariable, type=bool):
class TestReadFromPostgres (line 1244) | class TestReadFromPostgres(EnvironmentVariable, type=bool):
class GithubCI (line 1251) | class GithubCI(EnvironmentVariable, type=bool):
class ModinNumpy (line 1258) | class ModinNumpy(EnvironmentVariable, type=bool):
class RangePartitioning (line 1265) | class RangePartitioning(EnvironmentVariable, type=bool):
class CIAWSSecretAccessKey (line 1277) | class CIAWSSecretAccessKey(EnvironmentVariable, type=str):
class CIAWSAccessKeyID (line 1284) | class CIAWSAccessKeyID(EnvironmentVariable, type=str):
class AsyncReadMode (line 1291) | class AsyncReadMode(EnvironmentVariable, type=bool):
class ReadSqlEngine (line 1315) | class ReadSqlEngine(EnvironmentVariable, type=str):
class LazyExecution (line 1323) | class LazyExecution(EnvironmentVariable, type=str):
class DocModule (line 1338) | class DocModule(EnvironmentVariable, type=ExactStr):
class DaskThreadsPerWorker (line 1350) | class DaskThreadsPerWorker(EnvironmentVariable, type=int):
class NativePandasMaxRows (line 1357) | class NativePandasMaxRows(EnvironmentVariable, type=int):
class NativePandasTransferThreshold (line 1364) | class NativePandasTransferThreshold(EnvironmentVariable, type=int):
class NativePandasDeepCopy (line 1376) | class NativePandasDeepCopy(EnvironmentVariable, type=bool):
method enable (line 1405) | def enable(cls) -> None:
method disable (line 1410) | def disable(cls) -> None:
class BackendMergeCastInPlace (line 1415) | class BackendMergeCastInPlace(EnvironmentVariable, type=bool):
method enable (line 1429) | def enable(cls) -> None:
method disable (line 1434) | def disable(cls) -> None:
class BackendJoinConsiderAllBackends (line 1439) | class BackendJoinConsiderAllBackends(EnvironmentVariable, type=bool):
method enable (line 1453) | def enable(cls) -> None:
method disable (line 1458) | def disable(cls) -> None:
class DynamicPartitioning (line 1463) | class DynamicPartitioning(EnvironmentVariable, type=bool):
function _check_vars (line 1475) | def _check_vars() -> None:
FILE: modin/config/pubsub.py
class DeprecationDescriptor (line 36) | class DeprecationDescriptor:
method __init__ (line 54) | def __init__(
method deprecation_message (line 64) | def deprecation_message(self, use_envvar_names: bool = False) -> str:
class TypeDescriptor (line 94) | class TypeDescriptor(NamedTuple):
class ExactStr (line 118) | class ExactStr(str):
class ValueSource (line 184) | class ValueSource(IntEnum): # noqa: PR01
class Parameter (line 195) | class Parameter(object):
method _warn_if_deprecated (line 227) | def _warn_if_deprecated(cls) -> None:
method _get_value_from_config (line 235) | def _get_value_from_config(cls) -> Any:
method get_help (line 252) | def get_help(cls) -> str:
method __init_subclass__ (line 266) | def __init_subclass__(cls, type: Any, abstract: bool = False, **kw: di...
method subscribe (line 288) | def subscribe(cls, callback: Callable) -> None:
method _get_default (line 301) | def _get_default(cls) -> Any:
method get_value_source (line 312) | def get_value_source(cls) -> ValueSource:
method get (line 329) | def get(cls) -> Any:
method put (line 351) | def put(cls, value: Any) -> None:
method normalize (line 365) | def normalize(cls, value: Any) -> Any:
method once (line 382) | def once(cls, onvalue: Any, callback: Callable) -> None:
method _put_nocallback (line 403) | def _put_nocallback(cls, value: Any) -> Any:
method _check_callbacks (line 424) | def _check_callbacks(cls, oldvalue: Any) -> None:
method add_option (line 441) | def add_option(cls, choice: Any) -> Any:
function context (line 466) | def context(**config: dict[str, Any]) -> Iterator[None]:
FILE: modin/conftest.py
function _saving_make_api_url (line 51) | def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url):
function pytest_addoption (line 94) | def pytest_addoption(parser):
function set_experimental_env (line 103) | def set_experimental_env(mode):
function enforce_config (line 108) | def enforce_config():
class TestQC (line 176) | class TestQC(BaseQueryCompiler):
method __init__ (line 177) | def __init__(self, modin_frame):
method finalize (line 185) | def finalize(self):
method execute (line 188) | def execute(self):
method from_pandas (line 193) | def from_pandas(cls, df, data_cls):
method from_arrow (line 197) | def from_arrow(cls, at, data_cls):
method free (line 200) | def free(self):
method to_interchange_dataframe (line 203) | def to_interchange_dataframe(
method from_interchange_dataframe (line 211) | def from_interchange_dataframe(cls, df, data_cls):
class BaseOnPythonIO (line 220) | class BaseOnPythonIO(PandasOnPythonIO):
class BaseOnPythonFactory (line 224) | class BaseOnPythonFactory(factories.BaseFactory):
method prepare (line 226) | def prepare(cls):
function set_base_execution (line 230) | def set_base_execution(name=BASE_EXECUTION_NAME):
function get_unique_base_execution (line 243) | def get_unique_base_execution():
function pytest_configure (line 282) | def pytest_configure(config):
function pytest_runtest_call (line 298) | def pytest_runtest_call(item):
function TestReadCSVFixture (line 329) | def TestReadCSVFixture(tmp_path_factory):
function make_csv_file (line 359) | def make_csv_file(tmp_path):
function create_fixture (line 363) | def create_fixture(file_type):
function make_parquet_file (line 378) | def make_parquet_file():
function make_sql_connection (line 448) | def make_sql_connection():
function TestReadGlobCSVFixture (line 478) | def TestReadGlobCSVFixture(tmp_path_factory):
function get_generated_doc_urls (line 492) | def get_generated_doc_urls():
function set_num_partitions (line 497) | def set_num_partitions(request):
function set_benchmark_mode (line 505) | def set_benchmark_mode(request):
function set_async_read_mode (line 513) | def set_async_read_mode(request):
function set_min_row_partition_size (line 521) | def set_min_row_partition_size(request):
function s3_storage_options (line 532) | def s3_storage_options(worker_id):
function monkeysession (line 550) | def monkeysession():
function s3_base (line 556) | def s3_base(worker_id, monkeysession):
function s3_resource (line 638) | def s3_resource(s3_base):
function modify_config (line 710) | def modify_config(request):
function copy_and_restore (line 735) | def copy_and_restore(
function clean_up_extensions (line 764) | def clean_up_extensions():
function clean_up_auto_backend_switching (line 787) | def clean_up_auto_backend_switching():
function assert_no_root_logging (line 799) | def assert_no_root_logging(caplog):
FILE: modin/core/computation/align.py
function _align_core_single_unary_op (line 42) | def _align_core_single_unary_op(
function _zip_axes_from_type (line 58) | def _zip_axes_from_type(
function _any_pandas_objects (line 64) | def _any_pandas_objects(terms) -> bool:
function _filter_special_cases (line 71) | def _filter_special_cases(f) -> Callable[[F], F]:
function _align_core (line 90) | def _align_core(terms):
function align_terms (line 144) | def align_terms(terms):
function reconstruct_object (line 167) | def reconstruct_object(typ, obj, axes, dtype):
FILE: modin/core/computation/common.py
function ensure_decoded (line 28) | def ensure_decoded(s) -> str:
function result_type_many (line 37) | def result_type_many(*arrays_and_dtypes):
FILE: modin/core/computation/engines.py
function _check_ne_builtin_clash (line 41) | def _check_ne_builtin_clash(expr: Expr) -> None:
class AbstractEngine (line 60) | class AbstractEngine(metaclass=abc.ABCMeta):
method __init__ (line 65) | def __init__(self, expr) -> None:
method convert (line 70) | def convert(self) -> str:
method evaluate (line 78) | def evaluate(self) -> object:
method _is_aligned (line 100) | def _is_aligned(self) -> bool:
method _evaluate (line 104) | def _evaluate(self):
class NumExprEngine (line 120) | class NumExprEngine(AbstractEngine):
method _evaluate (line 125) | def _evaluate(self):
class PythonEngine (line 137) | class PythonEngine(AbstractEngine):
method evaluate (line 146) | def evaluate(self):
method _evaluate (line 149) | def _evaluate(self) -> None:
FILE: modin/core/computation/eval.py
function _check_engine (line 39) | def _check_engine(engine: str | None) -> str:
function _check_parser (line 82) | def _check_parser(parser: str):
function _check_resolvers (line 101) | def _check_resolvers(resolvers):
function _check_expression (line 112) | def _check_expression(expr):
function _convert_expression (line 130) | def _convert_expression(expr) -> str:
function _check_for_locals (line 159) | def _check_for_locals(expr: str, stack_level: int, parser: str):
function eval (line 177) | def eval(
FILE: modin/core/computation/expr.py
function _rewrite_assign (line 61) | def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]:
function _replace_booleans (line 80) | def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
function _replace_locals (line 105) | def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]:
function _compose2 (line 131) | def _compose2(f, g):
function _compose (line 138) | def _compose(*funcs):
function _preparse (line 146) | def _preparse(
function _is_type (line 180) | def _is_type(t):
function _filter_nodes (line 199) | def _filter_nodes(superclass, all_nodes=_all_nodes):
function _node_not_implemented (line 259) | def _node_not_implemented(node_name: str) -> Callable[..., None]:
function disallow (line 273) | def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
function _op_maker (line 297) | def _op_maker(op_class, op_symbol):
function add_ops (line 322) | def add_ops(op_classes):
class BaseExprVisitor (line 343) | class BaseExprVisitor(ast.NodeVisitor):
method __init__ (line 396) | def __init__(self, env, engine, parser, preparser=_preparse) -> None:
method visit (line 403) | def visit(self, node, **kwargs):
method visit_Module (line 417) | def visit_Module(self, node, **kwargs):
method visit_Expr (line 423) | def visit_Expr(self, node, **kwargs):
method _rewrite_membership_op (line 426) | def _rewrite_membership_op(self, node, left, right):
method _maybe_transform_eq_ne (line 453) | def _maybe_transform_eq_ne(self, node, left=None, right=None):
method _maybe_downcast_constants (line 461) | def _maybe_downcast_constants(self, left, right):
method _maybe_eval (line 484) | def _maybe_eval(self, binop, eval_in_python):
method _maybe_evaluate_binop (line 495) | def _maybe_evaluate_binop(
method visit_BinOp (line 534) | def visit_BinOp(self, node, **kwargs):
method visit_UnaryOp (line 539) | def visit_UnaryOp(self, node, **kwargs):
method visit_Name (line 544) | def visit_Name(self, node, **kwargs) -> Term:
method visit_NameConstant (line 548) | def visit_NameConstant(self, node, **kwargs) -> Term:
method visit_Num (line 552) | def visit_Num(self, node, **kwargs) -> Term:
method visit_Constant (line 555) | def visit_Constant(self, node, **kwargs) -> Term:
method visit_Str (line 559) | def visit_Str(self, node, **kwargs) -> Term:
method visit_List (line 563) | def visit_List(self, node, **kwargs) -> Term:
method visit_Index (line 569) | def visit_Index(self, node, **kwargs):
method visit_Subscript (line 573) | def visit_Subscript(self, node, **kwargs) -> Term:
method visit_Slice (line 593) | def visit_Slice(self, node, **kwargs) -> slice:
method visit_Assign (line 607) | def visit_Assign(self, node, **kwargs):
method visit_Attribute (line 637) | def visit_Attribute(self, node, **kwargs):
method visit_Call (line 657) | def visit_Call(self, node, side=None, **kwargs):
method translate_In (line 708) | def translate_In(self, op):
method visit_Compare (line 711) | def visit_Compare(self, node, **kwargs):
method _try_visit_binop (line 732) | def _try_visit_binop(self, bop):
method visit_BoolOp (line 737) | def visit_BoolOp(self, node, **kwargs):
class PandasExprVisitor (line 756) | class PandasExprVisitor(BaseExprVisitor):
method __init__ (line 757) | def __init__(
class PythonExprVisitor (line 771) | class PythonExprVisitor(BaseExprVisitor):
method __init__ (line 772) | def __init__(
class Expr (line 778) | class Expr:
method __init__ (line 795) | def __init__(
method assigner (line 811) | def assigner(self):
method __call__ (line 814) | def __call__(self):
method __repr__ (line 817) | def __repr__(self) -> str:
method __len__ (line 820) | def __len__(self) -> int:
method parse (line 823) | def parse(self):
method names (line 830) | def names(self):
FILE: modin/core/computation/ops.py
class Term (line 86) | class Term:
method __new__ (line 87) | def __new__(cls, name, env, side=None, encoding=None):
method __init__ (line 95) | def __init__(self, name, env, side=None, encoding=None) -> None:
method local_name (line 106) | def local_name(self) -> str:
method __repr__ (line 109) | def __repr__(self) -> str:
method __call__ (line 112) | def __call__(self, *args, **kwargs):
method evaluate (line 115) | def evaluate(self, *args, **kwargs) -> Term:
method _resolve_name (line 118) | def _resolve_name(self):
method update (line 135) | def update(self, value) -> None:
method is_scalar (line 154) | def is_scalar(self) -> bool:
method type (line 158) | def type(self):
method raw (line 173) | def raw(self) -> str:
method is_datetime (line 177) | def is_datetime(self) -> bool:
method value (line 186) | def value(self):
method value (line 190) | def value(self, new_value) -> None:
method name (line 194) | def name(self):
method ndim (line 198) | def ndim(self) -> int:
class Constant (line 202) | class Constant(Term):
method _resolve_name (line 203) | def _resolve_name(self):
method name (line 207) | def name(self):
method __repr__ (line 210) | def __repr__(self) -> str:
class Op (line 219) | class Op:
method __init__ (line 226) | def __init__(self, op: str, operands: Iterable[Term | Op], encoding=No...
method __iter__ (line 231) | def __iter__(self) -> Iterator:
method __repr__ (line 234) | def __repr__(self) -> str:
method return_type (line 243) | def return_type(self):
method has_invalid_return_type (line 250) | def has_invalid_return_type(self) -> bool:
method operand_types (line 256) | def operand_types(self):
method is_scalar (line 260) | def is_scalar(self) -> bool:
method is_datetime (line 264) | def is_datetime(self) -> bool:
function _in (line 273) | def _in(x, y):
function _not_in (line 289) | def _not_in(x, y):
function is_term (line 343) | def is_term(obj) -> bool:
class BinOp (line 347) | class BinOp(Op):
method __init__ (line 358) | def __init__(self, op: str, lhs, rhs) -> None:
method __call__ (line 376) | def __call__(self, env):
method evaluate (line 395) | def evaluate(self, env, engine: str, parser, term_type, eval_in_python):
method convert_values (line 444) | def convert_values(self) -> None:
method _disallow_scalar_only_bool_ops (line 477) | def _disallow_scalar_only_bool_ops(self):
function isnumeric (line 499) | def isnumeric(dtype) -> bool:
class UnaryOp (line 508) | class UnaryOp(Op):
method __init__ (line 525) | def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None:
method __call__ (line 536) | def __call__(self, env) -> MathCall:
method __repr__ (line 541) | def __repr__(self) -> str:
method return_type (line 545) | def return_type(self) -> np.dtype:
class MathCall (line 556) | class MathCall(Op):
method __init__ (line 557) | def __init__(self, func, args) -> None:
method __call__ (line 561) | def __call__(self, env):
method __repr__ (line 566) | def __repr__(self) -> str:
class FuncNode (line 571) | class FuncNode:
method __init__ (line 572) | def __init__(self, name: str) -> None:
method __call__ (line 578) | def __call__(self, *args) -> MathCall:
FILE: modin/core/computation/parsing.py
function create_valid_python_identifier (line 36) | def create_valid_python_identifier(name: str) -> str:
function clean_backtick_quoted_toks (line 87) | def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
function clean_column_name (line 113) | def clean_column_name(name: Hashable) -> Hashable:
function tokenize_backtick_quoted_string (line 150) | def tokenize_backtick_quoted_string(
function tokenize_string (line 186) | def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
FILE: modin/core/computation/scope.py
class DeepChainMap (line 39) | class DeepChainMap(ChainMap[_KT, _VT]):
method __setitem__ (line 46) | def __setitem__(self, key: _KT, value: _VT) -> None:
method __delitem__ (line 53) | def __delitem__(self, key: _KT) -> None:
function ensure_scope (line 67) | def ensure_scope(
function _replacer (line 80) | def _replacer(x) -> str:
function _raw_hex_id (line 96) | def _raw_hex_id(obj) -> str:
function _get_pretty_string (line 115) | def _get_pretty_string(obj) -> str:
class Scope (line 134) | class Scope:
method __init__ (line 161) | def __init__(
method __repr__ (line 201) | def __repr__(self) -> str:
method has_resolvers (line 207) | def has_resolvers(self) -> bool:
method resolve (line 220) | def resolve(self, key: str, is_local: bool):
method swapkey (line 259) | def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
method _get_vars (line 284) | def _get_vars(self, stack, scopes: list[str]) -> None:
method _update (line 307) | def _update(self, level: int) -> None:
method add_tmp (line 329) | def add_tmp(self, value) -> str:
method ntemps (line 354) | def ntemps(self) -> int:
method full_scope (line 359) | def full_scope(self) -> DeepChainMap:
FILE: modin/core/dataframe/algebra/binary.py
function maybe_compute_dtypes_common_cast (line 35) | def maybe_compute_dtypes_common_cast(
function maybe_build_dtypes_series (line 176) | def maybe_build_dtypes_series(
function try_compute_new_dtypes (line 225) | def try_compute_new_dtypes(
class Binary (line 293) | class Binary(Operator):
method register (line 297) | def register(
FILE: modin/core/dataframe/algebra/default2pandas/binary.py
class BinaryDefault (line 22) | class BinaryDefault(DefaultMethod):
method build_default_to_pandas (line 26) | def build_default_to_pandas(cls, fn, fn_name):
FILE: modin/core/dataframe/algebra/default2pandas/cat.py
class CatDefault (line 19) | class CatDefault(SeriesDefault):
method frame_wrapper (line 23) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/dataframe.py
class DataFrameDefault (line 24) | class DataFrameDefault(DefaultMethod):
FILE: modin/core/dataframe/algebra/default2pandas/datetime.py
class DateTimeDefault (line 19) | class DateTimeDefault(SeriesDefault):
method frame_wrapper (line 23) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/default.py
class ObjTypeDeterminer (line 23) | class ObjTypeDeterminer:
method __getattr__ (line 31) | def __getattr__(self, key):
class DefaultMethod (line 56) | class DefaultMethod(Operator):
method register (line 72) | def register(cls, func, obj_type=None, inplace=None, fn_name=None):
method build_wrapper (line 178) | def build_wrapper(cls, fn, fn_name):
method build_property_wrapper (line 212) | def build_property_wrapper(cls, prop):
method build_default_to_pandas (line 234) | def build_default_to_pandas(cls, fn, fn_name):
method frame_wrapper (line 259) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/groupby.py
class GroupBy (line 32) | class GroupBy:
method is_transformation_kernel (line 43) | def is_transformation_kernel(agg_func: Any) -> bool:
method _call_groupby (line 65) | def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01
method validate_by (line 72) | def validate_by(cls, by):
method inplace_applyier_builder (line 109) | def inplace_applyier_builder(cls, key, func=None):
method get_func (line 134) | def get_func(cls, key, **kwargs):
method build_aggregate_method (line 167) | def build_aggregate_method(cls, key):
method build_groupby_reduce_method (line 205) | def build_groupby_reduce_method(cls, agg_func):
method is_aggregate (line 295) | def is_aggregate(cls, key): # noqa: PR01
method build_groupby (line 300) | def build_groupby(cls, func):
method handle_as_index_for_dataframe (line 320) | def handle_as_index_for_dataframe(
method handle_as_index (line 393) | def handle_as_index(
class SeriesGroupBy (line 558) | class SeriesGroupBy(GroupBy):
method _call_groupby (line 562) | def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01
class GroupByDefault (line 578) | class GroupByDefault(DefaultMethod):
method register (line 586) | def register(cls, func, **kwargs):
method get_aggregation_method (line 624) | def get_aggregation_method(cls, how):
class SeriesGroupByDefault (line 644) | class SeriesGroupByDefault(GroupByDefault):
FILE: modin/core/dataframe/algebra/default2pandas/list.py
class ListDefault (line 19) | class ListDefault(SeriesDefault):
method frame_wrapper (line 23) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/resample.py
class Resampler (line 21) | class Resampler:
method build_resample (line 25) | def build_resample(cls, func, squeeze_self):
class ResampleDefault (line 57) | class ResampleDefault(DefaultMethod):
method register (line 63) | def register(cls, func, squeeze_self=False, **kwargs):
FILE: modin/core/dataframe/algebra/default2pandas/rolling.py
class RollingDefault (line 19) | class RollingDefault(DefaultMethod):
method _build_rolling (line 25) | def _build_rolling(cls, func):
method register (line 52) | def register(cls, func, **kwargs):
class ExpandingDefault (line 74) | class ExpandingDefault(DefaultMethod):
method _build_expanding (line 80) | def _build_expanding(cls, func, squeeze_self):
method register (line 111) | def register(cls, func, squeeze_self=False, **kwargs):
FILE: modin/core/dataframe/algebra/default2pandas/series.py
class SeriesDefault (line 19) | class SeriesDefault(DefaultMethod):
method frame_wrapper (line 25) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/str.py
class StrDefault (line 19) | class StrDefault(SeriesDefault):
method frame_wrapper (line 23) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/default2pandas/struct.py
class StructDefault (line 19) | class StructDefault(SeriesDefault):
method frame_wrapper (line 23) | def frame_wrapper(cls, df):
FILE: modin/core/dataframe/algebra/fold.py
class Fold (line 28) | class Fold(Operator):
method register (line 32) | def register(
FILE: modin/core/dataframe/algebra/groupby.py
class GroupByReduce (line 33) | class GroupByReduce(TreeReduce):
method register (line 55) | def register(
method register_implementation (line 105) | def register_implementation(
method map (line 124) | def map(
method reduce (line 211) | def reduce(
method caller (line 303) | def caller(
method get_callable (line 453) | def get_callable(
method _build_callable_for_dict (line 496) | def _build_callable_for_dict(
method is_registered_implementation (line 672) | def is_registered_implementation(cls, func: Callable) -> bool:
method build_map_reduce_functions (line 687) | def build_map_reduce_functions(
FILE: modin/core/dataframe/algebra/map.py
class Map (line 28) | class Map(Operator):
method register (line 32) | def register(
FILE: modin/core/dataframe/algebra/operator.py
class Operator (line 21) | class Operator(object):
method __init__ (line 24) | def __init__(self) -> None:
method register (line 32) | def register(cls, func: Callable, **kwargs: dict):
method validate_axis (line 50) | def validate_axis(cls, axis: Optional[int]) -> int:
FILE: modin/core/dataframe/algebra/reduce.py
class Reduce (line 28) | class Reduce(Operator):
method register (line 32) | def register(
FILE: modin/core/dataframe/algebra/tree_reduce.py
class TreeReduce (line 29) | class TreeReduce(Operator):
method register (line 33) | def register(
FILE: modin/core/dataframe/base/dataframe/dataframe.py
class ModinDataframe (line 26) | class ModinDataframe(ABC):
method take_2d_labels_or_positional (line 38) | def take_2d_labels_or_positional(
method filter_by_types (line 75) | def filter_by_types(self, types: List[Hashable]) -> "ModinDataframe":
method map (line 92) | def map(
method filter (line 128) | def filter(self, axis: Union[int, Axis], condition: Callable) -> "Modi...
method explode (line 149) | def explode(
method window (line 183) | def window(
method groupby (line 219) | def groupby(
method reduce (line 262) | def reduce(
method tree_reduce (line 294) | def tree_reduce(
method infer_types (line 335) | def infer_types(self, columns_list: List[str]) -> "ModinDataframe":
method join (line 352) | def join(
method concat (line 391) | def concat(
method transpose (line 420) | def transpose(self) -> "ModinDataframe":
method to_labels (line 438) | def to_labels(self, column_labels: Union[str, List[str]]) -> "ModinDat...
method from_labels (line 460) | def from_labels(self) -> "ModinDataframe":
method rename (line 477) | def rename(
method sort_by (line 500) | def sort_by(
FILE: modin/core/dataframe/base/dataframe/utils.py
class Axis (line 30) | class Axis(Enum): # noqa: PR01
class JoinType (line 45) | class JoinType(Enum): # noqa: PR01
function join_columns (line 59) | def join_columns(
function is_trivial_index (line 158) | def is_trivial_index(index: pandas.Index) -> bool:
FILE: modin/core/dataframe/base/interchange/dataframe_protocol/dataframe.py
class ColumnBuffers (line 26) | class ColumnBuffers(TypedDict): # noqa: GL08
class CategoricalDescription (line 43) | class CategoricalDescription(TypedDict): # noqa: GL08
class ProtocolBuffer (line 52) | class ProtocolBuffer(ABC):
method bufsize (line 69) | def bufsize(self) -> int:
method ptr (line 81) | def ptr(self) -> int:
method __dlpack__ (line 92) | def __dlpack__(self) -> Any:
method __dlpack_device__ (line 111) | def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
class ProtocolColumn (line 137) | class ProtocolColumn(ABC):
method size (line 178) | def size(self) -> int:
method offset (line 197) | def offset(self) -> int:
method dtype (line 214) | def dtype(self) -> Tuple[DTypeKind, int, str, str]:
method describe_categorical (line 251) | def describe_categorical(self) -> CategoricalDescription:
method describe_null (line 280) | def describe_null(self) -> Tuple[ColumnNullType, Any]:
method null_count (line 299) | def null_count(self) -> int:
method metadata (line 315) | def metadata(self) -> Dict[str, Any]:
method num_chunks (line 328) | def num_chunks(self) -> int:
method get_chunks (line 340) | def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Prot...
method get_buffers (line 365) | def get_buffers(self) -> ColumnBuffers:
class ProtocolDataframe (line 387) | class ProtocolDataframe(ABC):
method __dataframe__ (line 406) | def __dataframe__(
method metadata (line 436) | def metadata(self) -> Dict[str, Any]:
method num_columns (line 454) | def num_columns(self) -> int:
method num_rows (line 466) | def num_rows(self) -> Optional[int]:
method num_chunks (line 478) | def num_chunks(self) -> int:
method column_names (line 490) | def column_names(self) -> Iterable[str]:
method get_column (line 502) | def get_column(self, i: int) -> ProtocolColumn:
method get_column_by_name (line 519) | def get_column_by_name(self, name: str) -> ProtocolColumn:
method get_columns (line 536) | def get_columns(self) -> Iterable[ProtocolColumn]:
method select_columns (line 548) | def select_columns(self, indices: Sequence[int]) -> "ProtocolDataframe":
method select_columns_by_name (line 565) | def select_columns_by_name(self, names: Sequence[str]) -> "ProtocolDat...
method get_chunks (line 582) | def get_chunks(
FILE: modin/core/dataframe/base/interchange/dataframe_protocol/utils.py
class DTypeKind (line 29) | class DTypeKind(enum.IntEnum): # noqa PR01
class ColumnNullType (line 59) | class ColumnNullType(enum.IntEnum): # noqa PR01
class DlpackDeviceType (line 83) | class DlpackDeviceType(enum.IntEnum): # noqa PR01
class ArrowCTypes (line 96) | class ArrowCTypes:
class Endianness (line 129) | class Endianness:
function pandas_dtype_to_arrow_c (line 138) | def pandas_dtype_to_arrow_c(dtype: Union[np.dtype, pandas.CategoricalDty...
function raise_copy_alert (line 172) | def raise_copy_alert(copy_reason: Optional[str] = None) -> None:
FILE: modin/core/dataframe/base/partitioning/axis_partition.py
class BaseDataframeAxisPartition (line 23) | class BaseDataframeAxisPartition(
method list_of_blocks (line 39) | def list_of_blocks(self) -> list:
method apply (line 43) | def apply(
method _wrap_partitions (line 101) | def _wrap_partitions(
method force_materialization (line 149) | def force_materialization(
method unwrap (line 170) | def unwrap(
FILE: modin/core/dataframe/pandas/dataframe/dataframe.py
class PandasDataframe (line 82) | class PandasDataframe(
method storage_format (line 126) | def storage_format(self) -> str:
method engine (line 139) | def engine(self) -> str:
method __constructor__ (line 151) | def __constructor__(self) -> type[PandasDataframe]:
method __init__ (line 161) | def __init__(
method _validate_axes_lengths (line 191) | def _validate_axes_lengths(self):
method num_parts (line 231) | def num_parts(self) -> int:
method row_lengths (line 242) | def row_lengths(self):
method _get_lengths (line 260) | def _get_lengths(cls, parts, axis):
method __len__ (line 280) | def __len__(self) -> int:
method column_widths (line 295) | def column_widths(self):
method _set_axis_lengths_cache (line 312) | def _set_axis_lengths_cache(self, value, axis=0):
method _get_axis_lengths_cache (line 327) | def _get_axis_lengths_cache(self, axis=0):
method _get_axis_lengths (line 343) | def _get_axis_lengths(self, axis: int = 0) -> List[int]:
method has_dtypes_cache (line 358) | def has_dtypes_cache(self) -> bool:
method has_materialized_dtypes (line 369) | def has_materialized_dtypes(self) -> bool:
method copy_dtypes_cache (line 379) | def copy_dtypes_cache(self):
method _maybe_update_proxies (line 393) | def _maybe_update_proxies(self, dtypes, new_parent=None):
method set_dtypes_cache (line 415) | def set_dtypes_cache(self, dtypes):
method dtypes (line 441) | def dtypes(self):
method get_dtypes_set (line 460) | def get_dtypes_set(self):
method _compute_dtypes (line 472) | def _compute_dtypes(self, columns=None) -> pandas.Series:
method set_index_cache (line 517) | def set_index_cache(self, index):
method set_columns_cache (line 533) | def set_columns_cache(self, columns):
method set_axis_cache (line 549) | def set_axis_cache(self, value, axis=0):
method has_axis_cache (line 563) | def has_axis_cache(self, axis=0) -> bool:
method has_index_cache (line 578) | def has_index_cache(self):
method copy_index_cache (line 588) | def copy_index_cache(self, copy_lengths=False):
method _get_axis_cache (line 608) | def _get_axis_cache(self, axis=0) -> ModinIndex:
method has_columns_cache (line 623) | def has_columns_cache(self):
method copy_columns_cache (line 633) | def copy_columns_cache(self, copy_lengths=False):
method copy_axis_cache (line 653) | def copy_axis_cache(self, axis=0, copy_lengths=False):
method has_materialized_index (line 675) | def has_materialized_index(self):
method has_materialized_columns (line 686) | def has_materialized_columns(self):
method _validate_set_axis (line 696) | def _validate_set_axis(self, new_labels, old_labels):
method _get_index (line 726) | def _get_index(self):
method _get_columns (line 744) | def _get_columns(self):
method _set_index (line 762) | def _set_index(self, new_index):
method _set_columns (line 776) | def _set_columns(self, new_columns):
method axes (line 816) | def axes(self):
method get_axis (line 827) | def get_axis(self, axis: int = 0) -> pandas.Index:
method _compute_axis_labels_and_lengths (line 841) | def _compute_axis_labels_and_lengths(self, axis: int, partitions=None):
method _filter_empties (line 865) | def _filter_empties(self, compute_metadata=True):
method synchronize_labels (line 916) | def synchronize_labels(self, axis=None):
method _propagate_index_objs (line 934) | def _propagate_index_objs(self, axis=None) -> None:
method take_2d_labels_or_positional (line 1041) | def take_2d_labels_or_positional(
method _get_sorted_positions (line 1113) | def _get_sorted_positions(self, positions):
method _get_new_lengths (line 1132) | def _get_new_lengths(self, partitions_dict, *, axis: int) -> List[int]:
method _get_new_index_obj (line 1164) | def _get_new_index_obj(
method _take_2d_positional (line 1199) | def _take_2d_positional(
method _maybe_reorder_labels (line 1382) | def _maybe_reorder_labels(
method from_labels (line 1451) | def from_labels(self) -> PandasDataframe:
method to_labels (line 1556) | def to_labels(self, column_list: List[Hashable]) -> PandasDataframe:
method _reorder_labels (line 1589) | def _reorder_labels(self, row_positions=None, col_positions=None):
method copy (line 1687) | def copy(self):
method astype (line 1707) | def astype(self, col_dtypes, errors: str = "raise"):
method numeric_columns (line 1808) | def numeric_columns(self, include_bool=True):
method _get_dict_of_block_index (line 1830) | def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=F...
method _join_index_objects (line 1989) | def _join_index_objects(axis, indexes, how, sort, fill_value=None):
method _build_treereduce_func (line 2081) | def _build_treereduce_func(self, axis, func):
method _compute_tree_reduce_metadata (line 2125) | def _compute_tree_reduce_metadata(self, axis, new_parts, dtypes=None):
method reduce (line 2171) | def reduce(
method tree_reduce (line 2208) | def tree_reduce(
method map (line 2253) | def map(
method window (line 2321) | def window(
method fold (line 2357) | def fold(self, axis, func, new_index=None, new_columns=None, shape_pre...
method infer_objects (line 2402) | def infer_objects(self) -> PandasDataframe:
method infer_types (line 2420) | def infer_types(self, col_labels: List[str]) -> PandasDataframe:
method join (line 2449) | def join(
method rename (line 2487) | def rename(
method combine_and_apply (line 2518) | def combine_and_apply(
method _apply_func_to_range_partitioning (line 2565) | def _apply_func_to_range_partitioning(
method sort_by (line 2742) | def sort_by(
method filter (line 2816) | def filter(self, axis: Union[Axis, int], condition: Callable) -> Panda...
method filter_by_types (line 2859) | def filter_by_types(self, types: List[Hashable]) -> PandasDataframe:
method explode (line 2878) | def explode(self, axis: Union[int, Axis], func: Callable) -> PandasDat...
method combine (line 2918) | def combine(self) -> PandasDataframe:
method apply_full_axis (line 2956) | def apply_full_axis(
method apply_full_axis_select_indices (line 3035) | def apply_full_axis_select_indices(
method apply_select_indices (line 3110) | def apply_select_indices(
method broadcast_apply (line 3233) | def broadcast_apply(
method _prepare_frame_to_broadcast (line 3337) | def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all):
method _extract_partitions (line 3375) | def _extract_partitions(self):
method broadcast_apply_select_indices (line 3398) | def broadcast_apply_select_indices(
method broadcast_apply_full_axis (line 3483) | def broadcast_apply_full_axis(
method _check_if_axes_identical (line 3678) | def _check_if_axes_identical(self, other: PandasDataframe, axis: int =...
method _copartition (line 3709) | def _copartition(
method n_ary_op (line 3851) | def n_ary_op(
method concat (line 3953) | def concat(
method _apply_func_to_range_partitioning_broadcast (line 4087) | def _apply_func_to_range_partitioning_broadcast(
method groupby (line 4163) | def groupby(
method groupby_reduce (line 4530) | def groupby_reduce(
method from_pandas (line 4592) | def from_pandas(cls, df):
method from_arrow (line 4623) | def from_arrow(cls, at):
method _arrow_type_to_dtype (line 4657) | def _arrow_type_to_dtype(cls, arrow_type):
method to_pandas (line 4692) | def to_pandas(self):
method to_numpy (line 4724) | def to_numpy(self, **kwargs):
method transpose (line 4747) | def transpose(self):
method finalize (line 4780) | def finalize(self):
method wait_computations (line 4789) | def wait_computations(self):
method support_materialization_in_worker_process (line 4793) | def support_materialization_in_worker_process(self) -> bool:
method __dataframe__ (line 4803) | def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = ...
method from_interchange_dataframe (line 4837) | def from_interchange_dataframe(cls, df: ProtocolDataframe) -> PandasDa...
method case_when (line 4869) | def case_when(self, caselist):
FILE: modin/core/dataframe/pandas/dataframe/utils.py
class ShuffleFunctions (line 35) | class ShuffleFunctions:
method __init__ (line 53) | def __init__(
method sample_fn (line 59) | def sample_fn(self, partition: pandas.DataFrame) -> pandas.DataFrame:
method pivot_fn (line 75) | def pivot_fn(self, samples: "list[pandas.DataFrame]") -> int:
method split_fn (line 91) | def split_fn(self, partition: pandas.DataFrame) -> "tuple[pandas.DataF...
class ShuffleSortFunctions (line 111) | class ShuffleSortFunctions(ShuffleFunctions):
method __init__ (line 135) | def __init__(
method sample_fn (line 154) | def sample_fn(self, partition: pandas.DataFrame) -> pandas.DataFrame:
method pivot_fn (line 163) | def pivot_fn(self, samples: "list[pandas.DataFrame]") -> int:
method split_fn (line 194) | def split_fn(
method _find_quantiles (line 212) | def _find_quantiles(
method pick_samples_for_quantiles (line 253) | def pick_samples_for_quantiles(
method pick_pivots_from_samples_for_sort (line 288) | def pick_pivots_from_samples_for_sort(
method split_partitions_using_pivots_for_sort (line 335) | def split_partitions_using_pivots_for_sort(
method _index_to_df_zero_copy (line 478) | def _index_to_df_zero_copy(
class ShuffleResample (line 509) | class ShuffleResample(ShuffleSortFunctions):
method __init__ (line 510) | def __init__(
method pick_samples_for_quantiles (line 543) | def pick_samples_for_quantiles(
method pick_pivots_from_samples_for_sort (line 552) | def pick_pivots_from_samples_for_sort(
method _adjust_bin_edges (line 598) | def _adjust_bin_edges(
method split_partitions_using_pivots_for_sort (line 652) | def split_partitions_using_pivots_for_sort(
function lazy_metadata_decorator (line 680) | def lazy_metadata_decorator(apply_axis=None, axis_arg=-1, transpose=False):
function add_missing_categories_to_groupby (line 774) | def add_missing_categories_to_groupby(
FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/buffer.py
class PandasProtocolBuffer (line 40) | class PandasProtocolBuffer(ProtocolBuffer):
method __init__ (line 66) | def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
method bufsize (line 83) | def bufsize(self) -> int:
method ptr (line 87) | def ptr(self) -> int:
method __dlpack__ (line 90) | def __dlpack__(self):
method __dlpack_device__ (line 93) | def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
method __repr__ (line 99) | def __repr__(self) -> str:
FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/column.py
class PandasProtocolColumn (line 59) | class PandasProtocolColumn(ProtocolColumn):
method __init__ (line 110) | def __init__(self, column: PandasDataframe, allow_copy: bool = True) -...
method size (line 117) | def size(self) -> int:
method offset (line 121) | def offset(self) -> int:
method dtype (line 125) | def dtype(self) -> Tuple[DTypeKind, int, str, str]:
method _dtype_from_primitive_pandas_dtype (line 150) | def _dtype_from_primitive_pandas_dtype(
method describe_categorical (line 187) | def describe_categorical(self) -> CategoricalDescription:
method describe_null (line 205) | def describe_null(self) -> Tuple[int, Any]:
method null_count (line 228) | def null_count(self) -> int:
method metadata (line 248) | def metadata(self) -> Dict[str, Any]:
method num_chunks (line 251) | def num_chunks(self) -> int:
method get_chunks (line 254) | def get_chunks(
method get_buffers (line 309) | def get_buffers(self) -> Dict[str, Any]:
method _get_data_buffer (line 326) | def _get_data_buffer(
method _get_validity_buffer (line 384) | def _get_validity_buffer(self) -> Tuple[PandasProtocolBuffer, Any]:
method _get_offsets_buffer (line 435) | def _get_offsets_buffer(self) -> Tuple[PandasProtocolBuffer, Any]:
FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py
class PandasProtocolDataframe (line 43) | class PandasProtocolDataframe(ProtocolDataframe):
method __init__ (line 74) | def __init__(
method __dataframe__ (line 84) | def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = ...
method metadata (line 90) | def metadata(self) -> Dict[str, Any]:
method num_columns (line 93) | def num_columns(self) -> int:
method num_rows (line 96) | def num_rows(self) -> int:
method num_chunks (line 99) | def num_chunks(self) -> int:
method column_names (line 102) | def column_names(self) -> Iterable[str]:
method get_column (line 106) | def get_column(self, i: int) -> PandasProtocolColumn:
method get_column_by_name (line 114) | def get_column_by_name(self, name: str) -> PandasProtocolColumn:
method get_columns (line 122) | def get_columns(self) -> Iterable[PandasProtocolColumn]:
method select_columns (line 131) | def select_columns(self, indices: Sequence[int]) -> "PandasProtocolDat...
method select_columns_by_name (line 142) | def select_columns_by_name(self, names: Sequence[str]) -> "PandasProto...
method get_chunks (line 151) | def get_chunks(
FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/exception.py
class NoValidityBuffer (line 17) | class NoValidityBuffer(Exception):
class NoOffsetsBuffer (line 23) | class NoOffsetsBuffer(Exception):
FILE: modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py
function from_dataframe_to_pandas (line 44) | def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[i...
function protocol_df_chunk_to_pandas (line 80) | def protocol_df_chunk_to_pandas(df):
function unpack_protocol_column (line 110) | def unpack_protocol_column(
function primitive_column_to_ndarray (line 145) | def primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray...
function categorical_column_to_series (line 167) | def categorical_column_to_series(col: ProtocolColumn) -> Tuple[pandas.Se...
function _inverse_null_buf (line 206) | def _inverse_null_buf(buf: np.ndarray, null_kind: ColumnNullType) -> np....
function string_column_to_ndarray (line 232) | def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, A...
function datetime_column_to_ndarray (line 314) | def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray,...
function buffer_to_ndarray (line 380) | def buffer_to_ndarray(
function bitmask_to_bool_ndarray (line 434) | def bitmask_to_bool_ndarray(
function set_nulls (line 488) | def set_nulls(
FILE: modin/core/dataframe/pandas/metadata/dtypes.py
class DtypesDescriptor (line 31) | class DtypesDescriptor:
method __init__ (line 62) | def __init__(
method update_parent (line 136) | def update_parent(self, new_parent: PandasDataframe):
method columns_order (line 150) | def columns_order(self) -> Optional[dict[int, IndexLabel]]:
method __repr__ (line 190) | def __repr__(self): # noqa: GL08
method __str__ (line 201) | def __str__(self): # noqa: GL08
method lazy_get (line 204) | def lazy_get(
method copy (line 259) | def copy(self) -> DtypesDescriptor:
method set_index (line 281) | def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> Dty...
method equals (line 326) | def equals(self, other: DtypesDescriptor) -> bool:
method is_materialized (line 349) | def is_materialized(self) -> bool:
method _materialize_all_names (line 359) | def _materialize_all_names(self):
method _materialize_cols_with_unknown_dtypes (line 375) | def _materialize_cols_with_unknown_dtypes(self):
method materialize (line 396) | def materialize(self):
method to_series (line 432) | def to_series(self) -> pandas.Series:
method get_dtypes_set (line 443) | def get_dtypes_set(self) -> set[DtypeObj]:
method _merge_dtypes (line 459) | def _merge_dtypes(
method concat (line 549) | def concat(
method _normalize_levels (line 635) | def _normalize_levels(columns, reference=None):
method _normalize_self_levels (line 715) | def _normalize_self_levels(self, reference=None):
class ModinDtypes (line 734) | class ModinDtypes:
method __init__ (line 743) | def __init__(
method __repr__ (line 758) | def __repr__(self): # noqa: GL08
method __str__ (line 761) | def __str__(self): # noqa: GL08
method is_materialized (line 765) | def is_materialized(self) -> bool:
method get_dtypes_set (line 775) | def get_dtypes_set(self) -> set[DtypeObj]:
method maybe_specify_new_frame_ref (line 789) | def maybe_specify_new_frame_ref(self, new_parent: PandasDataframe) -> ...
method lazy_get (line 811) | def lazy_get(self, ids: list, numeric_index: bool = False) -> ModinDty...
method concat (line 843) | def concat(cls, values: list, axis: int = 0) -> ModinDtypes:
method set_index (line 887) | def set_index(self, new_index: Union[pandas.Index, ModinIndex]) -> Mod...
method get (line 914) | def get(self) -> pandas.Series:
method __len__ (line 933) | def __len__(self):
method __reduce__ (line 949) | def __reduce__(self):
method __getattr__ (line 965) | def __getattr__(self, name):
method copy (line 991) | def copy(self) -> ModinDtypes:
method __getitem__ (line 1004) | def __getitem__(self, key): # noqa: GL08
method __setitem__ (line 1009) | def __setitem__(self, key, item): # noqa: GL08
method __iter__ (line 1014) | def __iter__(self): # noqa: GL08
method __contains__ (line 1019) | def __contains__(self, key): # noqa: GL08
class LazyProxyCategoricalDtype (line 1025) | class LazyProxyCategoricalDtype(pandas.CategoricalDtype):
method __init__ (line 1041) | def __init__(self, categories=None, ordered=False):
method update_dtypes (line 1052) | def update_dtypes(dtypes, new_parent):
method _update_proxy (line 1067) | def _update_proxy(self, parent, column_name):
method _build_proxy (line 1091) | def _build_proxy(cls, parent, column_name, materializer, dtype=None):
method _get_dtype (line 1117) | def _get_dtype(self):
method __reduce__ (line 1129) | def __reduce__(self):
method _categories (line 1145) | def _categories(self):
method _categories (line 1158) | def _categories(self, categories):
method _is_materialized (line 1172) | def _is_materialized(self) -> bool:
method _materialize_categories (line 1182) | def _materialize_categories(self):
function get_categories_dtype (line 1193) | def get_categories_dtype(
function extract_dtype (line 1214) | def extract_dtype(value) -> DtypeObj | pandas.Series:
FILE: modin/core/dataframe/pandas/metadata/index.py
class ModinIndex (line 24) | class ModinIndex:
method __init__ (line 47) | def __init__(self, value=None, axis=None, dtypes: Optional[pandas.Seri...
method maybe_get_dtypes (line 72) | def maybe_get_dtypes(self) -> Optional[pandas.Series]:
method _get_default_callable (line 92) | def _get_default_callable(dataframe_obj, axis):
method maybe_specify_new_frame_ref (line 108) | def maybe_specify_new_frame_ref(self, value, axis) -> "ModinIndex":
method is_materialized (line 146) | def is_materialized(self) -> bool:
method is_materialized_index (line 157) | def is_materialized_index(cls, index) -> bool:
method get (line 177) | def get(self, return_lengths=False) -> pandas.Index:
method equals (line 207) | def equals(self, other: "ModinIndex") -> bool:
method compare_partition_lengths_if_possible (line 231) | def compare_partition_lengths_if_possible(self, other: "ModinIndex"):
method __len__ (line 264) | def __len__(self):
method __reduce__ (line 280) | def __reduce__(self):
method __getitem__ (line 305) | def __getitem__(self, key):
method __getattr__ (line 321) | def __getattr__(self, name):
method copy (line 347) | def copy(self, copy_lengths=False) -> "ModinIndex":
FILE: modin/core/dataframe/pandas/partitioning/axis_partition.py
class PandasDataframeAxisPartition (line 33) | class PandasDataframeAxisPartition(BaseDataframeAxisPartition):
method __init__ (line 57) | def __init__(
method list_of_blocks (line 91) | def list_of_blocks(self):
method list_of_block_partitions (line 107) | def list_of_block_partitions(self) -> list:
method _get_drain_func (line 149) | def _get_drain_func(cls): # noqa: GL08
method drain_call_queue (line 152) | def drain_call_queue(self, num_splits=None):
method force_materialization (line 181) | def force_materialization(self, get_ip=False):
method apply (line 199) | def apply(
method split (line 311) | def split(
method deploy_splitting_func (line 354) | def deploy_splitting_func(
method deploy_axis_func (line 396) | def deploy_axis_func(
method deploy_func_between_two_axis_partitions (line 502) | def deploy_func_between_two_axis_partitions(
method drain (line 596) | def drain(cls, df: pandas.DataFrame, call_queue: list):
method mask (line 614) | def mask(self, row_indices, col_indices):
method to_pandas (line 636) | def to_pandas(self):
method to_numpy (line 646) | def to_numpy(self):
method length (line 658) | def length(self, materialize=True):
method width (line 687) | def width(self, materialize=True):
method wait (line 712) | def wait(self):
method add_to_apply_calls (line 716) | def add_to_apply_calls(self, func, *args, length=None, width=None, **k...
FILE: modin/core/dataframe/pandas/partitioning/partition.py
class PandasDataframePartition (line 33) | class PandasDataframePartition(
method __init__ (line 52) | def __init__(self):
method __constructor__ (line 66) | def __constructor__(self) -> type[PandasDataframePartition]:
method get (line 77) | def get(self):
method list_of_blocks (line 100) | def list_of_blocks(self):
method apply (line 114) | def apply(self, func, *args, **kwargs):
method add_to_apply_calls (line 140) | def add_to_apply_calls(self, func, *args, length=None, width=None, **k...
method drain_call_queue (line 174) | def drain_call_queue(self):
method wait (line 178) | def wait(self):
method to_pandas (line 182) | def to_pandas(self):
method to_numpy (line 199) | def to_numpy(self, **kwargs):
method _iloc (line 220) | def _iloc(df, row_labels, col_labels): # noqa: RT01, PR01
method mask (line 224) | def mask(self, row_labels, col_labels):
method put (line 277) | def put(cls, obj):
method preprocess_func (line 294) | def preprocess_func(cls, func):
method _length_extraction_fn (line 318) | def _length_extraction_fn(cls):
method _width_extraction_fn (line 330) | def _width_extraction_fn(cls):
method length (line 341) | def length(self, materialize=True):
method width (line 361) | def width(self, materialize=True):
method _identity (line 382) | def _identity(self):
method split (line 394) | def split(self, split_func, num_splits, *args):
method empty (line 424) | def empty(cls):
method _is_debug (line 435) | def _is_debug(self, logger=None):
FILE: modin/core/dataframe/pandas/partitioning/partition_manager.py
function wait_computations_if_benchmark_mode (line 52) | def wait_computations_if_benchmark_mode(func):
class PandasDataframePartitionManager (line 95) | class PandasDataframePartitionManager(
method materialize_futures (line 113) | def materialize_futures(cls, input_list):
method preprocess_func (line 143) | def preprocess_func(cls, map_func):
method create_partition_from_metadata (line 190) | def create_partition_from_metadata(
method column_partitions (line 216) | def column_partitions(cls, partitions, full_axis=True):
method row_partitions (line 246) | def row_partitions(cls, partitions):
method axis_partition (line 270) | def axis_partition(cls, partitions, axis, full_axis: bool = True):
method groupby_reduce (line 303) | def groupby_reduce(
method broadcast_apply_select_indices (line 361) | def broadcast_apply_select_indices(
method base_broadcast_apply (line 443) | def base_broadcast_apply(cls, axis, apply_func, left, right):
method broadcast_axis_partitions (line 498) | def broadcast_axis_partitions(
method base_map_partitions (line 615) | def base_map_partitions(
method broadcast_apply (line 658) | def broadcast_apply(
method map_partitions (line 708) | def map_partitions(
method lazy_map_partitions (line 773) | def lazy_map_partitions(
method map_axis_partitions (line 818) | def map_axis_partitions(
method map_partitions_joined_by_column (line 882) | def map_partitions_joined_by_column(
method concat (line 943) | def concat(cls, axis, left_parts, right_parts):
method to_pandas (line 989) | def to_pandas(cls, partitions):
method to_numpy (line 1008) | def to_numpy(cls, partitions, **kwargs):
method split_pandas_df_into_partitions (line 1029) | def split_pandas_df_into_partitions(
method from_pandas (line 1070) | def from_pandas(cls, df, return_dims=False):
method from_arrow (line 1152) | def from_arrow(cls, at, return_dims=False):
method get_objects_from_partitions (line 1172) | def get_objects_from_partitions(cls, partitions):
method wait_partitions (line 1200) | def wait_partitions(cls, partitions):
method get_indices (line 1220) | def get_indices(cls, axis, partitions, index_func=None):
method _apply_func_to_list_of_partitions_broadcast (line 1270) | def _apply_func_to_list_of_partitions_broadcast(
method _apply_func_to_list_of_partitions (line 1302) | def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs):
method combine (line 1328) | def combine(cls, partitions, new_index=None, new_columns=None):
method apply_func_to_select_indices (line 1377) | def apply_func_to_select_indices(
method apply_func_to_select_indices_along_full_axis (line 1499) | def apply_func_to_select_indices_along_full_axis(
method apply_func_to_indices_both_axis (line 1618) | def apply_func_to_indices_both_axis(
method n_ary_operation (line 1725) | def n_ary_operation(cls, left, func, right: list):
method finalize (line 1791) | def finalize(cls, partitions):
method rebalance_partitions (line 1803) | def rebalance_partitions(cls, partitions):
method shuffle_partitions (line 1937) | def shuffle_partitions(
FILE: modin/core/dataframe/pandas/utils.py
function concatenate (line 23) | def concatenate(dfs, copy=True):
function create_pandas_df_from_partitions (line 74) | def create_pandas_df_from_partitions(
FILE: modin/core/execution/dask/common/engine_wrapper.py
function get_dask_client (line 25) | def get_dask_client():
function _deploy_dask_func (line 43) | def _deploy_dask_func(func, *args, return_pandas_df=None, **kwargs): # ...
class DaskWrapper (line 69) | class DaskWrapper:
method deploy (line 73) | def deploy(
method is_future (line 128) | def is_future(cls, item):
method materialize (line 145) | def materialize(cls, future):
method put (line 163) | def put(cls, data, **kwargs):
method wait (line 190) | def wait(cls, obj_ids, num_returns=None):
FILE: modin/core/execution/dask/common/utils.py
function initialize_dask (line 30) | def initialize_dask():
FILE: modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
class PandasOnDaskDataframe (line 22) | class PandasOnDaskDataframe(PandasDataframe):
method reconnect (line 49) | def reconnect(cls, address, attributes): # noqa: GL08
method __reduce__ (line 65) | def __reduce__(self): # noqa: GL08
method engine (line 73) | def engine(self) -> str:
FILE: modin/core/execution/dask/implementations/pandas_on_dask/io/io.py
class PandasOnDaskIO (line 68) | class PandasOnDaskIO(BaseIO):
method __make_read (line 81) | def __make_read(*classes, build_args=build_args):
method __make_write (line 85) | def __make_write(*classes, build_args=build_args):
method from_dask (line 142) | def from_dask(cls, dask_obj):
method to_dask (line 162) | def to_dask(cls, modin_obj):
method from_map (line 195) | def from_map(cls, func, iterable, *args, **kwargs):
FILE: modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py
class PandasOnDaskDataframePartition (line 26) | class PandasOnDaskDataframePartition(PandasDataframePartition):
method __init__ (line 46) | def __init__(self, data, length=None, width=None, ip=None, call_queue=...
method apply (line 67) | def apply(self, func, *args, **kwargs):
method drain_call_queue (line 117) | def drain_call_queue(self):
method wait (line 155) | def wait(self):
method mask (line 160) | def mask(self, row_labels, col_labels):
method __copy__ (line 198) | def __copy__(self):
method put (line 216) | def put(cls, obj):
method preprocess_func (line 237) | def preprocess_func(cls, func):
method length (line 253) | def length(self, materialize=True):
method width (line 275) | def width(self, materialize=True):
method ip (line 297) | def ip(self, materialize=True):
function apply_func (line 320) | def apply_func(partition, func, *args, **kwargs):
function apply_list_of_funcs (line 351) | def apply_list_of_funcs(call_queue, partition):
FILE: modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition_manager.py
class PandasOnDaskDataframePartitionManager (line 28) | class PandasOnDaskDataframePartitionManager(PandasDataframePartitionMana...
method wait_partitions (line 38) | def wait_partitions(cls, partitions):
FILE: modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py
class PandasOnDaskDataframeVirtualPartition (line 28) | class PandasOnDaskDataframeVirtualPartition(PandasDataframeAxisPartition):
method list_of_ips (line 55) | def list_of_ips(self):
method deploy_splitting_func (line 73) | def deploy_splitting_func(
method deploy_axis_func (line 104) | def deploy_axis_func(
method deploy_func_between_two_axis_partitions (line 172) | def deploy_func_between_two_axis_partitions(
method wait (line 234) | def wait(self):
class PandasOnDaskDataframeColumnPartition (line 241) | class PandasOnDaskDataframeColumnPartition(PandasOnDaskDataframeVirtualP...
class PandasOnDaskDataframeRowPartition (line 246) | class PandasOnDaskDataframeRowPartition(PandasOnDaskDataframeVirtualPart...
function _deploy_dask_func (line 250) | def _deploy_dask_func(
FILE: modin/core/execution/dispatching/factories/dispatcher.py
class FactoryNotFoundError (line 30) | class FactoryNotFoundError(AttributeError):
class StubIoEngine (line 40) | class StubIoEngine(object):
method __init__ (line 54) | def __init__(self, factory_name=""):
method __getattr__ (line 57) | def __getattr__(self, name):
class StubFactory (line 79) | class StubFactory(factories.BaseFactory):
method set_failing_name (line 91) | def set_failing_name(cls, factory_name):
class FactoryDispatcher (line 104) | class FactoryDispatcher(object):
method get_factory (line 115) | def get_factory(cls) -> factories.BaseFactory:
method _get_prepared_factory_for_backend (line 129) | def _get_prepared_factory_for_backend(cls, backend) -> factories.BaseF...
method _update_factory (line 175) | def _update_factory(cls, *args):
method from_pandas (line 188) | def from_pandas(
method from_arrow (line 215) | def from_arrow(cls, at):
method from_non_pandas (line 220) | def from_non_pandas(cls, *args, **kwargs):
method from_interchange_dataframe (line 225) | def from_interchange_dataframe(cls, *args, **kwargs):
method from_ray (line 230) | def from_ray(cls, ray_obj):
method from_dask (line 235) | def from_dask(cls, dask_obj):
method from_map (line 240) | def from_map(cls, func, iterable, *args, **kwargs):
method read_parquet (line 245) | def read_parquet(cls, **kwargs):
method read_csv (line 250) | def read_csv(cls, **kwargs):
method read_csv_glob (line 255) | def read_csv_glob(cls, **kwargs):
method read_pickle_glob (line 260) | def read_pickle_glob(cls, **kwargs):
method read_json (line 265) | def read_json(cls, **kwargs):
method read_gbq (line 270) | def read_gbq(cls, **kwargs):
method read_html (line 275) | def read_html(cls, **kwargs):
method read_clipboard (line 280) | def read_clipboard(cls, **kwargs):
method read_excel (line 285) | def read_excel(cls, **kwargs):
method read_hdf (line 290) | def read_hdf(cls, **kwargs):
method read_feather (line 295) | def read_feather(cls, **kwargs):
method read_stata (line 300) | def read_stata(cls, **kwargs):
method read_sas (line 305) | def read_sas(cls, **kwargs): # pragma: no cover
method read_pickle (line 310) | def read_pickle(cls, **kwargs):
method read_sql (line 315) | def read_sql(cls, **kwargs):
method read_sql_distributed (line 320) | def read_sql_distributed(cls, **kwargs):
method read_fwf (line 325) | def read_fwf(cls, **kwargs):
method read_sql_table (line 330) | def read_sql_table(cls, **kwargs):
method read_sql_query (line 335) | def read_sql_query(cls, **kwargs):
method read_spss (line 340) | def read_spss(cls, **kwargs):
method to_sql (line 345) | def to_sql(cls, *args, **kwargs):
method to_pickle (line 350) | def to_pickle(cls, *args, **kwargs):
method to_pickle_glob (line 355) | def to_pickle_glob(cls, *args, **kwargs):
method read_parquet_glob (line 360) | def read_parquet_glob(cls, *args, **kwargs):
method to_parquet_glob (line 365) | def to_parquet_glob(cls, *args, **kwargs):
method read_json_glob (line 370) | def read_json_glob(cls, *args, **kwargs):
method to_json_glob (line 375) | def to_json_glob(cls, *args, **kwargs):
method read_xml_glob (line 380) | def read_xml_glob(cls, *args, **kwargs):
method to_xml_glob (line 385) | def to_xml_glob(cls, *args, **kwargs):
method read_custom_text (line 390) | def read_custom_text(cls, **kwargs):
method to_csv (line 395) | def to_csv(cls, *args, **kwargs):
method to_json (line 400) | def to_json(cls, *args, **kwargs):
method to_json_series (line 405) | def to_json_series(cls, *args, **kwargs):
method to_xml (line 410) | def to_xml(cls, *args, **kwargs):
method to_parquet (line 415) | def to_parquet(cls, *args, **kwargs):
method to_ray (line 420) | def to_ray(cls, modin_obj):
method to_dask (line 425) | def to_dask(cls, modin_obj):
FILE: modin/core/execution/dispatching/factories/factories.py
class FactoryInfo (line 103) | class FactoryInfo(typing.NamedTuple):
class NotRealFactory (line 122) | class NotRealFactory(Exception):
class BaseFactory (line 133) | class BaseFactory(object):
method get_info (line 137) | def get_info(cls) -> FactoryInfo:
method prepare (line 161) | def prepare(cls):
method _from_pandas (line 171) | def _from_pandas(cls, df):
method _from_arrow (line 181) | def _from_arrow(cls, at):
method _from_non_pandas (line 191) | def _from_non_pandas(cls, *args, **kwargs):
method _from_interchange_dataframe (line 201) | def _from_interchange_dataframe(cls, *args, **kwargs):
method _from_ray (line 211) | def _from_ray(cls, ray_obj):
method _from_dask (line 221) | def _from_dask(cls, dask_obj):
method _from_map (line 225) | def _from_map(cls, func, iterable, *args, **kwargs):
method _read_parquet (line 257) | def _read_parquet(cls, **kwargs):
method _read_csv (line 267) | def _read_csv(cls, **kwargs):
method _read_json (line 277) | def _read_json(cls, **kwargs):
method _read_gbq (line 287) | def _read_gbq(cls, **kwargs):
method _read_html (line 297) | def _read_html(cls, **kwargs):
method _read_clipboard (line 307) | def _read_clipboard(cls, **kwargs): # pragma: no cover
method _read_excel (line 317) | def _read_excel(cls, **kwargs):
method _read_hdf (line 327) | def _read_hdf(cls, **kwargs):
method _read_feather (line 337) | def _read_feather(cls, **kwargs):
method _read_stata (line 347) | def _read_stata(cls, **kwargs):
method _read_sas (line 357) | def _read_sas(cls, **kwargs): # pragma: no cover
method _read_pickle (line 367) | def _read_pickle(cls, **kwargs):
method _read_sql (line 377) | def _read_sql(cls, **kwargs):
method _read_fwf (line 387) | def _read_fwf(cls, **kwargs):
method _read_sql_table (line 397) | def _read_sql_table(cls, **kwargs):
method _read_sql_query (line 407) | def _read_sql_query(cls, **kwargs):
method _read_spss (line 417) | def _read_spss(cls, **kwargs):
method _to_sql (line 421) | def _to_sql(cls, *args, **kwargs):
method _to_pickle (line 435) | def _to_pickle(cls, *args, **kwargs):
method _to_csv (line 449) | def _to_csv(cls, *args, **kwargs):
method _to_json (line 463) | def _to_json(cls, *args, **kwargs):
method _to_json_series (line 477) | def _to_json_series(cls, *args, **kwargs):
method _to_xml (line 491) | def _to_xml(cls, *args, **kwargs):
method _to_parquet (line 505) | def _to_parquet(cls, *args, **kwargs):
method _to_ray (line 519) | def _to_ray(cls, modin_obj):
method _to_dask (line 540) | def _to_dask(cls, modin_obj):
method _read_csv_glob (line 567) | def _read_csv_glob(cls, **kwargs):
method _read_pickle_glob (line 581) | def _read_pickle_glob(cls, **kwargs):
method _read_sql_distributed (line 595) | def _read_sql_distributed(cls, **kwargs):
method _read_custom_text (line 622) | def _read_custom_text(cls, **kwargs):
method _to_pickle_glob (line 631) | def _to_pickle_glob(cls, *args, **kwargs):
method _read_parquet_glob (line 657) | def _read_parquet_glob(cls, **kwargs):
method _to_parquet_glob (line 666) | def _to_parquet_glob(cls, *args, **kwargs):
method _read_json_glob (line 690) | def _read_json_glob(cls, **kwargs):
method _to_json_glob (line 699) | def _to_json_glob(cls, *args, **kwargs):
method _read_xml_glob (line 723) | def _read_xml_glob(cls, **kwargs):
method _to_xml_glob (line 732) | def _to_xml_glob(cls, *args, **kwargs):
class PandasOnRayFactory (line 752) | class PandasOnRayFactory(BaseFactory):
method prepare (line 755) | def prepare(cls):
class PandasOnPythonFactory (line 764) | class PandasOnPythonFactory(BaseFactory):
method prepare (line 767) | def prepare(cls):
class PandasOnDaskFactory (line 776) | class PandasOnDaskFactory(BaseFactory):
method prepare (line 779) | def prepare(cls):
class PandasOnUnidistFactory (line 788) | class PandasOnUnidistFactory(BaseFactory):
method prepare (line 791) | def prepare(cls):
class NativeIO (line 799) | class NativeIO(BaseIO):
class NativeOnNativeFactory (line 812) | class NativeOnNativeFactory(BaseFactory):
method prepare (line 816) | def prepare(cls):
FILE: modin/core/execution/modin_aqp.py
function call_progress_bar (line 32) | def call_progress_bar(result_parts, line_no):
function display_time_updates (line 119) | def display_time_updates(bar):
function _show_time_updates (line 131) | def _show_time_updates(p_bar):
function progress_bar_wrapper (line 146) | def progress_bar_wrapper(f):
FILE: modin/core/execution/python/common/engine_wrapper.py
class PythonWrapper (line 17) | class PythonWrapper:
method deploy (line 21) | def deploy(cls, func, f_args=None, f_kwargs=None, num_returns=1):
method is_future (line 45) | def is_future(cls, item):
method materialize (line 61) | def materialize(cls, obj_id):
method put (line 80) | def put(cls, data, **kwargs):
FILE: modin/core/execution/python/implementations/pandas_on_python/dataframe/dataframe.py
class PandasOnPythonDataframe (line 26) | class PandasOnPythonDataframe(PandasDataframe):
method engine (line 57) | def engine(self) -> str:
FILE: modin/core/execution/python/implementations/pandas_on_python/io/io.py
class PandasOnPythonIO (line 23) | class PandasOnPythonIO(BaseIO):
FILE: modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py
class PandasOnPythonDataframePartition (line 22) | class PandasOnPythonDataframePartition(PandasDataframePartition):
method __init__ (line 49) | def __init__(self, data, length=None, width=None, call_queue=None):
method get (line 60) | def get(self):
method apply (line 76) | def apply(self, func, *args, **kwargs):
method drain_call_queue (line 125) | def drain_call_queue(self):
method wait (line 131) | def wait(self):
method put (line 140) | def put(cls, obj):
method preprocess_func (line 157) | def preprocess_func(cls, func):
FILE: modin/core/execution/python/implementations/pandas_on_python/partitioning/partition_manager.py
class PandasOnPythonDataframePartitionManager (line 28) | class PandasOnPythonDataframePartitionManager(PandasDataframePartitionMa...
FILE: modin/core/execution/python/implementations/pandas_on_python/partitioning/virtual_partition.py
class PandasOnPythonDataframeAxisPartition (line 24) | class PandasOnPythonDataframeAxisPartition(PandasDataframeAxisPartition):
class PandasOnPythonDataframeColumnPartition (line 52) | class PandasOnPythonDataframeColumnPartition(PandasOnPythonDataframeAxis...
class PandasOnPythonDataframeRowPartition (line 57) | class PandasOnPythonDataframeRowPartition(PandasOnPythonDataframeAxisPar...
FILE: modin/core/execution/ray/common/deferred_execution.py
class DeferredExecution (line 43) | class DeferredExecution:
method __init__ (line 89) | def __init__(
method _flat_args (line 113) | def _flat_args(cls, args: Iterable):
method exec (line 135) | def exec(
method has_result (line 187) | def has_result(self):
method subscribe (line 197) | def subscribe(self):
method unsubscribe (line 209) | def unsubscribe(self):
method _deconstruct (line 214) | def _deconstruct(self) -> Tuple[List["DeferredExecution"], List[Any]]:
method _deconstruct_chain (line 276) | def _deconstruct_chain(
method _deconstruct_list (line 367) | def _deconstruct_list(
method _remote_exec_chain (line 419) | def _remote_exec_chain(num_returns: int, *args: Tuple) -> List[Any]:
method _set_result (line 446) | def _set_result(
method __reduce__ (line 466) | def __reduce__(self):
class MetaList (line 471) | class MetaList:
method __init__ (line 480) | def __init__(self, obj: Union[ray.ObjectID, List]):
method __getitem__ (line 483) | def __getitem__(self, index):
method __setitem__ (line 498) | def __setitem__(self, index, value):
class MetaListHook (line 513) | class MetaListHook(MaterializationHook):
method __init__ (line 525) | def __init__(self, meta: MetaList, idx: int):
method pre_materialize (line 529) | def pre_materialize(self):
method post_materialize (line 540) | def post_materialize(self, materialized):
class _Tag (line 556) | class _Tag(Enum): # noqa: PR01
class _RemoteExecutor (line 573) | class _RemoteExecutor:
method exec_func (line 577) | def exec_func(fn: Callable, obj: Any, args: Tuple, kwargs: Dict) -> Any:
method construct (line 610) | def construct(cls, num_returns: int, args: Tuple): # pragma: no cover
method construct_chain (line 650) | def construct_chain(
method construct_list (line 728) | def construct_list(
method __reduce__ (line 769) | def __reduce__(self):
function remote_exec_func (line 785) | def remote_exec_func(
function _remote_exec_single_chain (line 822) | def _remote_exec_single_chain(
function _remote_exec_multi_chain (line 843) | def _remote_exec_multi_chain(
FILE: modin/core/execution/ray/common/engine_wrapper.py
function _deploy_ray_func (line 33) | def _deploy_ray_func(func, *args, return_pandas_df=None, **kwargs): # p...
class RayWrapper (line 59) | class RayWrapper:
method deploy (line 65) | def deploy(
method is_future (line 96) | def is_future(cls, item):
method materialize (line 113) | def materialize(cls, obj_id):
method put (line 182) | def put(cls, data, **kwargs):
method wait (line 214) | def wait(cls, obj_ids, num_returns=None):
class SignalActor (line 241) | class SignalActor: # pragma: no cover
method __init__ (line 253) | def __init__(self, event_count: int):
method send (line 256) | def send(self, event_idx: int):
method wait (line 266) | async def wait(self, event_idx: int):
method is_set (line 276) | def is_set(self, event_idx: int) -> bool:
class MaterializationHook (line 291) | class MaterializationHook:
method pre_materialize (line 294) | def pre_materialize(self):
method post_materialize (line 304) | def post_materialize(self, materialized):
method __reduce__ (line 320) | def __reduce__(self):
FILE: modin/core/execution/ray/common/utils.py
function initialize_ray (line 55) | def initialize_ray(
function _get_object_store_memory (line 165) | def _get_object_store_memory() -> Optional[int]:
function deserialize (line 225) | def deserialize(obj): # pragma: no cover
FILE: modin/core/execution/ray/generic/io/io.py
class RayIO (line 19) | class RayIO(BaseIO):
method from_ray (line 23) | def from_ray(cls, ray_obj):
method to_ray (line 47) | def to_ray(cls, modin_obj):
FILE: modin/core/execution/ray/generic/partitioning/partition_manager.py
class GenericRayDataframePartitionManager (line 24) | class GenericRayDataframePartitionManager(PandasDataframePartitionManager):
method to_numpy (line 28) | def to_numpy(cls, partitions, **kwargs):
FILE: modin/core/execution/ray/implementations/pandas_on_ray/dataframe/dataframe.py
class PandasOnRayDataframe (line 23) | class PandasOnRayDataframe(PandasDataframe):
method _get_lengths (line 49) | def _get_lengths(self, parts, axis):
method engine (line 73) | def engine(self) -> str:
FILE: modin/core/execution/ray/implementations/pandas_on_ray/io/io.py
class PandasOnRayIO (line 68) | class PandasOnRayIO(RayIO):
method __make_read (line 81) | def __make_read(*classes, build_args=build_args):
method __make_write (line 85) | def __make_write(*classes, build_args=build_args):
method _to_csv_check_support (line 142) | def _to_csv_check_support(kwargs):
method to_csv (line 179) | def to_csv(cls, qc, **kwargs):
method from_ray (line 273) | def from_ray(cls, ray_obj):
method to_ray (line 291) | def to_ray(cls, modin_obj):
method from_map (line 309) | def from_map(cls, func, iterable, *args, **kwargs):
FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py
class PandasOnRayDataframePartition (line 35) | class PandasOnRayDataframePartition(PandasDataframePartition):
method __init__ (line 58) | def __init__(
method __del__ (line 94) | def __del__(self):
method apply (line 99) | def apply(self, func: Union[Callable, ray.ObjectRef], *args, **kwargs):
method add_to_apply_calls (line 130) | def add_to_apply_calls(
method drain_call_queue (line 145) | def drain_call_queue(self):
method wait (line 160) | def wait(self):
method __copy__ (line 164) | def __copy__(self):
method mask (line 179) | def mask(self, row_labels, col_labels):
method put (line 218) | def put(cls, obj: pandas.DataFrame):
method preprocess_func (line 235) | def preprocess_func(cls, func):
method length (line 251) | def length(self, materialize=True):
method width (line 278) | def width(self, materialize=True):
method ip (line 305) | def ip(self, materialize=True):
method _data (line 328) | def _data(self) -> ray.ObjectRef: # noqa: GL08
method _length_cache (line 333) | def _length_cache(self): # noqa: GL08
method _length_cache (line 337) | def _length_cache(self, value): # noqa: GL08
method _width_cache (line 341) | def _width_cache(self): # noqa: GL08
method _width_cache (line 345) | def _width_cache(self, value): # noqa: GL08
method _ip_cache (line 349) | def _ip_cache(self): # noqa: GL08
method _ip_cache (line 353) | def _ip_cache(self, value): # noqa: GL08
function _get_index_and_columns (line 358) | def _get_index_and_columns(df): # pragma: no cover
function _configure_lazy_exec (line 383) | def _configure_lazy_exec(cls: LazyExecution):
class SlicerHook (line 419) | class SlicerHook(MaterializationHook):
method __init__ (line 431) | def __init__(self, ref: ObjectIDType, slc: slice):
method pre_materialize (line 435) | def pre_materialize(self):
method post_materialize (line 452) | def post_materialize(self, materialized):
FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py
class PandasOnRayDataframePartitionManager (line 35) | class PandasOnRayDataframePartitionManager(GenericRayDataframePartitionM...
method wait_partitions (line 46) | def wait_partitions(cls, partitions):
method split_pandas_df_into_partitions (line 65) | def split_pandas_df_into_partitions(
function _make_wrapped_method (line 123) | def _make_wrapped_method(name: str):
FILE: modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py
class PandasOnRayDataframeVirtualPartition (line 30) | class PandasOnRayDataframeVirtualPartition(PandasDataframeAxisPartition):
method _get_deploy_axis_func (line 62) | def _get_deploy_axis_func(cls): # noqa: GL08
method _get_deploy_split_func (line 70) | def _get_deploy_split_func(cls): # noqa: GL08
method _get_drain_func (line 78) | def _get_drain_func(cls): # noqa: GL08
method list_of_ips (line 84) | def list_of_ips(self):
method deploy_splitting_func (line 102) | def deploy_splitting_func(
method deploy_axis_func (line 132) | def deploy_axis_func(
method deploy_func_between_two_axis_partitions (line 202) | def deploy_func_between_two_axis_partitions(
method wait (line 262) | def wait(self):
class PandasOnRayDataframeColumnPartition (line 270) | class PandasOnRayDataframeColumnPartition(PandasOnRayDataframeVirtualPar...
class PandasOnRayDataframeRowPartition (line 275) | class PandasOnRayDataframeRowPartition(PandasOnRayDataframeVirtualPartit...
function _deploy_ray_func (line 280) | def _deploy_ray_func(
FILE: modin/core/execution/unidist/common/engine_wrapper.py
function _deploy_unidist_func (line 27) | def _deploy_unidist_func(
class UnidistWrapper (line 55) | class UnidistWrapper:
method deploy (line 59) | def deploy(
method is_future (line 90) | def is_future(cls, item):
method materialize (line 107) | def materialize(cls, obj_id):
method put (line 124) | def put(cls, data, **kwargs):
method wait (line 143) | def wait(cls, obj_ids, num_returns=None):
class SignalActor (line 165) | class SignalActor: # pragma: no cover
method __init__ (line 179) | def __init__(self, event_count: int):
method send (line 182) | def send(self, event_idx: int):
method wait (line 192) | async def wait(self, event_idx: int):
FILE: modin/core/execution/unidist/common/utils.py
function initialize_unidist (line 24) | def initialize_unidist():
function deserialize (line 48) | def deserialize(obj): # pragma: no cover
FILE: modin/core/execution/unidist/generic/io/io.py
class UnidistIO (line 19) | class UnidistIO(BaseIO):
FILE: modin/core/execution/unidist/generic/partitioning/partition_manager.py
class GenericUnidistDataframePartitionManager (line 24) | class GenericUnidistDataframePartitionManager(PandasDataframePartitionMa...
method to_numpy (line 28) | def to_numpy(cls, partitions, **kwargs):
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe/dataframe.py
class PandasOnUnidistDataframe (line 22) | class PandasOnUnidistDataframe(PandasDataframe):
method support_materialization_in_worker_process (line 48) | def support_materialization_in_worker_process(self) -> bool:
method engine (line 54) | def engine(self) -> str:
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/io/io.py
class PandasOnUnidistIO (line 62) | class PandasOnUnidistIO(UnidistIO):
method __make_read (line 75) | def __make_read(*classes, build_args=build_args):
method __make_write (line 79) | def __make_write(*classes, build_args=build_args):
method _to_csv_check_support (line 136) | def _to_csv_check_support(kwargs):
method to_csv (line 173) | def to_csv(cls, qc, **kwargs):
method from_map (line 265) | def from_map(cls, func, iterable, *args, **kwargs):
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py
class PandasOnUnidistDataframePartition (line 30) | class PandasOnUnidistDataframePartition(PandasDataframePartition):
method __init__ (line 50) | def __init__(self, data, length=None, width=None, ip=None, call_queue=...
method apply (line 69) | def apply(self, func, *args, **kwargs):
method drain_call_queue (line 109) | def drain_call_queue(self):
method wait (line 152) | def wait(self):
method mask (line 157) | def mask(self, row_labels, col_labels):
method put (line 196) | def put(cls, obj):
method preprocess_func (line 213) | def preprocess_func(cls, func):
method length (line 229) | def length(self, materialize=True):
method width (line 257) | def width(self, materialize=True):
method ip (line 285) | def ip(self, materialize=True):
function _get_index_and_columns_size (line 312) | def _get_index_and_columns_size(df): # pragma: no cover
function _apply_func (line 332) | def _apply_func(partition, func, *args, **kwargs): # pragma: no cover
function _apply_list_of_funcs (line 383) | def _apply_list_of_funcs(call_queue, partition): # pragma: no cover
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition_manager.py
class PandasOnUnidistDataframePartitionManager (line 29) | class PandasOnUnidistDataframePartitionManager(GenericUnidistDataframePa...
method wait_partitions (line 39) | def wait_partitions(cls, partitions):
function _make_wrapped_method (line 55) | def _make_wrapped_method(name: str):
FILE: modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py
class PandasOnUnidistDataframeVirtualPartition (line 31) | class PandasOnUnidistDataframeVirtualPartition(PandasDataframeAxisPartit...
method _get_deploy_axis_func (line 63) | def _get_deploy_axis_func(cls): # noqa: GL08
method _get_deploy_split_func (line 71) | def _get_deploy_split_func(cls): # noqa: GL08
method _get_drain_func (line 79) | def _get_drain_func(cls): # noqa: GL08
method list_of_ips (line 85) | def list_of_ips(self):
method deploy_splitting_func (line 103) | def deploy_splitting_func(
method deploy_axis_func (line 131) | def deploy_axis_func(
method deploy_func_between_two_axis_partitions (line 198) | def deploy_func_between_two_axis_partitions(
method wait (line 255) | def wait(self):
class PandasOnUnidistDataframeColumnPartition (line 263) | class PandasOnUnidistDataframeColumnPartition(PandasOnUnidistDataframeVi...
class PandasOnUnidistDataframeRowPartition (line 268) | class PandasOnUnidistDataframeRowPartition(PandasOnUnidistDataframeVirtu...
function _deploy_unidist_func (line 273) | def _deploy_unidist_func(
FILE: modin/core/execution/utils.py
function set_env (line 23) | def set_env(**environ):
function remote_function (line 39) | def remote_function(func, ignore_defaults=False):
function remote_function (line 62) | def remote_function(func, ignore_defaults=False):
function remote_function (line 67) | def remote_function(func, ignore_defaults=False):
function remote_function (line 73) | def remote_function(func, ignore_defaults=False): # noqa: F811
FILE: modin/core/io/column_stores/column_store_dispatcher.py
class ColumnStoreDispatcher (line 30) | class ColumnStoreDispatcher(FileDispatcher):
method call_deploy (line 38) | def call_deploy(cls, fname, col_partitions, **kwargs):
method build_partition (line 74) | def build_partition(cls, partition_ids, row_lengths, column_widths):
method build_index (line 108) | def build_index(cls, partition_ids):
method build_columns (line 149) | def build_columns(cls, columns, num_row_parts=None):
method build_dtypes (line 193) | def build_dtypes(cls, partition_ids, columns):
method build_query_compiler (line 214) | def build_query_compiler(cls, path, columns, **kwargs):
FILE: modin/core/io/column_stores/feather_dispatcher.py
class FeatherDispatcher (line 23) | class FeatherDispatcher(ColumnStoreDispatcher):
method _read (line 27) | def _read(cls, path, columns=None, **kwargs):
FILE: modin/core/io/column_stores/hdf_dispatcher.py
class HDFDispatcher (line 21) | class HDFDispatcher(ColumnStoreDispatcher): # pragma: no cover
method _validate_hdf_format (line 30) | def _validate_hdf_format(cls, path_or_buf):
method _read (line 54) | def _read(cls, path_or_buf, **kwargs):
FILE: modin/core/io/column_stores/parquet_dispatcher.py
class ColumnStoreDataset (line 42) | class ColumnStoreDataset:
method __init__ (line 66) | def __init__(self, path, storage_options): # noqa : PR01
method pandas_metadata (line 74) | def pandas_metadata(self):
method columns (line 79) | def columns(self):
method engine (line 84) | def engine(self):
method files (line 89) | def files(self):
method row_groups_per_file (line 94) | def row_groups_per_file(self):
method fs (line 99) | def fs(self):
method fs_path (line 116) | def fs_path(self):
method to_pandas_dataframe (line 132) | def to_pandas_dataframe(self, columns):
method _get_files (line 143) | def _get_files(self, files):
class PyArrowDataset (line 180) | class PyArrowDataset(ColumnStoreDataset):
method _init_dataset (line 181) | def _init_dataset(self): # noqa: GL08
method pandas_metadata (line 187) | def pandas_metadata(self):
method columns (line 191) | def columns(self):
method engine (line 195) | def engine(self):
method row_groups_per_file (line 199) | def row_groups_per_file(self):
method files (line 212) | def files(self):
method to_pandas_dataframe (line 216) | def to_pandas_dataframe(
class FastParquetDataset (line 228) | class FastParquetDataset(ColumnStoreDataset):
method _init_dataset (line 229) | def _init_dataset(self): # noqa: GL08
method pandas_metadata (line 235) | def pandas_metadata(self):
method columns (line 241) | def columns(self):
method engine (line 245) | def engine(self):
method row_groups_per_file (line 249) | def row_groups_per_file(self):
method files (line 262) | def files(self):
method to_pandas_dataframe (line 265) | def to_pandas_dataframe(self, columns):
method _get_fastparquet_files (line 274) | def _get_fastparquet_files(self): # noqa: GL08
class ParquetDispatcher (line 298) | class ParquetDispatcher(ColumnStoreDispatcher):
method get_dataset (line 304) | def get_dataset(cls, path, engine, storage_options):
method _determine_partitioning (line 350) | def _determine_partitioning(
method call_deploy (line 424) | def call_deploy(
method build_partition (line 480) | def build_partition(cls, partition_ids, column_widths):
method build_index (line 518) | def build_index(cls, dataset, partition_ids, index_columns, filters):
method _normalize_partitioning (line 630) | def _normalize_partitioning(cls, remote_parts, row_lengths, column_wid...
method build_query_compiler (line 739) | def build_query_compiler(cls, dataset, columns, index_columns, **kwargs):
method _read (line 804) | def _read(cls, path, engine, columns, use_nullable_dtypes, dtype_backe...
method write (line 912) | def write(cls, qc, **kwargs):
FILE: modin/core/io/file_dispatcher.py
class OpenFile (line 35) | class OpenFile:
method __init__ (line 69) | def __init__(self, file_path, mode="rb", compression="infer", **kwargs):
method __enter__ (line 75) | def __enter__(self):
method __exit__ (line 104) | def __exit__(self, *args):
class FileDispatcher (line 116) | class FileDispatcher(ClassLogger, modin_layer="CORE-IO", log_level=LogLe...
method read (line 136) | def read(cls, *args, **kwargs):
method _read (line 177) | def _read(cls, *args, **kwargs):
method get_path (line 193) | def get_path(cls, file_path):
method file_size (line 219) | def file_size(cls, f):
method file_exists (line 240) | def file_exists(cls, file_path, storage_options=None):
method deploy (line 293) | def deploy(cls, func, *args, num_returns=1, **kwargs): # noqa: PR01
method parse (line 301) | def parse(self, func, args, num_returns): # noqa: PR01
method materialize (line 310) | def materialize(cls, obj_id): # noqa: PR01
method build_partition (line 319) | def build_partition(cls, partition_ids, row_lengths, column_widths):
method _file_not_found_msg (line 353) | def _file_not_found_msg(cls, filename: str): # noqa: GL08
FILE: modin/core/io/io.py
class BaseIO (line 48) | class BaseIO:
method _maybe_warn_on_default (line 56) | def _maybe_warn_on_default(cls, *, message: str = "", reason: str = ""...
method from_non_pandas (line 71) | def from_non_pandas(cls, *args, **kwargs):
method from_pandas (line 85) | def from_pandas(cls, df) -> BaseQueryCompiler:
method from_arrow (line 102) | def from_arrow(cls, at):
method from_interchange_dataframe (line 119) | def from_interchange_dataframe(cls, df):
method from_ray (line 136) | def from_ray(cls, ray_obj):
method from_dask (line 160) | def from_dask(cls, dask_obj):
method from_map (line 184) | def from_map(cls, func, iterable, *args, **kwargs):
method read_parquet (line 218) | def read_parquet(cls, **kwargs): # noqa: PR01
method read_csv (line 229) | def read_csv(
method read_json (line 252) | def read_json(
method read_gbq (line 266) | def read_gbq(
method read_html (line 310) | def read_html(
method read_clipboard (line 358) | def read_clipboard(cls, sep=r"\s+", **kwargs): # pragma: no cover # n...
method read_excel (line 370) | def read_excel(cls, **kwargs): # noqa: PR01
method read_hdf (line 394) | def read_hdf(
method read_feather (line 439) | def read_feather(
method read_stata (line 459) | def read_stata(
method read_sas (line 474) | def read_sas(
method read_pickle (line 505) | def read_pickle(
method read_sql (line 525) | def read_sql(
method read_fwf (line 565) | def read_fwf(
method read_sql_table (line 606) | def read_sql_table(
method read_sql_query (line 640) | def read_sql_query(
method read_spss (line 662) | def read_spss(
method to_sql (line 677) | def to_sql(
method to_pickle (line 713) | def to_pickle(
method to_csv (line 734) | def to_csv(cls, obj, **kwargs): # noqa: PR01
method to_json (line 748) | def to_json(cls, obj, path, **kwargs): # noqa: PR01
method to_json_series (line 762) | def to_json_series(cls, obj, path, **kwargs): # noqa: PR01
method to_xml (line 776) | def to_xml(cls, obj, path_or_buffer, **kwargs): # noqa: PR01
method to_parquet (line 792) | def to_parquet(cls, obj, path, **kwargs): # noqa: PR01
method to_ray (line 805) | def to_ray(cls, modin_obj):
method to_dask (line 829) | def to_dask(cls, modin_obj):
FILE: modin/core/io/sql/sql_dispatcher.py
class SQLDispatcher (line 32) | class SQLDispatcher(FileDispatcher):
method _is_supported_sqlalchemy_object (line 36) | def _is_supported_sqlalchemy_object(cls, obj): # noqa: GL08
method _read (line 47) | def _read(cls, sql, con, index_col=None, **kwargs):
method write (line 132) | def write(cls, qc, **kwargs):
FILE: modin/core/io/text/csv_dispatcher.py
class CSVDispatcher (line 19) | class CSVDispatcher(TextFileDispatcher):
FILE: modin/core/io/text/excel_dispatcher.py
class ExcelDispatcher (line 31) | class ExcelDispatcher(TextFileDispatcher):
method _read (line 35) | def _read(cls, io, **kwargs):
FILE: modin/core/io/text/fwf_dispatcher.py
class FWFDispatcher (line 21) | class FWFDispatcher(TextFileDispatcher):
method check_parameters_support (line 25) | def check_parameters_support(
FILE: modin/core/io/text/json_dispatcher.py
class JSONDispatcher (line 27) | class JSONDispatcher(TextFileDispatcher):
method _read (line 31) | def _read(cls, path_or_buf, **kwargs):
FILE: modin/core/io/text/text_file_dispatcher.py
class TextFileDispatcher (line 43) | class TextFileDispatcher(FileDispatcher):
method get_path_or_buffer (line 47) | def get_path_or_buffer(cls, filepath_or_buffer):
method build_partition (line 84) | def build_partition(cls, partition_ids, row_lengths, column_widths):
method pathlib_or_pypath (line 118) | def pathlib_or_pypath(cls, filepath_or_buffer):
method offset (line 150) | def offset(
method partitioned_file (line 207) | def partitioned_file(
method _read_rows (line 351) | def _read_rows(
method compute_newline (line 422) | def compute_newline(cls, file_like, encoding, quotechar):
method rows_skipper_builder (line 477) | def rows_skipper_builder(
method _define_header_size (line 518) | def _define_header_size(
method _define_metadata (line 549) | def _define_metadata(
method preprocess_func (line 603) | def preprocess_func(cls): # noqa: RT01
method _launch_tasks (line 610) | def _launch_tasks(
method check_parameters_support (line 651) | def check_parameters_support(
method _validate_usecols_arg (line 730) | def _validate_usecols_arg(cls, usecols):
method _manage_skiprows_parameter (line 753) | def _manage_skiprows_parameter(
method _define_index (line 856) | def _define_index(
method _get_new_qc (line 891) | def _get_new_qc(
method _read (line 988) | def _read(cls, filepath_or_buffer, **kwargs):
method _get_skip_mask (line 1171) | def _get_skip_mask(cls, rows_index: pandas.Index, skiprows: Callable):
method _uses_inferred_column_names (line 1201) | def _uses_inferred_column_names(names, skiprows, skipfooter, usecols):
FILE: modin/core/io/text/utils.py
class CustomNewlineIterator (line 19) | class CustomNewlineIterator:
method __init__ (line 31) | def __init__(self, _file, newline):
method __iter__ (line 36) | def __iter__(self):
method seek (line 61) | def seek(self):
FILE: modin/core/storage_formats/base/doc_utils.py
function add_deprecation_warning (line 40) | def add_deprecation_warning(replacement_method):
function add_refer_to (line 60) | def add_refer_to(method):
function doc_qc_method (line 78) | def doc_qc_method(
function doc_binary_method (line 138) | def doc_binary_method(operation, sign, self_on_right=False, op_type="ari...
function doc_reduce_agg (line 218) | def doc_reduce_agg(method, refer_to, params=None, extra_params=None):
function doc_resample_reduce (line 334) | def doc_resample_reduce(result, refer_to, params=None, compatibility_par...
function doc_resample_agg (line 391) | def doc_resample_agg(action, output, refer_to, params=None):
function doc_resample_fillna (line 442) | def doc_resample_fillna(method, refer_to, params=None, overwrite_templat...
function doc_window_method (line 543) | def doc_window_method(
function doc_groupby_method (line 635) | def doc_groupby_method(result, refer_to, action=None):
FILE: modin/core/storage_formats/base/query_compiler.py
function _get_axis (line 72) | def _get_axis(axis):
function _set_axis (line 93) | def _set_axis(axis):
class QCCoercionCost (line 116) | class QCCoercionCost(IntEnum): # noqa: PR01
method validate_coersion_cost (line 143) | def validate_coersion_cost(cls, cost: QCCoercionCost):
class BaseQueryCompiler (line 162) | class BaseQueryCompiler(
method _maybe_warn_on_default (line 210) | def _maybe_warn_on_default(cls, *, message: str = "", reason: str = ""...
method get_backend (line 225) | def get_backend(self) -> str:
method storage_format (line 243) | def storage_format(self) -> str:
method engine (line 256) | def engine(self) -> str:
method __wrap_in_qc (line 267) | def __wrap_in_qc(self, obj):
method default_to_pandas (line 290) | def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
method move_to_cost (line 324) | def move_to_cost(
method _stay_cost_rows (line 376) | def _stay_cost_rows(
method stay_cost (line 410) | def stay_cost(
method move_to_me_cost (line 459) | def move_to_me_cost(
method _engine_max_size (line 509) | def _engine_max_size(cls) -> int:
method _transfer_threshold (line 514) | def _transfer_threshold(cls) -> int:
method max_cost (line 520) | def max_cost(cls) -> int:
method _max_shape (line 542) | def _max_shape(self) -> tuple[int, int]:
method lazy_shape (line 558) | def lazy_shape(self):
method add_prefix (line 573) | def add_prefix(self, prefix, axis=1):
method add_suffix (line 593) | def add_suffix(self, suffix, axis=1):
method copy (line 617) | def copy(self):
method concat (line 638) | def concat(self, axis, other, **kwargs): # noqa: PR02
method free (line 700) | def free(self):
method finalize (line 705) | def finalize(self):
method execute (line 710) | def execute(self):
method support_materialization_in_worker_process (line 714) | def support_materialization_in_worker_process(self) -> bool:
method move_to (line 727) | def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]:
method move_from (line 745) | def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCom...
method to_pandas (line 766) | def to_pandas(self):
method from_pandas (line 779) | def from_pandas(cls, df, data_cls):
method from_arrow (line 803) | def from_arrow(cls, at, data_cls):
method to_numpy (line 826) | def to_numpy(self, **kwargs): # noqa: PR02
method do_array_ufunc_implementation (line 850) | def do_array_ufunc_implementation(
method do_array_function_implementation (line 922) | def do_array_function_implementation(
method to_interchange_dataframe (line 974) | def to_interchange_dataframe(
method from_interchange_dataframe (line 1005) | def from_interchange_dataframe(cls, df: ProtocolDataframe, data_cls):
method to_list (line 1026) | def to_list(self):
method dataframe_to_dict (line 1039) | def dataframe_to_dict(self, orient="dict", into=dict, index=True): # ...
method series_to_dict (line 1050) | def series_to_dict(self, into=dict): # noqa: PR01
method align (line 1067) | def align(self, other, **kwargs):
method add (line 1091) | def add(self, other, **kwargs): # noqa: PR02
method combine (line 1095) | def combine(self, other, **kwargs): # noqa: PR02
method combine_first (line 1126) | def combine_first(self, other, **kwargs): # noqa: PR02
method eq (line 1148) | def eq(self, other, **kwargs): # noqa: PR02
method series_eq (line 1154) | def series_eq(self, other, **kwargs): # noqa: PR02
method equals (line 1164) | def equals(self, other): # noqa: PR01, RT01
method floordiv (line 1168) | def floordiv(self, other, **kwargs): # noqa: PR02
method divmod (line 1174) | def divmod(self, other, **kwargs):
method ge (line 1198) | def ge(self, other, **kwargs): # noqa: PR02
method series_ge (line 1206) | def series_ge(self, other, **kwargs): # noqa: PR02
method gt (line 1218) | def gt(self, other, **kwargs): # noqa: PR02
method series_gt (line 1224) | def series_gt(self, other, **kwargs): # noqa: PR02
method le (line 1236) | def le(self, other, **kwargs): # noqa: PR02
method series_le (line 1244) | def series_le(self, other, **kwargs): # noqa: PR02
method lt (line 1256) | def lt(self, other, **kwargs): # noqa: PR02
method series_lt (line 1262) | def series_lt(self, other, **kwargs): # noqa: PR02
method mod (line 1272) | def mod(self, other, **kwargs): # noqa: PR02
method mul (line 1276) | def mul(self, other, **kwargs): # noqa: PR02
method rmul (line 1282) | def rmul(self, other, **kwargs): # noqa: PR02
method corr (line 1288) | def corr(self, **kwargs): # noqa: PR02
method series_corr (line 1311) | def series_corr(self, **kwargs): # noqa: PR01
method corrwith (line 1326) | def corrwith(self, **kwargs): # noqa: PR01
method cov (line 1337) | def cov(self, **kwargs): # noqa: PR02
method dot (line 1354) | def dot(self, other, **kwargs): # noqa: PR02
method ne (line 1383) | def ne(self, other, **kwargs): # noqa: PR02
method series_ne (line 1389) | def series_ne(self, other, **kwargs): # noqa: PR02
method pow (line 1399) | def pow(self, other, **kwargs): # noqa: PR02
method radd (line 1403) | def radd(self, other, **kwargs): # noqa: PR02
method rdivmod (line 1409) | def rdivmod(self, other, **kwargs):
method rfloordiv (line 1435) | def rfloordiv(self, other, **kwargs): # noqa: PR02
method rmod (line 1441) | def rmod(self, other, **kwargs): # noqa: PR02
method rpow (line 1449) | def rpow(self, other, **kwargs): # noqa: PR02
method rsub (line 1455) | def rsub(self, other, **kwargs): # noqa: PR02
method rtruediv (line 1461) | def rtruediv(self, other, **kwargs): # noqa: PR02
method sub (line 1467) | def sub(self, other, **kwargs): # noqa: PR02
method truediv (line 1471) | def truediv(self, other, **kwargs): # noqa: PR02
method __and__ (line 1477) | def __and__(self, other, **kwargs): # noqa: PR02
method __or__ (line 1483) | def __or__(self, other, **kwargs): # noqa: PR02
method __rand__ (line 1491) | def __rand__(self, other, **kwargs): # noqa: PR02
method __ror__ (line 1499) | def __ror__(self, other, **kwargs): # noqa: PR02
method __rxor__ (line 1507) | def __rxor__(self, other, **kwargs): # noqa: PR02
method __xor__ (line 1513) | def __xor__(self, other, **kwargs): # noqa: PR02
method df_update (line 1521) | def df_update(self, other, **kwargs): # noqa: PR02
method series_update (line 1555) | def series_update(self, other, **kwargs): # noqa: PR02
method asfreq (line 1580) | def asfreq(self, **kwargs): # noqa: PR01
method clip (line 1594) | def clip(self, lower, upper, **kwargs): # noqa: PR02
method where (line 1620) | def where(self, cond, other, **kwargs): # noqa: PR02
method merge (line 1650) | def merge(self, right, **kwargs): # noqa: PR02
method merge_ordered (line 1682) | def merge_ordered(self, right, **kwargs): # noqa: PR01
method _get_column_as_pandas_series (line 1692) | def _get_column_as_pandas_series(self, key):
method merge_asof (line 1713) | def merge_asof(
method join (line 1831) | def join(self, right, **kwargs): # noqa: PR02
method transpose (line 1857) | def transpose(self, *args, **kwargs): # noqa: PR02
method columnarize (line 1879) | def columnarize(self):
method is_series_like (line 1902) | def is_series_like(self):
method reindex (line 1917) | def reindex(self, axis, labels, **kwargs): # noqa: PR02
method reset_index (line 1946) | def reset_index(self, **kwargs): # noqa: PR02
method set_index_from_columns (line 1972) | def set_index_from_columns(
method is_monotonic_increasing (line 2006) | def is_monotonic_increasing(self):
method is_monotonic_decreasing (line 2016) | def is_monotonic_decreasing(self):
method count (line 2029) | def count(self, **kwargs): # noqa: PR02
method max (line 2035) | def max(self, **kwargs): # noqa: PR02
method mean (line 2041) | def mean(self, **kwargs): # noqa: PR02
method min (line 2047) | def min(self, **kwargs): # noqa: PR02
method prod (line 2056) | def prod(self, **kwargs): # noqa: PR02
method sum (line 2065) | def sum(self, **kwargs): # noqa: PR02
method mask (line 2069) | def mask(self, cond, other, **kwargs): # noqa: PR01
method pct_change (line 2083) | def pct_change(self, **kwargs): # noqa: PR01
method to_datetime (line 2094) | def to_datetime(self, *args, **kwargs):
method abs (line 2114) | def abs(self):
method map (line 2125) | def map(self, func, *args, **kwargs):
method conj (line 2148) | def conj(self, **kwargs):
method interpolate (line 2172) | def interpolate(self, **kwargs): # noqa: PR01
method isin (line 2188) | def isin(self, values, ignore_indices=False, **kwargs): # noqa: PR02
method isna (line 2217) | def isna(self):
method negative (line 2230) | def negative(self, **kwargs):
method notna (line 2249) | def notna(self):
method round (line 2262) | def round(self, **kwargs): # noqa: PR02
method replace (line 2284) | def replace(self, **kwargs): # noqa: PR02
method argsort (line 2308) | def argsort(self, **kwargs): # noqa: PR02
method series_view (line 2340) | def series_view(self, **kwargs): # noqa: PR02
method to_numeric (line 2367) | def to_numeric(self, *args, **kwargs): # noqa: PR02
method to_timedelta (line 2389) | def to_timedelta(self, unit="ns", errors="raise"): # noqa: PR02
method unique (line 2410) | def unique(self, keep="first", ignore_index=True, subset=None):
method searchsorted (line 2439) | def searchsorted(self, **kwargs): # noqa: PR02
method stack (line 2461) | def stack(self, level, dropna, sort):
method astype (line 2483) | def astype(self, col_dtypes, errors: str = "raise"): # noqa: PR02
method infer_objects (line 2505) | def infer_objects(self):
method convert_dtypes (line 2520) | def convert_dtypes(
method dtypes (line 2568) | def dtypes(self):
method all (line 2591) | def all(self, **kwargs): # noqa: PR02
method any (line 2615) | def any(self, **kwargs): # noqa: PR02
method first_valid_index (line 2638) | def first_valid_index(self):
method idxmax (line 2653) | def idxmax(self, **kwargs): # noqa: PR02
method idxmin (line 2674) | def idxmin(self, **kwargs): # noqa: PR02
method last_valid_index (line 2694) | def last_valid_index(self):
method median (line 2711) | def median(self, **kwargs): # noqa: PR02
method memory_usage (line 2715) | def memory_usage(self, **kwargs): # noqa: PR02
method sizeof (line 2735) | def sizeof(self):
method nunique (line 2754) | def nunique(self, **kwargs): # noqa: PR02
method quantile_for_single_value (line 2767) | def quantile_for_single_value(self, **kwargs): # noqa: PR02
method skew (line 2773) | def skew(self, **kwargs): # noqa: PR02
method sem (line 2781) | def sem(self, **kwargs): # noqa: PR02
method std (line 2789) | def std(self, **kwargs): # noqa: PR02
method var (line 2795) | def var(self, **kwargs): # noqa: PR02
method describe (line 2801) | def describe(self, percentiles: np.ndarray):
method cumsum (line 2827) | def cumsum(self, fold_axis, **kwargs): # noqa: PR02
method cummax (line 2831) | def cummax(self, fold_axis, **kwargs): # noqa: PR02
method cummin (line 2835) | def cummin(self, fold_axis, **kwargs): # noqa: PR02
method cumprod (line 2839) | def cumprod(self, fold_axis, **kwargs): # noqa: PR02
method diff (line 2843) | def diff(self, **kwargs): # noqa: PR02
method dropna (line 2862) | def dropna(self, **kwargs): # noqa: PR02
method duplicated (line 2883) | def duplicated(self, **kwargs):
method nlargest (line 2900) | def nlargest(self, n=5, columns=None, keep="first"):
method nsmallest (line 2925) | def nsmallest(self, n=5, columns=None, keep="first"):
method rowwise_query (line 2950) | def rowwise_query(self, expr, **kwargs):
method eval (line 2969) | def eval(self, expr, **kwargs):
method mode (line 2988) | def mode(self, **kwargs): # noqa: PR02
method fillna (line 3008) | def fillna(self, **kwargs): # noqa: PR02
method rank (line 3042) | def rank(self, **kwargs): # noqa: PR02
method sort_index (line 3069) | def sort_index(self, **kwargs): # noqa: PR02
method melt (line 3095) | def melt(self, *args, **kwargs): # noqa: PR02
method sort_columns_by_row_values (line 3120) | def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): ...
method sort_rows_by_column_values (line 3147) | def sort_rows_by_column_values(
method quantile_for_list_of_values (line 3203) | def quantile_for_list_of_values(self, **kwargs): # noqa: PR02
method getitem_array (line 3209) | def getitem_array(self, key):
method getitem_column_array (line 3232) | def getitem_column_array(self, key, numeric=False, ignore_order=False):
method getitem_row_array (line 3260) | def getitem_row_array(self, key):
method lookup (line 3280) | def lookup(self, row_labels, col_labels): # noqa: PR01, RT01, D200
method insert (line 3293) | def insert(self, loc, column, value):
method setitem_bool (line 3325) | def setitem_bool(self, row_loc, col_loc, item):
method drop (line 3359) | def drop(self, index=None, columns=None, errors: str = "raise"):
method apply (line 3389) | def apply(self, func, axis, raw=False, result_type=None, *args, **kwar...
method apply_on_series (line 3440) | def apply_on_series(self, func, *args, **kwargs):
method explode (line 3466) | def explode(self, column):
method groupby_count (line 3500) | def groupby_count(
method groupby_any (line 3524) | def groupby_any(
method groupby_idxmin (line 3548) | def groupby_idxmin(
method groupby_idxmax (line 3566) | def groupby_idxmax(
method groupby_min (line 3582) | def groupby_min(
method groupby_prod (line 3602) | def groupby_prod(
method groupby_max (line 3624) | def groupby_max(
method groupby_all (line 3648) | def groupby_all(
method groupby_sum (line 3668) | def groupby_sum(
method groupby_size (line 3692) | def groupby_size(
method groupby_rolling (line 3717) | def groupby_rolling(
method groupby_agg (line 3779) | def groupby_agg(
method groupby_mean (line 3847) | def groupby_mean(
method groupby_skew (line 3869) | def groupby_skew(
method groupby_cumcount (line 3906) | def groupby_cumcount(
method groupby_cumsum (line 3930) | def groupby_cumsum(
method groupby_cummax (line 3954) | def groupby_cummax(
method groupby_cummin (line 3978) | def groupby_cummin(
method groupby_cumprod (line 4002) | def groupby_cumprod(
method groupby_std (line 4024) | def groupby_std(
method groupby_sem (line 4046) | def groupby_sem(
method groupby_rank (line 4068) | def groupby_rank(
method groupby_var (line 4090) | def groupby_var(
method groupby_corr (line 4112) | def groupby_corr(
method groupby_cov (line 4134) | def groupby_cov(
method groupby_nunique (line 4158) | def groupby_nunique(
method groupby_median (line 4180) | def groupby_median(
method groupby_quantile (line 4204) | def groupby_quantile(
method groupby_fillna (line 4228) | def groupby_fillna(
method groupby_diff (line 4247) | def groupby_diff(
method groupby_pct_change (line 4260) | def groupby_pct_change(
method groupby_dtypes (line 4276) | def groupby_dtypes(
method groupby_get_group (line 4300) | def groupby_get_group(
method groupby_shift (line 4324) | def groupby_shift(
method groupby_first (line 4348) | def groupby_first(
method groupby_last (line 4372) | def groupby_last(
method groupby_head (line 4396) | def groupby_head(
method groupby_tail (line 4420) | def groupby_tail(
method groupby_nth (line 4444) | def groupby_nth(
method groupby_ngroup (line 4468) | def groupby_ngroup(
method groupby_nlargest (line 4492) | def groupby_nlargest(
method groupby_nsmallest (line 4517) | def groupby_nsmallest(
method groupby_unique (line 4542) | def groupby_unique(
method groupby_ohlc (line 4562) | def groupby_ohlc(
method unstack (line 4594) | def unstack(self, level, fill_value):
method wide_to_long (line 4612) | def wide_to_long(self, **kwargs): # noqa: PR01
method pivot (line 4623) | def pivot(self, index, columns, values):
method pivot_table (line 4643) | def pivot_table(
method get_dummies (line 4691) | def get_dummies(self, columns, **kwargs): # noqa: PR02
method repeat (line 4720) | def repeat(self, repeats):
method cut (line 4739) | def cut(
method get_axis (line 4775) | def get_axis(self, axis):
method get_axis_len (line 4791) | def get_axis_len(self, axis: Literal[0, 1]) -> int:
method take_2d_labels (line 4809) | def take_2d_labels(
method get_positions_from_labels (line 4844) | def get_positions_from_labels(self, row_loc, col_loc):
method take_2d_positional (line 4958) | def take_2d_positional(self, index=None, columns=None):
method insert_item (line 4982) | def insert_item(self, axis, loc, value, how="inner", replace=False):
method setitem (line 5026) | def setitem(self, axis, key, value):
method write_items (line 5056) | def write_items(
method __constructor__ (line 5115) | def __constructor__(self) -> type[Self]:
method delitem (line 5129) | def delitem(self, key):
method has_multiindex (line 5147) | def has_multiindex(self, axis=0):
method frame_has_materialized_dtypes (line 5167) | def frame_has_materialized_dtypes(self) -> bool:
method frame_has_materialized_columns (line 5178) | def frame_has_materialized_columns(self) -> bool:
method frame_has_materialized_index (line 5189) | def frame_has_materialized_index(self) -> bool:
method set_frame_dtypes_cache (line 5199) | def set_frame_dtypes_cache(self, dtypes):
method set_frame_index_cache (line 5209) | def set_frame_index_cache(self, index):
method set_frame_columns_cache (line 5219) | def set_frame_columns_cache(self, index):
method frame_has_index_cache (line 5230) | def frame_has_index_cache(self):
method frame_has_columns_cache (line 5241) | def frame_has_columns_cache(self):
method frame_has_dtypes_cache (line 5252) | def frame_has_dtypes_cache(self) -> bool:
method get_index_name (line 5262) | def get_index_name(self, axis=0):
method set_index_name (line 5278) | def set_index_name(self, name, axis=0):
method get_index_names (line 5291) | def get_index_names(self, axis=0):
method set_index_names (line 5307) | def set_index_names(self, names, axis=0):
method get_dtypes_set (line 5320) | def get_dtypes_set(self):
method between_time (line 5331) | def between_time(self, **kwargs): # noqa: PR01
method shift (line 5343) | def shift(
method tz_convert (line 5354) | def tz_convert(
method tz_localize (line 5396) | def tz_localize(
method dt_ceil (line 5443) | def dt_ceil(self, freq, ambiguous="raise", nonexistent="raise"):
method dt_components (line 5450) | def dt_components(self):
method dt_date (line 5463) | def dt_date(self):
method dt_day (line 5467) | def dt_day(self):
method dt_day_name (line 5473) | def dt_day_name(self, locale=None):
method dt_dayofweek (line 5479) | def dt_dayofweek(self):
method dt_dayofyear (line 5483) | def dt_dayofyear(self):
method dt_days (line 5487) | def dt_days(self):
method dt_days_in_month (line 5495) | def dt_days_in_month(self):
method dt_daysinmonth (line 5499) | def dt_daysinmonth(self):
method dt_end_time (line 5503) | def dt_end_time(self):
method dt_floor (line 5507) | def dt_floor(self, freq, ambiguous="raise", nonexistent="raise"):
method dt_freq (line 5514) | def dt_freq(self):
method dt_unit (line 5526) | def dt_unit(self): # noqa: RT01
method dt_as_unit (line 5530) | def dt_as_unit(self, *args, **kwargs): # noqa: PR01, RT01
method dt_isocalendar (line 5537) | def dt_isocalendar(self):
method dt_hour (line 5541) | def dt_hour(self):
method dt_is_leap_year (line 5548) | def dt_is_leap_year(self):
method dt_is_month_end (line 5555) | def dt_is_month_end(self):
method dt_is_month_start (line 5562) | def dt_is_month_start(self):
method dt_is_quarter_end (line 5569) | def dt_is_quarter_end(self):
method dt_is_quarter_start (line 5576) | def dt_is_quarter_start(self):
method dt_is_year_end (line 5583) | def dt_is_year_end(self):
method dt_is_year_start (line 5590) | def dt_is_year_start(self):
method dt_microsecond (line 5594) | def dt_microsecond(self):
method dt_microseconds (line 5598) | def dt_microseconds(self):
method dt_minute (line 5602) | def dt_minute(self):
method dt_month (line 5606) | def dt_month(self):
method dt_month_name (line 5612) | def dt_month_name(self, locale=None):
method dt_nanosecond (line 5616) | def dt_nanosecond(self):
method dt_nanoseconds (line 5620) | def dt_nanoseconds(self):
method dt_normalize (line 5625) | def dt_normalize(self):
method dt_quarter (line 5637) | def dt_quarter(self):
method dt_qyear (line 5641) | def dt_qyear(self):
method dt_round (line 5645) | def dt_round(self, freq, ambiguous="raise", nonexistent="raise"):
method dt_second (line 5651) | def dt_second(self):
method dt_seconds (line 5655) | def dt_seconds(self):
method dt_start_time (line 5659) | def dt_start_time(self):
method dt_strftime (line 5663) | def dt_strftime(self, date_format):
method dt_time (line 5679) | def dt_time(self):
method dt_timetz (line 5685) | def dt_timetz(self):
method dt_asfreq (line 5689) | def dt_asfreq(self, freq=None, how: str = "E"):
method dt_to_period (line 5714) | def dt_to_period(self, freq=None):
method dt_to_pydatetime (line 5731) | def dt_to_pydatetime(self):
method dt_to_pytimedelta (line 5746) | def dt_to_pytimedelta(self):
method dt_to_timestamp (line 5760) | def dt_to_timestamp(self):
method dt_total_seconds (line 5764) | def dt_total_seconds(self):
method dt_tz (line 5769) | def dt_tz(self):
method dt_tz_convert (line 5782) | def dt_tz_convert(self, tz):
method dt_tz_localize (line 5799) | def dt_tz_localize(self, tz, ambiguous="raise", nonexistent="raise"):
method dt_weekday (line 5819) | def dt_weekday(self):
method dt_year (line 5823) | def dt_year(self):
method first (line 5828) | def first(self, offset: pandas.DateOffset):
method last (line 5847) | def last(self, offset: pandas.DateOffset):
method resample_agg_df (line 5879) | def resample_agg_df(self, resample_kwargs, func, *args, **kwargs):
method resample_agg_ser (line 5891) | def resample_agg_ser(self, resample_kwargs, func, *args, **kwargs):
method resample_app_df (line 5903) | def resample_app_df(self, resample_kwargs, func, *args, **kwargs):
method resample_app_ser (line 5915) | def resample_app_ser(self, resample_kwargs, func, *args, **kwargs):
method resample_asfreq (line 5920) | def resample_asfreq(self, resample_kwargs, fill_value):
method resample_bfill (line 5943) | def resample_bfill(self, resample_kwargs, limit):
method resample_count (line 5951) | def resample_count(self, resample_kwargs):
method resample_ffill (line 5957) | def resample_ffill(self, resample_kwargs, limit):
method resample_fillna (line 5967) | def resample_fillna(self, resample_kwargs, method, limit):
method resample_first (line 5973) | def resample_first(self, resample_kwargs, *args, **kwargs):
method resample_get_group (line 5981) | def resample_get_group(self, resample_kwargs, name, obj):
method resample_interpolate (line 6020) | def resample_interpolate(
method resample_last (line 6046) | def resample_last(self, resample_kwargs, *args, **kwargs):
method resample_max (line 6052) | def resample_max(self, resample_kwargs, *args, **kwargs):
method resample_mean (line 6058) | def resample_mean(self, resample_kwargs, *args, **kwargs):
method resample_median (line 6064) | def resample_median(self, resample_kwargs, *args, **kwargs):
method resample_min (line 6070) | def resample_min(self, resample_kwargs, *args, **kwargs):
method resample_nearest (line 6076) | def resample_nearest(self, resample_kwargs, limit):
method resample_nunique (line 6082) | def resample_nunique(self, resample_kwargs, *args, **kwargs):
method resample_ohlc_df (line 6094) | def resample_ohlc_df(self, resample_kwargs, *args, **kwargs):
method resample_ohlc_ser (line 6104) | def resample_ohlc_ser(self, resample_kwargs, *args, **kwargs):
method resample_pipe (line 6113) | def resample_pipe(self, resample_kwargs, func, *args, **kwargs):
method resample_prod (line 6145) | def resample_prod(self, resample_kwargs, min_count, *args, **kwargs):
method resample_quantile (line 6153) | def resample_quantile(self, resample_kwargs, q, *args, **kwargs):
method resample_sem (line 6162) | def resample_sem(self, resample_kwargs, *args, **kwargs):
method resample_size (line 6170) | def resample_size(self, resample_kwargs, *args, **kwargs):
method resample_std (line 6178) | def resample_std(self, resample_kwargs, ddof, *args, **kwargs):
method resample_sum (line 6188) | def resample_sum(self, resample_kwargs, min_count, *args, **kwargs):
method resample_transform (line 6193) | def resample_transform(self, resample_kwargs, arg, *args, **kwargs):
method resample_var (line 6224) | def resample_var(self, resample_kwargs, ddof, *args, **kwargs):
method str_capitalize (line 6234) | def str_capitalize(self):
method str_center (line 6243) | def str_center(self, width, fillchar=" "):
method str_contains (line 6255) | def str_contains(self, pat, case=True, flags=0, na=None, regex=True):
method str_count (line 6266) | def str_count(self, pat, flags=0):
method str_endswith (line 6275) | def str_endswith(self, pat, na=None):
method str_find (line 6285) | def str_find(self, sub, start=0, end=None):
method str_findall (line 6294) | def str_findall(self, pat, flags=0):
method str_fullmatch (line 6305) | def str_fullmatch(self, pat, case=True, flags=0, na=None):
method str_get (line 6311) | def str_get(self, i):
method str_get_dummies (line 6315) | def str_get_dummies(self, sep):
method str_index (line 6325) | def str_index(self, sub, start=0, end=None):
method str_isalnum (line 6329) | def str_isalnum(self):
method str_isalpha (line 6333) | def str_isalpha(self):
method str_isdecimal (line 6337) | def str_isdecimal(self):
method str_isdigit (line 6341) | def str_isdigit(self):
method str_islower (line 6345) | def str_islower(self):
method str_isnumeric (line 6349) | def str_isnumeric(self):
method str_isspace (line 6353) | def str_isspace(self):
method str_istitle (line 6357) | def str_istitle(self):
method str_isupper (line 6361) | def str_isupper(self):
method str_join (line 6365) | def str_join(self, sep):
method str_len (line 6369) | def str_len(self):
method str_ljust (line 6378) | def str_ljust(self, width, fillchar=" "):
method str_lower (line 6382) | def str_lower(self):
method str_lstrip (line 6386) | def str_lstrip(self, to_strip=None):
method str_match (line 6397) | def str_match(self, pat, case=True, flags=0, na=None):
method str_extract (line 6407) | def str_extract(self, pat, flags=0, expand=True):
method str_extractall (line 6416) | def str_extractall(self, pat, flags=0):
method str_normalize (line 6422) | def str_normalize(self, form):
method str_pad (line 6432) | def str_pad(self, width, side="left", fillchar=" "):
method str_partition (line 6441) | def str_partition(self, sep=" ", expand=True):
method str_removeprefix (line 6445) | def str_removeprefix(self, prefix):
method str_removesuffix (line 6449) | def str_removesuffix(self, suffix):
method str_repeat (line 6453) | def str_repeat(self, repeats):
method str_replace (line 6466) | def str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
method str_rfind (line 6478) | def str_rfind(self, sub, start=0, end=None):
method str_rindex (line 6488) | def str_rindex(self, sub, start=0, end=None):
method str_rjust (line 6497) | def str_rjust(self, width, fillchar=" "):
method str_rpartition (line 6506) | def str_rpartition(self, sep=" ", expand=True):
method str_rsplit (line 6516) | def str_rsplit(self, pat=None, *, n=-1, expand=False):
method str_rstrip (line 6522) | def str_rstrip(self, to_strip=None):
method str_slice (line 6532) | def str_slice(self, start=None, stop=None, step=None):
method str_slice_replace (line 6542) | def str_slice_replace(self, start=None, stop=None, repl=None):
method str_split (line 6555) | def str_split(self, pat=None, *, n=-1, expand=False, regex=None):
method str_startswith (line 6566) | def str_startswith(self, pat, na=None):
method str_strip (line 6570) | def str_strip(self, to_strip=None):
method str_swapcase (line 6574) | def str_swapcase(self):
method str_title (line 6578) | def str_title(self):
method str_translate (line 6582) | def str_translate(self, table):
method str_upper (line 6586) | def str_upper(self):
method str_wrap (line 6595) | def str_wrap(self, width, **kwargs):
method str_zfill (line 6599) | def str_zfill(self, width):
method str___getitem__ (line 6603) | def str___getitem__(self, key):
method str_encode (line 6612) | def str_encode(self, encoding, errors):
method str_decode (line 6622) | def str_decode(self, encoding, errors, dtype):
method str_cat (line 6635) | def str_cat(self, others, sep=None, na_rep=None, join="left"):
method str_casefold (line 6644) | def str_casefold(self):
method rolling_aggregate (line 6666) | def rolling_aggregate(self, fold_axis, rolling_kwargs, func, *args, **...
method rolling_apply (line 6690) | def rolling_apply(
method rolling_corr (line 6715) | def rolling_corr(
method rolling_count (line 6725) | def rolling_count(self, fold_axis, rolling_kwargs):
method rolling_cov (line 6740) | def rolling_cov(
method rolling_kurt (line 6753) | def rolling_kurt(self, fold_axis, rolling_kwargs, **kwargs):
method rolling_max (line 6766) | def rolling_max(self, fold_axis, rolling_kwargs, *args, **kwargs):
method rolling_mean (line 6779) | def rolling_mean(self, fold_axis, rolling_kwargs, *args, **kwargs):
method rolling_median (line 6790) | def rolling_median(self, fold_axis, rolling_kwargs, **kwargs):
method rolling_min (line 6803) | def rolling_min(self, fold_axis, rolling_kwargs, *args, **kwargs):
method rolling_quantile (line 6817) | def rolling_quantile(
method rolling_skew (line 6830) | def rolling_skew(self, fold_axis, rolling_kwargs, **kwargs):
method rolling_std (line 6844) | def rolling_std(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwar...
method rolling_sum (line 6857) | def rolling_sum(self, fold_axis, rolling_kwargs, *args, **kwargs):
method rolling_sem (line 6870) | def rolling_sem(self, fold_axis, rolling_kwargs, *args, **kwargs):
method rolling_var (line 6884) | def rolling_var(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwar...
method rolling_rank (line 6901) | def rolling_rank(
method expanding_aggregate (line 6939) | def expanding_aggregate(self, fold_axis, expanding_args, func, *args, ...
method expanding_sum (line 6953) | def expanding_sum(self, fold_axis, expanding_args, *args, **kwargs):
method expanding_min (line 6967) | def expanding_min(self, fold_axis, expanding_args, *args, **kwargs):
method expanding_max (line 6981) | def expanding_max(self, fold_axis, expanding_args, *args, **kwargs):
method expanding_mean (line 6995) | def expanding_mean(self, fold_axis, expanding_args, *args, **kwargs):
method expanding_median (line 7011) | def expanding_median(
method expanding_var (line 7039) | def expanding_var(self, fold_axis, expanding_args, ddof=1, *args, **kw...
method expanding_std (line 7054) | def expanding_std(self, fold_axis, expanding_args, ddof=1, *args, **kw...
method expanding_corr (line 7073) | def expanding_corr(
method expanding_cov (line 7121) | def expanding_cov(
method expanding_count (line 7165) | def expanding_count(self, fold_axis, expanding_args, ddof=1, *args, **...
method expanding_quantile (line 7180) | def expanding_quantile(
method expanding_sem (line 7198) | def expanding_sem(
method expanding_skew (line 7214) | def expanding_skew(self, fold_axis, expanding_args, numeric_only=False...
method expanding_kurt (line 7228) | def expanding_kurt(self, fold_axis, expanding_args, numeric_only=False...
method expanding_rank (line 7246) | def expanding_rank(
method window_mean (line 7281) | def window_mean(self, fold_axis, window_kwargs, *args, **kwargs):
method window_std (line 7296) | def window_std(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs):
method window_sum (line 7310) | def window_sum(self, fold_axis, window_kwargs, *args, **kwargs):
method window_var (line 7325) | def window_var(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs):
method cat_codes (line 7336) | def cat_codes(self):
method list_flatten (line 7354) | def list_flatten(self):
method list_len (line 7366) | def list_len(self):
method list__getitem__ (line 7378) | def list__getitem__(self, key): # noqa: PR01
method struct_dtypes (line 7394) | def struct_dtypes(self):
method struct_field (line 7406) | def struct_field(self, name_or_index): # noqa: PR01
method struct_explode (line 7420) | def struct_explode(self):
method invert (line 7434) | def invert(self):
method kurt (line 7448) | def kurt(self, axis, numeric_only=False, skipna=True, **kwargs):
method compare (line 7457) | def compare(self, other, align_axis, keep_shape, keep_equal, result_na...
method case_when (line 7487) | def case_when(self, caselist): # noqa: PR01, RT01, D200
method get_pandas_backend (line 7502) | def get_pandas_backend(self) -> Optional[str]:
method repartition (line 7513) | def repartition(self, axis=None):
FILE: modin/core/storage_formats/base/query_compiler_calculator.py
function all_switchable_backends (line 35) | def all_switchable_backends() -> list[str]:
class AggregatedBackendData (line 57) | class AggregatedBackendData:
method __init__ (line 69) | def __init__(self, backend: str, qc_cls: type[BaseQueryCompiler]):
class BackendCostCalculator (line 76) | class BackendCostCalculator:
method __init__ (line 97) | def __init__(
method calculate (line 141) | def calculate(self) -> str:
method _add_cost_data (line 290) | def _add_cost_data(self, backend, cost):
method _calc_result_log (line 306) | def _calc_result_log(self, selected_backend: str) -> str:
FILE: modin/core/storage_formats/pandas/aggregations.py
class CorrCovBuilder (line 31) | class CorrCovBuilder:
class Method (line 34) | class Method(Enum):
method build_corr_method (line 41) | def build_corr_method(
method build_cov_method (line 105) | def build_cov_method(
method _build_map_reduce_methods (line 119) | def _build_map_reduce_methods(
class _CorrCovKernels (line 151) | class _CorrCovKernels:
method map (line 155) | def map(cls, df: pandas.DataFrame, numeric_only: bool) -> pandas.DataF...
method _compute_non_nan_aggs (line 227) | def _compute_non_nan_aggs(
method _compute_nan_aggs (line 252) | def _compute_nan_aggs(
method reduce (line 323) | def reduce(
method _maybe_combine_nan_and_non_nan_aggs (line 401) | def _maybe_combine_nan_and_non_nan_aggs(
method _build_corr_table_nan (line 513) | def _build_corr_table_nan(
method _build_corr_table_non_nan (line 558) | def _build_corr_table_non_nan(
FILE: modin/core/storage_formats/pandas/groupby.py
class GroupbyReduceImpl (line 26) | class GroupbyReduceImpl:
method get_impl (line 30) | def get_impl(cls, agg_name):
method has_impl_for (line 48) | def has_impl_for(cls, agg_func):
method build_qc_method (line 75) | def build_qc_method(cls, agg_name, finalizer_fn=None):
method _build_skew_impl (line 116) | def _build_skew_impl():
method _build_mean_impl (line 185) | def _build_mean_impl():
class PivotTableImpl (line 251) | class PivotTableImpl:
method map_reduce_impl (line 255) | def map_reduce_impl(
method full_axis_impl (line 313) | def full_axis_impl(
method range_partition_impl (line 382) | def range_partition_impl(
method _pivot_table_from_groupby (line 444) | def _pivot_table_from_groupby(
method _separate_data_from_grouper (line 483) | def _separate_data_from_grouper(qc, values, unique_keys):
FILE: modin/core/storage_formats/pandas/merge.py
class MergeImpl (line 35) | class MergeImpl:
method range_partitioning_merge (line 39) | def range_partitioning_merge(cls, left, right, kwargs):
method row_axis_merge (line 104) | def row_axis_merge(
method _compute_result_metadata (line 255) | def _compute_result_metadata(
FILE: modin/core/storage_formats/pandas/native_query_compiler.py
function _get_axis (line 47) | def _get_axis(axis):
function _set_axis (line 66) | def _set_axis(axis):
class NativeQueryCompiler (line 93) | class NativeQueryCompiler(BaseQueryCompiler):
method __init__ (line 109) | def __init__(self, pandas_frame):
method execute (line 130) | def execute(self):
method frame_has_materialized_dtypes (line 134) | def frame_has_materialized_dtypes(self) -> bool:
method set_frame_dtypes_cache (line 144) | def set_frame_dtypes_cache(self, dtypes):
method set_frame_index_cache (line 159) | def set_frame_index_cache(self, index):
method frame_has_index_cache (line 175) | def frame_has_index_cache(self):
method frame_has_dtypes_cache (line 186) | def frame_has_dtypes_cache(self) -> bool:
method copy (line 196) | def copy(self):
method to_pandas (line 205) | def to_pandas(self):
method from_pandas (line 213) | def from_pandas(cls, df, data_cls):
method from_arrow (line 217) | def from_arrow(cls, at, data_cls):
method free (line 220) | def free(self):
method finalize (line 223) | def finalize(self):
method move_to (line 226) | def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]:
method move_from (line 230) | def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCom...
method _engine_max_size (line 234) | def _engine_max_size(cls):
method _transfer_threshold (line 241) | def _transfer_threshold(cls):
method do_array_ufunc_implementation (line 247) | def do_array_ufunc_implementation(
method to_interchange_dataframe (line 282) | def to_interchange_dataframe(
method from_interchange_dataframe (line 290) | def from_interchange_dataframe(cls, df: ProtocolDataframe, data_cls):
method support_materialization_in_worker_process (line 295) | def support_materialization_in_worker_process(self) -> bool:
method get_pandas_backend (line 305) | def get_pandas_backend(self) -> Optional[str]:
method repartition (line 324) | def repartition(self, axis=None):
FILE: modin/core/storage_formats/pandas/parsers.py
function _split_result_for_readers (line 100) | def _split_result_for_readers(axis, num_splits, df): # pragma: no cover
function find_common_type_cat (line 131) | def find_common_type_cat(types):
class PandasParser (line 162) | class PandasParser(ClassLogger, modin_layer="PARSER", log_level=LogLevel...
method generic_parse (line 167) | def generic_parse(fname, **kwargs):
method get_dtypes (line 234) | def get_dtypes(cls, dtypes_ids, columns):
method single_worker_read (line 289) | def single_worker_read(cls, fname, *args, reason: str, **kwargs):
method get_types_mapper (line 332) | def get_types_mapper(dtype_backend):
class PandasCSVParser (line 358) | class PandasCSVParser(PandasParser):
method parse (line 361) | def parse(fname, common_read_kwargs, **kwargs):
method read_callback (line 370) | def read_callback(*args, **kwargs):
class PandasFWFParser (line 390) | class PandasFWFParser(PandasParser):
method parse (line 393) | def parse(fname, common_read_kwargs, **kwargs):
method read_callback (line 402) | def read_callback(*args, **kwargs):
class PandasExcelParser (line 422) | class PandasExcelParser(PandasParser):
method get_sheet_data (line 424) | def get_sheet_data(cls, sheet, convert_float):
method _convert_cell (line 446) | def _convert_cell(cls, cell, convert_float):
method need_rich_text_param (line 481) | def need_rich_text_param():
method parse (line 496) | def parse(fname, **kwargs):
class PandasJSONParser (line 648) | class PandasJSONParser(PandasParser):
method parse (line 651) | def parse(fname, **kwargs):
class ParquetFileToRead (line 680) | class ParquetFileToRead(NamedTuple):
class PandasParquetParser (line 700) | class PandasParquetParser(PandasParser):
method _read_row_group_chunk (line 702) | def _read_row_group_chunk(
method parse (line 794) | def parse(files_for_parser, engine, **kwargs):
class PandasHDFParser (line 826) | class PandasHDFParser(PandasParser): # pragma: no cover
method parse (line 833) | def parse(fname, **kwargs):
class PandasFeatherParser (line 844) | class PandasFeatherParser(PandasParser):
method parse (line 851) | def parse(fname, **kwargs):
class PandasSQLParser (line 878) | class PandasSQLParser(PandasParser):
method parse (line 891) | def parse(sql, con, index_col, read_sql_engine, **kwargs):
FILE: modin/core/storage_formats/pandas/query_compiler.py
function _get_axis (line 91) | def _get_axis(axis):
function _set_axis (line 110) | def _set_axis(axis):
function _str_map (line 136) | def _str_map(func_name):
function _dt_prop_map (line 161) | def _dt_prop_map(property_name):
function _dt_func_map (line 197) | def _dt_func_map(func_name):
function copy_df_for_func (line 227) | def copy_df_for_func(func, display_name: str = None):
function _series_logical_binop (line 258) | def _series_logical_binop(func):
class PandasQueryCompiler (line 279) | class PandasQueryCompiler(BaseQueryCompiler):
method __init__ (line 297) | def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[...
method lazy_row_labels (line 305) | def lazy_row_labels(self):
method lazy_row_count (line 318) | def lazy_row_count(self):
method lazy_column_types (line 331) | def lazy_column_types(self):
method lazy_column_labels (line 344) | def lazy_column_labels(self):
method lazy_column_count (line 357) | def lazy_column_count(self):
method stay_cost (line 372) | def stay_cost(self, api_cls_name, operation, arguments):
method finalize (line 375) | def finalize(self):
method execute (line 378) | def execute(self):
method to_pandas (line 382) | def to_pandas(self):
method from_pandas (line 386) | def from_pandas(cls, df, data_cls):
method from_arrow (line 390) | def from_arrow(cls, at, data_cls):
method to_interchange_dataframe (line 395) | def to_interchange_dataframe(
method from_interchange_dataframe (line 403) | def from_interchange_dataframe(cls, df: ProtocolDataframe, data_cls):
method get_axis_len (line 411) | def get_axis_len(self, axis: Literal[0, 1]) -> int:
method dtypes (line 430) | def dtypes(self) -> pandas.Series:
method get_dtypes_set (line 433) | def get_dtypes_set(self):
method add_prefix (line 439) | def add_prefix(self, prefix, axis=1):
method add_suffix (line 449) | def add_suffix(self, suffix, axis=1):
method copy (line 465) | def copy(self):
method concat (line 482) | def concat(self, axis, other, **kwargs):
method free (line 507) | def free(self):
method move_to (line 514) | def move_to(self, target_backend: str) -> Union[BaseQueryCompiler, Any]:
method move_from (line 518) | def move_from(cls, source_qc: BaseQueryCompiler) -> Union[BaseQueryCom...
method to_numpy (line 524) | def to_numpy(self, **kwargs):
method where (line 626) | def where(self, cond, other, **kwargs):
method merge (line 657) | def merge(self, right, **kwargs):
method join (line 669) | def join(self, right: PandasQueryCompiler, **kwargs) -> PandasQueryCom...
method reindex (line 713) | def reindex(self, axis, labels, **kwargs):
method reset_index (line 741) | def reset_index(self, **kwargs) -> PandasQueryCompiler:
method set_index_from_columns (line 921) | def set_index_from_columns(
method transpose (line 966) | def transpose(self, *args, **kwargs) -> PandasQueryCompiler:
method is_series_like (line 970) | def is_series_like(self):
method _dtypes_sum (line 978) | def _dtypes_sum(dtypes: pandas.Series, *func_args, **func_kwargs): # ...
method memory_usage (line 996) | def memory_usage(self, **kwargs):
method max (line 1013) | def max(self, axis, **kwargs):
method min (line 1025) | def min(self, axis, **kwargs):
method mean (line 1037) | def mean(self, axis, **kwargs):
method median (line 1104) | def median(self, axis, **kwargs):
method nunique (line 1109) | def nunique(self, axis=0, dropna=True):
method skew (line 1144) | def skew(self, axis, **kwargs):
method kurt (line 1149) | def kurt(self, axis, **kwargs):
method to_datetime (line 1161) | def to_datetime(self, *args, **kwargs):
method _resample_func (line 1177) | def _resample_func(
method resample_get_group (line 1289) | def resample_get_group(self, resample_kwargs, name, obj):
method resample_app_ser (line 1294) | def resample_app_ser(self, resample_kwargs, func, *args, **kwargs):
method resample_app_df (line 1304) | def resample_app_df(self, resample_kwargs, func, *args, **kwargs):
method resample_agg_ser (line 1307) | def resample_agg_ser(self, resample_kwargs, func, *args, **kwargs):
method resample_agg_df (line 1317) | def resample_agg_df(self, resample_kwargs, func, *args, **kwargs):
method resample_transform (line 1322) | def resample_transform(self, resample_kwargs, arg, *args, **kwargs):
method resample_pipe (line 1332) | def resample_pipe(self, resample_kwargs, func, *args, **kwargs):
method resample_ffill (line 1335) | def resample_ffill(self, resample_kwargs, limit):
method resample_bfill (line 1340) | def resample_bfill(self, resample_kwargs, limit):
method resample_nearest (line 1345) | def resample_nearest(self, resample_kwargs, limit):
method resample_fillna (line 1350) | def resample_fillna(self, resample_kwargs, method, limit):
method resample_asfreq (line 1359) | def resample_asfreq(self, resample_kwargs, fill_value):
method resample_interpolate (line 1362) | def resample_interpolate(
method resample_count (line 1387) | def resample_count(self, resample_kwargs):
method resample_nunique (line 1390) | def resample_nunique(self, resample_kwargs, *args, **kwargs):
method resample_first (line 1393) | def resample_first(self, resample_kwargs, *args, **kwargs):
method resample_last (line 1398) | def resample_last(self, resample_kwargs, *args, **kwargs):
method resample_max (line 1403) | def resample_max(self, resample_kwargs, *args, **kwargs):
method resample_mean (line 1406) | def resample_mean(self, resample_kwargs, *args, **kwargs):
method resample_median (line 1409) | def resample_median(self, resample_kwargs, *args, **kwargs):
method resample_min (line 1412) | def resample_min(self, resample_kwargs, *args, **kwargs):
method resample_ohlc_ser (line 1415) | def resample_ohlc_ser(self, resample_kwargs, *args, **kwargs):
method resample_ohlc_df (line 1424) | def resample_ohlc_df(self, resample_kwargs, *args, **kwargs):
method resample_prod (line 1427) | def resample_prod(self, resample_kwargs, min_count, *args, **kwargs):
method resample_size (line 1436) | def resample_size(self, resample_kwargs):
method resample_sem (line 1444) | def resample_sem(self, resample_kwargs, *args, **kwargs):
method resample_std (line 1447) | def resample_std(self, resample_kwargs, ddof, *args, **kwargs):
method resample_sum (line 1450) | def resample_sum(self, resample_kwargs, min_count, *args, **kwargs):
method resample_var (line 1459) | def resample_var(self, resample_kwargs, ddof, *args, **kwargs):
method resample_quantile (line 1462) | def resample_quantile(self, resample_kwargs, q, **kwargs):
method expanding_aggregate (line 1465) | def expanding_aggregate(self, axis, expanding_args, func, *args, **kwa...
method expanding_cov (line 1531) | def expanding_cov(
method expanding_corr (line 1582) | def expanding_corr(
method rolling_corr (line 1792) | def rolling_corr(self, axis, rolling_kwargs, other, pairwise, *args, *...
method rolling_cov (line 1809) | def rolling_cov(self, axis, rolling_kwargs, other, pairwise, ddof, **k...
method rolling_aggregate (line 1826) | def rolling_aggregate(self, axis, rolling_kwargs, func, *args, **kwargs):
method unstack (line 1836) | def unstack(self, level, fill_value):
method stack (line 1988) | def stack(self, level, dropna, sort):
method isin (line 2009) | def isin(self, values, ignore_indices=False):
method convert_dtypes (line 2040) | def convert_dtypes(
method str_partition (line 2157) | def str_partition(self, sep=" ", expand=True):
method str_extract (line 2166) | def str_extract(self, pat, flags, expand):
method str_rpartition (line 2184) | def str_rpartition(self, sep=" ", expand=True):
method str_rsplit (line 2192) | def str_rsplit(self, pat=None, n=-1, expand=False):
method str_split (line 2205) | def str_split(self, pat=None, n=-1, expand=False, regex=None):
method unique (line 2231) | def unique(self, keep="first", ignore_index=True, subset=None):
method searchsorted (line 2272) | def searchsorted(self, **kwargs):
method astype (line 2335) | def astype(self, col_dtypes, errors: str = "raise"):
method infer_objects (line 2345) | def infer_objects(self):
method first_valid_index (line 2350) | def first_valid_index(self):
method last_valid_index (line 2368) | def last_valid_index(self):
method describe (line 2388) | def describe(self, percentiles: np.ndarray):
method diff (line 2435) | def diff(self, axis, periods):
method clip (line 2438) | def clip(self, lower, upper, **kwargs):
method cov (line 2456) | def cov(self, min_periods=None, ddof=1):
method _nancorr (line 2462) | def _nancorr(self, min_periods=1, cov=False, ddof=1):
method dot (line 2546) | def dot(self, other, squeeze_self=None, squeeze_other=None):
method _nsort (line 2605) | def _nsort(self, n, columns=None, keep="first", sort_type="nsmallest"):
method nsmallest (line 2653) | def nsmallest(self, *args, **kwargs):
method nlargest (line 2656) | def nlargest(self, *args, **kwargs):
method eval (line 2659) | def eval(self, expr, **kwargs):
method mode (line 2683) | def mode(self, **kwargs):
method fillna (line 2710) | def fillna(self, **kwargs):
method quantile_for_list_of_values (line 2815) | def quantile_for_list_of_values(self, **kwargs):
method rank (line 2862) | def rank(self, **kwargs):
method sort_index (line 2879) | def sort_index(self, **kwargs):
method melt (line 2918) | def melt(
method setitem_bool (line 3028) | def setitem_bool(self, row_loc: PandasQueryCompiler, col_loc, item):
method __validate_bool_indexer (line 3060) | def __validate_bool_indexer(self, indexer):
method getitem_array (line 3072) | def getitem_array(self, key):
method getitem_column_array (line 3105) | def getitem_column_array(
method getitem_row_array (line 3124) | def getitem_row_array(self, key):
method setitem (line 3129) | def setitem(self, axis, key, value):
method _setitem (line 3143) | def _setitem(self, axis, key, value, how="inner"):
method dropna (line 3249) | def dropna(self, **kwargs):
method drop (line 3326) | def drop(
method duplicated (line 3346) | def duplicated(self, **kwargs):
method insert (line 3394) | def insert(self, loc, column, value):
method _wrap_column_data (line 3440) | def _wrap_column_data(self, data):
method explode (line 3461) | def explode(self, column):
method apply (line 3469) | def apply(self, func, axis, *args, **kwargs):
method apply_on_series (line 3482) | def apply_on_series(self, func, *args, **kwargs):
method _dict_func (line 3499) | def _dict_func(self, func, axis, *args, **kwargs):
method _list_like_func (line 3544) | def _list_like_func(self, func, axis, *args, **kwargs):
method rowwise_query (line 3585) | def rowwise_query(self, expr, **kwargs):
method _callable_func (line 3644) | def _callable_func(self, func, axis, *args, **kwargs):
method _groupby_separate_by (line 3681) | def _groupby_separate_by(self, by, drop):
method groupby_nth (line 3750) | def groupby_nth(
method groupby_mean (line 3767) | def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs,...
method groupby_size (line 3830) | def groupby_size(
method _groupby_dict_reduce (line 3876) | def _groupby_dict_reduce(
method groupby_dtypes (line 3972) | def groupby_dtypes(
method _groupby_shuffle (line 3993) | def _groupby_shuffle(
method groupby_corr (line 4136) | def groupby_corr(
method groupby_cov (line 4160) | def groupby_cov(
method groupby_rolling (line 4184) | def groupby_rolling(
method groupby_agg (line 4236) | def groupby_agg(
method pivot (line 4509) | def pivot(self, index, columns, values):
method pivot_table (line 4559) | def pivot_table(
method get_dummies (line 4643) | def get_dummies(self, columns, **kwargs):
method take_2d_positional (line 4684) | def take_2d_positional(self, index=None, columns=None):
method write_items (line 4691) | def write_items(
method sort_rows_by_column_values (line 4773) | def sort_rows_by_column_values(self, columns, ascending=True, **kwargs):
method sort_columns_by_row_values (line 4779) | def sort_columns_by_row_values(self, rows, ascending=True, **kwargs):
method cat_codes (line 4797) | def cat_codes(self):
method compare (line 4807) | def compare(self, other, **kwargs):
method case_when (line 4818) | def case_when(self, caselist):
FILE: modin/core/storage_formats/pandas/query_compiler_caster.py
function _normalize_class_name (line 56) | def _normalize_class_name(class_of_wrapped_fn: Optional[str]) -> str:
function _get_empty_qc_for_default_backend (line 142) | def _get_empty_qc_for_default_backend() -> BaseQueryCompiler:
class QueryCompilerCaster (line 161) | class QueryCompilerCaster(ABC):
method __init_subclass__ (line 165) | def __init_subclass__(
method _get_query_compiler (line 185) | def _get_query_compiler(self) -> Optional[BaseQueryCompiler]:
method is_backend_pinned (line 198) | def is_backend_pinned(self) -> bool:
method _set_backend_pinned (line 210) | def _set_backend_pinned(self, pinned: bool, inplace: bool) -> Optional...
method pin_backend (line 229) | def pin_backend(self, inplace: bool = False) -> Optional[Self]:
method unpin_backend (line 245) | def unpin_backend(self, inplace: bool = False) -> Optional[Self]:
method get_backend (line 262) | def get_backend(self) -> str:
method set_backend (line 274) | def set_backend(
method move_to (line 304) | def move_to(
method _copy_into (line 316) | def _copy_into(self, other: Self) -> None:
method _get_extension (line 328) | def _get_extension(self, name: str, extensions: EXTENSION_DICT_TYPE) -...
method _getattribute__from_extension_impl (line 353) | def _getattribute__from_extension_impl(
method _getattr__from_extension_impl (line 394) | def _getattr__from_extension_impl(
function visit_nested_args (line 441) | def visit_nested_args(arguments, fn: callable):
function _assert_casting_functions_wrap_same_implementation (line 491) | def _assert_casting_functions_wrap_same_implementation(
function apply_argument_cast_to_class (line 527) | def apply_argument_cast_to_class(klass: type) -> type:
function _maybe_switch_backend_pre_op (line 598) | def _maybe_switch_backend_pre_op(
function _maybe_switch_backend_post_op (line 660) | def _maybe_switch_backend_post_op(
function _get_backend_for_auto_switch (line 735) | def _get_backend_for_auto_switch(
function _get_extension_for_method (line 870) | def _get_extension_for_method(
function wrap_function_in_argument_caster (line 925) | def wrap_function_in_argument_caster(
function wrap_free_function_in_argument_caster (line 1192) | def wrap_free_function_in_argument_caster(name: str) -> callable:
function register_function_for_post_op_switch (line 1222) | def register_function_for_post_op_switch(
function register_function_for_pre_op_switch (line 1243) | def register_function_for_pre_op_switch(
FILE: modin/core/storage_formats/pandas/utils.py
function compute_chunksize (line 28) | def compute_chunksize(axis_len: int, num_splits: int, min_block_size: in...
function split_result_of_axis_func_pandas (line 61) | def split_result_of_axis_func_pandas(
function generate_result_of_axis_func_pandas (line 98) | def generate_result_of_axis_func_pandas(
function get_length_list (line 156) | def get_length_list(axis_len: int, num_splits: int, min_block_size: int)...
function length_fn_pandas (line 185) | def length_fn_pandas(df):
function width_fn_pandas (line 201) | def width_fn_pandas(df):
function get_group_names (line 217) | def get_group_names(regex: "re.Pattern") -> "List[Hashable]":
function merge_partitioning (line 235) | def merge_partitioning(left, right, axis=1):
FILE: modin/db_conn.py
class UnsupportedDatabaseException (line 32) | class UnsupportedDatabaseException(Exception):
class ModinDatabaseConnection (line 38) | class ModinDatabaseConnection:
method __init__ (line 57) | def __init__(self, lib: str, *args: Any, **kwargs: Any) -> None:
method _dialect_is_microsoft_sql (line 66) | def _dialect_is_microsoft_sql(self) -> bool:
method get_connection (line 89) | def get_connection(self) -> Any:
method get_string (line 114) | def get_string(self) -> str:
method column_names_query (line 124) | def column_names_query(self, query: str) -> str:
method row_count_query (line 141) | def row_count_query(self, query: str) -> str:
method partition_query (line 156) | def partition_query(self, query: str, limit: int, offset: int) -> str:
FILE: modin/distributed/dataframe/pandas/partitions.py
function unwrap_partitions (line 58) | def unwrap_partitions(
function from_partitions (line 154) | def from_partitions(
FILE: modin/error_message.py
class ErrorMessage (line 21) | class ErrorMessage(object):
method not_implemented (line 27) | def not_implemented(cls, message: str = "") -> NoReturn:
method single_warning (line 39) | def single_warning(
method default_to_pandas (line 57) | def default_to_pandas(cls, message: str = "", reason: str = "") -> None:
method catch_bugs_and_request_email (line 83) | def catch_bugs_and_request_email(
method non_verified_udf (line 97) | def non_verified_udf(cls) -> None:
method bad_type_for_numpy_op (line 105) | def bad_type_for_numpy_op(cls, function_name: str, operand_type: type)...
method mismatch_with_pandas (line 111) | def mismatch_with_pandas(cls, operation: str, message: str) -> None:
method warn (line 120) | def warn(cls, message: str) -> None:
method not_initialized (line 124) | def not_initialized(cls, engine: str, code: str) -> None:
FILE: modin/experimental/batch/pipeline.py
class PandasQuery (line 30) | class PandasQuery(object):
method __init__ (line 67) | def __init__(
class PandasQueryPipeline (line 88) | class PandasQueryPipeline(object):
method __init__ (line 107) | def __init__(self, df, num_partitions: Optional[int] = None):
method update_df (line 125) | def update_df(self, df):
method add_query (line 142) | def add_query(
method _complete_nodes (line 209) | def _complete_nodes(self, list_of_nodes, partitions):
method compute_batch (line 288) | def compute_batch(
FILE: modin/experimental/core/io/glob/glob_dispatcher.py
class ExperimentalGlobDispatcher (line 27) | class ExperimentalGlobDispatcher(FileDispatcher):
method _read (line 31) | def _read(cls, **kwargs):
method write (line 106) | def write(cls, qc, **kwargs):
FILE: modin/experimental/core/io/sql/sql_dispatcher.py
class ExperimentalSQLDispatcher (line 25) | class ExperimentalSQLDispatcher(SQLDispatcher):
method preprocess_func (line 31) | def preprocess_func(cls): # noqa: RT01
method _read (line 41) | def _read(
FILE: modin/experimental/core/io/sql/utils.py
function is_distributed (line 23) | def is_distributed(partition_column, lower_bound, upper_bound):
function is_table (line 58) | def is_table(engine, sql):
function get_table_metadata (line 77) | def get_table_metadata(engine, table):
function get_table_columns (line 99) | def get_table_columns(metadata):
function build_query_from_table (line 120) | def build_query_from_table(name):
function check_query (line 137) | def check_query(query):
function get_query_columns (line 153) | def get_query_columns(engine, query):
function check_partition_column (line 179) | def check_partition_column(partition_column, cols):
function get_query_info (line 200) | def get_query_info(sql, con, partition_column):
function query_put_bounders (line 235) | def query_put_bounders(query, partition_column, start, end): # pragma: ...
class InvalidArguments (line 262) | class InvalidArguments(Exception):
class InvalidQuery (line 266) | class InvalidQuery(Exception):
class InvalidPartitionColumn (line 270) | class InvalidPartitionColumn(Exception):
function read_sql_with_offset (line 274) | def read_sql_with_offset(
FILE: modin/experimental/core/io/text/csv_glob_dispatcher.py
class ExperimentalCSVGlobDispatcher (line 33) | class ExperimentalCSVGlobDispatcher(CSVDispatcher):
method _read (line 37) | def _read(cls, filepath_or_buffer, **kwargs):
method file_exists (line 265) | def file_exists(cls, file_path: str, storage_options=None) -> bool:
method get_path (line 319) | def get_path(cls, file_path: str, storage_options=None) -> list:
method partitioned_file (line 384) | def partitioned_file(
FILE: modin/experimental/core/io/text/custom_text_dispatcher.py
class ExperimentalCustomTextDispatcher (line 24) | class ExperimentalCustomTextDispatcher(TextFileDispatcher):
method _read (line 28) | def _read(cls, filepath_or_buffer, columns, custom_parser, **kwargs):
FILE: modin/experimental/core/storage_formats/pandas/parsers.py
class ExperimentalPandasCSVGlobParser (line 35) | class ExperimentalPandasCSVGlobParser(PandasCSVParser):
method parse (line 44) | def parse(chunks, **kwargs):
class ExperimentalPandasPickleParser (line 97) | class ExperimentalPandasPickleParser(PandasParser):
method parse (line 100) | def parse(fname, **kwargs):
class ExperimentalPandasParquetParser (line 118) | class ExperimentalPandasParquetParser(PandasParser):
method parse (line 121) | def parse(fname, **kwargs):
class ExperimentalPandasJsonParser (line 136) | class ExperimentalPandasJsonParser(PandasParser):
method parse (line 139) | def parse(fname, **kwargs):
class ExperimentalPandasXmlParser (line 154) | class ExperimentalPandasXmlParser(PandasParser):
method parse (line 157) | def parse(fname, **kwargs):
class ExperimentalCustomTextParser (line 172) | class ExperimentalCustomTextParser(PandasParser):
method parse (line 175) | def parse(fname, **kwargs):
FILE: modin/experimental/pandas/io.py
function read_sql (line 33) | def read_sql(
function read_custom_text (line 124) | def read_custom_text(
function _make_parser_func (line 167) | def _make_parser_func(sep: str, funcname: str) -> Callable:
function _read (line 251) | def _read(**kwargs) -> DataFrame:
function read_pickle_glob (line 306) | def read_pickle_glob(
function to_pickle_glob (line 351) | def to_pickle_glob(
function read_parquet_glob (line 405) | def read_parquet_glob(
function to_parquet_glob (line 450) | def to_parquet_glob(
function read_json_glob (line 489) | def read_json_glob(
function to_json_glob (line 558) | def to_json_glob(
function read_xml_glob (line 606) | def read_xml_glob(
function to_xml_glob (line 666) | def to_xml_glob(
FILE: modin/experimental/sklearn/model_selection/train_test_split.py
function train_test_split (line 18) | def train_test_split(df, **options):
FILE: modin/experimental/spreadsheet/general.py
function from_dataframe (line 19) | def from_dataframe(
function to_dataframe (line 188) | def to_dataframe(spreadsheet):
FILE: modin/experimental/torch/datasets.py
class ModinDataLoader (line 24) | class ModinDataLoader:
method __init__ (line 27) | def __init__(
method __len__ (line 67) | def __len__(self):
method __iter__ (line 71) | def __iter__(self):
method _end_of_batch (line 81) | def _end_of_batch(self, counter: int):
FILE: modin/experimental/xgboost/utils.py
class RabitContextManager (line 23) | class RabitContextManager:
method __init__ (line 39) | def __init__(self, num_workers: int, host_ip):
method __enter__ (line 46) | def __enter__(self):
method __exit__ (line 62) | def __exit__(self, type, value, traceback):
class RabitContext (line 80) | class RabitContext:
method __init__ (line 92) | def __init__(self, actor_rank, args):
method __enter__ (line 96) | def __enter__(self):
method __exit__ (line 105) | def __exit__(self, *args):
FILE: modin/experimental/xgboost/xgboost.py
class DMatrix (line 28) | class DMatrix:
method __init__ (line 60) | def __init__(
method __iter__ (line 106) | def __iter__(self):
method get_dmatrix_params (line 122) | def get_dmatrix_params(self):
method feature_names (line 141) | def feature_names(self):
method feature_names (line 152) | def feature_names(self, feature_names):
method feature_types (line 186) | def feature_types(self):
method feature_types (line 197) | def feature_types(self, feature_types):
method num_row (line 220) | def num_row(self):
method num_col (line 230) | def num_col(self):
method get_float_info (line 240) | def get_float_info(self, name):
method set_info (line 255) | def set_info(
class Booster (line 287) | class Booster(xgb.Booster):
method __init__ (line 304) | def __init__(self, params=None, cache=(), model_file=None): # noqa: MD01
method predict (line 307) | def predict(
function train (line 372) | def train(
FILE: modin/experimental/xgboost/xgboost_ray.py
class ModinXGBoostActor (line 43) | class ModinXGBoostActor:
method __init__ (line 55) | def __init__(self, rank, nthread):
method _get_dmatrix (line 64) | def _get_dmatrix(self, X_y, **dmatrix_kwargs):
method set_train_data (line 96) | def set_train_data(self, *X_y, add_as_eval_method=None, **dmatrix_kwar...
method add_eval_data (line 116) | def add_eval_data(self, *X_y, eval_method, **dmatrix_kwargs):
method train (line 133) | def train(self, rabit_args, params, *args, **kwargs):
function _get_cluster_cpus (line 180) | def _get_cluster_cpus():
function _get_min_cpus_per_node (line 192) | def _get_min_cpus_per_node():
function _get_cpus_per_actor (line 208) | def _get_cpus_per_actor(num_actors):
function _get_num_actors (line 229) | def _get_num_actors(num_actors=None):
function create_actors (line 257) | def create_actors(num_actors):
function _split_data_across_actors (line 297) | def _split_data_across_actors(
function _assign_row_partitions_to_actors (line 332) | def _assign_row_partitions_to_actors(
function _train (line 465) | def _train(
function _map_predict (line 579) | def _map_predict(booster, part, columns, dmatrix_kwargs={}, **kwargs):
function _predict (line 610) | def _predict(
FILE: modin/logging/class_logger.py
class ClassLogger (line 26) | class ClassLogger:
method __init_subclass__ (line 39) | def __init_subclass__(
FILE: modin/logging/config.py
class LogLevel (line 42) | class LogLevel(IntEnum): # noqa: PR01
class ModinFormatter (line 52) | class ModinFormatter(logging.Formatter): # noqa: PR01
method formatTime (line 55) | def formatTime(
function bytes_int_to_str (line 86) | def bytes_int_to_str(num_bytes: int, suffix: str = "B") -> str:
function _create_logger (line 112) | def _create_logger(
function configure_logging (line 164) | def configure_logging() -> None:
function memory_thread (line 203) | def memory_thread(logger: logging.Logger, sleep_time: int) -> None:
function get_logger (line 222) | def get_logger(namespace: str = "modin.logger.default") -> logging.Logger:
FILE: modin/logging/logger_decorator.py
function disable_logging (line 36) | def disable_logging(func: Callable) -> Any:
function enable_logging (line 55) | def enable_logging(modin_layer: Fn) -> Fn:
function enable_logging (line 61) | def enable_logging(
function enable_logging (line 69) | def enable_logging(
FILE: modin/logging/metrics.py
function emit_metric (line 33) | def emit_metric(name: str, value: Union[int, float]) -> None:
function add_metric_handler (line 59) | def add_metric_handler(handler: Callable[[str, Union[int, float]], None]...
function clear_metric_handler (line 70) | def clear_metric_handler(handler: Callable[[str, Union[int, float]], Non...
FILE: modin/numpy/__init__.py
function where (line 91) | def where(condition, x=None, y=None):
FILE: modin/numpy/arr.py
function check_kwargs (line 31) | def check_kwargs(order="C", subok=True, keepdims=None, casting="same_kin...
function check_can_broadcast_to_output (line 60) | def check_can_broadcast_to_output(arr_in: "array", arr_out: "array"):
function fix_dtypes_and_determine_return (line 109) | def fix_dtypes_and_determine_return(
class array (line 141) | class array(object):
method __init__ (line 154) | def __init__(
method __getitem__ (line 239) | def __getitem__(self, key):
method __setitem__ (line 250) | def __setitem__(s
Condensed preview — 681 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,455K chars).
[
{
"path": ".gitattributes",
"chars": 43,
"preview": "* text=auto\nmodin/_version.py export-subst\n"
},
{
"path": ".github/ISSUE_TEMPLATE/bug-report.yaml",
"chars": 2272,
"preview": "name: Bug report\ndescription: Report incorrect behavior in the Modin library\ntitle: 'BUG: '\nlabels: ['bug 🦗', 'Triage 🩹'"
},
{
"path": ".github/ISSUE_TEMPLATE/feature_request.md",
"chars": 351,
"preview": "---\nname: Feature request\nabout: Request a new API or feature implementation\ntitle: ''\nlabels: 'new feature/request 💬, T"
},
{
"path": ".github/ISSUE_TEMPLATE/question.md",
"chars": 115,
"preview": "---\nname: Question\nabout: You want to ask a question\ntitle: ''\nlabels: 'question ❓, Triage 🩹'\nassignees: ''\n\n---\n\n\n"
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 1217,
"preview": "<!--\nThank you for your contribution!\nPlease review the contributing docs: https://modin.readthedocs.io/en/latest/develo"
},
{
"path": ".github/actions/mamba-env/action.yml",
"chars": 1754,
"preview": "name: \"Install environment using Mamba\"\ndescription: \"Prepare the environment to run Modin\"\ninputs:\n python-version:\n "
},
{
"path": ".github/actions/python-only/action.yml",
"chars": 364,
"preview": "name: \"Install Python only\"\ndescription: \"Prepare the environment to run simple tasks\"\ninputs:\n python-version:\n des"
},
{
"path": ".github/actions/run-core-tests/action.yml",
"chars": 866,
"preview": "name: \"Run core Modin tests\"\ndescription: \"Run core Modin tests like dataframe or groupby\"\ninputs:\n runner:\n descrip"
},
{
"path": ".github/actions/run-core-tests/group_1/action.yml",
"chars": 838,
"preview": "name: \"Run core Modin tests - group 1\"\ndescription: \"Run core Modin tests like dataframe or groupby\"\ninputs:\n runner:\n "
},
{
"path": ".github/actions/run-core-tests/group_2/action.yml",
"chars": 1123,
"preview": "name: \"Run core Modin tests - group 2\"\ndescription: \"Run core Modin tests like dataframe or groupby\"\ninputs:\n runner:\n "
},
{
"path": ".github/actions/run-core-tests/group_3/action.yml",
"chars": 1631,
"preview": "name: \"Run core Modin tests - group 3\"\ndescription: \"Run core Modin tests like dataframe or groupby\"\ninputs:\n runner:\n "
},
{
"path": ".github/actions/run-core-tests/group_4/action.yml",
"chars": 1121,
"preview": "name: \"Run core Modin tests - group 4\"\ndescription: \"Run core Modin tests like dataframe or groupby\"\ninputs:\n runner:\n "
},
{
"path": ".github/actions/upload-coverage/action.yml",
"chars": 494,
"preview": "name: Upload Coverage\ndescription: Upload coverage files\n\nruns:\n using: \"composite\"\n\n steps:\n - run: |\n COVE"
},
{
"path": ".github/dependabot.yaml",
"chars": 187,
"preview": "version: 2\nupdates:\n - package-ecosystem: \"github-actions\"\n directory: \"/\"\n schedule:\n interval: \"monthly\"\n "
},
{
"path": ".github/stale.yml",
"chars": 1366,
"preview": "# Number of days of inactivity before an Issue or Pull Request becomes stale\ndaysUntilStale: 365\n\n# Number of days of in"
},
{
"path": ".github/workflows/ci-notebooks.yml",
"chars": 3625,
"preview": "name: ci-notebooks\non:\n pull_request:\n paths:\n - modin/**\n - examples/tutorial/**\n - .github/workflow"
},
{
"path": ".github/workflows/ci-required.yml",
"chars": 4528,
"preview": "name: ci-required\non: pull_request\nconcurrency:\n # Cancel other jobs in the same branch. We don't care whether CI passe"
},
{
"path": ".github/workflows/ci.yml",
"chars": 36794,
"preview": "name: ci\non:\n pull_request:\n paths:\n # NOTE: keep these paths in sync with the paths that trigger the\n # f"
},
{
"path": ".github/workflows/codeql/codeql-config.yml",
"chars": 50,
"preview": "name: \"Modin CodeQL config\"\n\npaths:\n - modin/** \n"
},
{
"path": ".github/workflows/codeql.yml",
"chars": 1116,
"preview": "name: \"CodeQL\"\n\non:\n push:\n branches: [ \"main\" ]\n pull_request:\n branches: [ \"main\" ]\n\nconcurrency:\n # Cancel o"
},
{
"path": ".github/workflows/fuzzydata-test.yml",
"chars": 1743,
"preview": "name: fuzzy\non:\n pull_request:\n paths:\n # NOTE: keep these paths in sync with the paths that trigger the CI Git"
},
{
"path": ".github/workflows/publish-to-pypi.yml",
"chars": 1197,
"preview": "name: Publish Modin wheel to PyPI\n\non:\n schedule:\n - cron: \"42 0 * * WED\"\n push:\n tags: \n - '*'\n wo"
},
{
"path": ".github/workflows/push-to-main.yml",
"chars": 3306,
"preview": "name: push-to-main\non:\n push:\n branches:\n - main\nconcurrency:\n # Cancel other jobs in the same branch. We don'"
},
{
"path": ".github/workflows/sql_server/set_up_sql_server.sh",
"chars": 703,
"preview": "# This script sets up a SQL server listening at 0.0.0.0:1234.\n\n# If any step fails, we can't set up a valid SQL server f"
},
{
"path": ".gitignore",
"chars": 2772,
"preview": "# Byte-compiled / optimized / DLL files\r\n__pycache__/\r\n*.py[cod]\r\n*$py.class\r\n\r\n# C extensions\r\n*.so\r\n\r\n# Distribution /"
},
{
"path": ".readthedocs.yaml",
"chars": 444,
"preview": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html f"
},
{
"path": "CODEOWNERS",
"chars": 223,
"preview": "# These owners will be the default owners for everything in\n# the repo unless a later match takes precedence,\n* @modi"
},
{
"path": "CODE_OF_CONDUCT.md",
"chars": 3349,
"preview": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, w"
},
{
"path": "LICENSE",
"chars": 13196,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "LICENSE_HEADER",
"chars": 782,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "MANIFEST.in",
"chars": 86,
"preview": "include versioneer.py\ninclude modin/_version.py\ninclude modin/tests/pandas/data/*.csv\n"
},
{
"path": "NOTICE",
"chars": 49,
"preview": "Modin\n\nCopyright (c) 2018-2024 Modin Developers.\n"
},
{
"path": "README.md",
"chars": 15988,
"preview": "<p align=\"center\"><a href=\"https://modin.readthedocs.io\"><img width=77% alt=\"\" src=\"https://github.com/modin-project/mod"
},
{
"path": "asv_bench/README.md",
"chars": 4846,
"preview": "# Modin ASV benchmarks\n\n## Here are some scenarios in which [ASV](https://asv.readthedocs.io/en/stable/index.html) can b"
},
{
"path": "asv_bench/asv.conf.dask.json",
"chars": 2186,
"preview": "{\n // The version of the config file format. Do not change, unless\n // you know what you are doing.\n \"version\""
},
{
"path": "asv_bench/asv.conf.json",
"chars": 2185,
"preview": "{\n // The version of the config file format. Do not change, unless\n // you know what you are doing.\n \"version\""
},
{
"path": "asv_bench/asv.conf.unidist.json",
"chars": 2189,
"preview": "{\n // The version of the config file format. Do not change, unless\n // you know what you are doing.\n \"version\""
},
{
"path": "asv_bench/benchmarks/__init__.py",
"chars": 807,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/benchmarks.py",
"chars": 40055,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/io/__init__.py",
"chars": 810,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/io/csv.py",
"chars": 5065,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/io/parquet.py",
"chars": 1776,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/scalability/__init__.py",
"chars": 868,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/scalability/scalability_benchmarks.py",
"chars": 2969,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/utils/__init__.py",
"chars": 1661,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/utils/common.py",
"chars": 15214,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/utils/compatibility.py",
"chars": 2133,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/benchmarks/utils/data_shapes.py",
"chars": 6090,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/test/__init__.py",
"chars": 782,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "asv_bench/test/test_utils.py",
"chars": 2795,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "ci/teamcity/Dockerfile.teamcity-ci",
"chars": 1340,
"preview": "# Create images from this container like this (in modin repo root):\n#\n# git rev-parse HEAD > ci/teamcity/git-rev\n#\n# tar"
},
{
"path": "ci/teamcity/build-docker.py",
"chars": 1007,
"preview": "import os\nimport sys\n\n\ndef execute_command(cmd):\n status = os.system(cmd)\n ec = os.WEXITSTATUS(status)\n if ec !"
},
{
"path": "ci/teamcity/comment_on_pr.py",
"chars": 2699,
"preview": "\"\"\"\nPost the comment like the following to the PR:\n```\n:robot: TeamCity test results bot :robot:\n\n<Logs from pytest>\n```"
},
{
"path": "codecov.yml",
"chars": 164,
"preview": "comment: false\ncoverage:\n status:\n project:\n default:\n branches:\n - main\n target: 85%\n "
},
{
"path": "contributing/contributing.md",
"chars": 3659,
"preview": "# Modin dev onboarding\n\n1. [Set up git](https://docs.github.com/en/get-started/quickstart/set-up-git)\n1. [install anacon"
},
{
"path": "contributing/pre-commit",
"chars": 814,
"preview": "#!/bin/sh\n#\n# Called by \"git commit\" with no arguments. The hook should\n# exit with non-zero status after issuing an ap"
},
{
"path": "docker/Dockerfile",
"chars": 103,
"preview": "FROM continuumio/miniconda3\n\nRUN conda install -c conda-forge psutil setproctitle\nRUN pip install modin"
},
{
"path": "docs/_static/custom.js",
"chars": 540,
"preview": "document.addEventListener(\"DOMContentLoaded\", function () {\n var script = document.createElement(\"script\");\n script.ty"
},
{
"path": "docs/_templates/layout.html",
"chars": 147,
"preview": "{% extends \"!layout.html\" %}\n {% block footer %} {{ super() }}\n\n <style>\n .wy-nav-content { max-width: 65em; }"
},
{
"path": "docs/conf.py",
"chars": 5979,
"preview": "# -*- coding: utf-8 -*-\n#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a s"
},
{
"path": "docs/contact.rst",
"chars": 771,
"preview": "Contact\r\n=======\r\n\r\nSlack\r\n-----\r\n\r\nJoin our `Slack`_ community to connect with Modin users and contributors,\r\ndiscuss, "
},
{
"path": "docs/development/architecture.rst",
"chars": 19776,
"preview": "System Architecture\n===================\n\nIn this section, we will lay out the overall system architecture for\nModin, as "
},
{
"path": "docs/development/contributing.rst",
"chars": 9001,
"preview": "Contributing\n============\n\nGetting Started\n---------------\n\nIf you're interested in getting involved in the development "
},
{
"path": "docs/development/index.rst",
"chars": 289,
"preview": "Development\n===========\n\n.. toctree::\n :maxdepth: 4\n\n contributing\n architecture\n partition_api\n using_pa"
},
{
"path": "docs/development/partition_api.rst",
"chars": 3276,
"preview": "Partition API in Modin\n======================\n\nWhen you are working with a :py:class:`~modin.pandas.dataframe.DataFrame`"
},
{
"path": "docs/development/using_pandas_on_dask.rst",
"chars": 2624,
"preview": "pandas on Dask\n==============\n\nThis section describes usage related documents for the pandas on Dask component of Modin."
},
{
"path": "docs/development/using_pandas_on_mpi.rst",
"chars": 1864,
"preview": "pandas on MPI through unidist\n=============================\n\nThis section describes usage related documents for the pand"
},
{
"path": "docs/development/using_pandas_on_python.rst",
"chars": 1068,
"preview": "pandas on Python\n================\n\nThis section describes usage related documents for the pandas on Python component of "
},
{
"path": "docs/development/using_pandas_on_ray.rst",
"chars": 848,
"preview": "pandas on Ray\n=============\n\nThis section describes usage related documents for the pandas on Ray component of Modin.\n\nM"
},
{
"path": "docs/ecosystem.rst",
"chars": 2389,
"preview": "Ecosystem\n=========\n\nThere is a constantly growing number of users and packages using pandas\nto address their specific n"
},
{
"path": "docs/flow/modin/config.rst",
"chars": 2657,
"preview": ":orphan:\n\nModin Configuration Settings\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nTo adjust Modin's default behavior, you can set the"
},
{
"path": "docs/flow/modin/core/dataframe/algebra.rst",
"chars": 7457,
"preview": ":orphan:\n\nOperators Module Description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nBrief description\n'''''''''''''''''\nMost of the fun"
},
{
"path": "docs/flow/modin/core/dataframe/base/dataframe.rst",
"chars": 1419,
"preview": "ModinDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` is the pare"
},
{
"path": "docs/flow/modin/core/dataframe/base/index.rst",
"chars": 1604,
"preview": "Purpose\n=======\n\nThe :py:class:`~modin.core.dataframe.base.dataframe.dataframe.ModinDataframe` serves the purpose of des"
},
{
"path": "docs/flow/modin/core/dataframe/base/partitioning/axis_partition.rst",
"chars": 1038,
"preview": "BaseDataframeAxisPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is base for any axis partition class and serves as the "
},
{
"path": "docs/flow/modin/core/dataframe/index.rst",
"chars": 2671,
"preview": ":orphan:\n\nCore Modin Dataframe Objects\n============================\n\nModin partitions data to scale efficiently.\nTo keep"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/dataframe.rst",
"chars": 2615,
"preview": "PandasDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n:py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` is a direc"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/index.rst",
"chars": 1375,
"preview": "Modin PandasDataframe Objects\n=============================\n\n``modin.core.dataframe.pandas`` is the package which houses"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/metadata/dtypes.rst",
"chars": 130,
"preview": "ModinDtypes\n\"\"\"\"\"\"\"\"\"\"\"\n\nPublic API\n----------\n\n.. autoclass:: modin.core.dataframe.pandas.metadata.dtypes.ModinDtypes\n "
},
{
"path": "docs/flow/modin/core/dataframe/pandas/metadata/index.rst",
"chars": 126,
"preview": "ModinIndex\n\"\"\"\"\"\"\"\"\"\"\n\nPublic API\n----------\n\n.. autoclass:: modin.core.dataframe.pandas.metadata.index.ModinIndex\n :me"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/partitioning/axis_partition.rst",
"chars": 1329,
"preview": "PandasDataframeAxisPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class implements abstract interface methods from :py:clas"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/partitioning/partition.rst",
"chars": 846,
"preview": "PandasDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is base for any partition class of ``pandas`` storage forma"
},
{
"path": "docs/flow/modin/core/dataframe/pandas/partitioning/partition_manager.rst",
"chars": 2406,
"preview": "PandasDataframePartitionManager\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is base for any partition manager class of ``"
},
{
"path": "docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/dataframe.rst",
"chars": 479,
"preview": "PandasOnDaskDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is the specific implementation of the dataframe algebra for the `"
},
{
"path": "docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/index.rst",
"chars": 4799,
"preview": ":orphan:\n\nPandasOnDask Execution\n======================\n\nQueries that perform data transformation, data ingress or data "
},
{
"path": "docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.rst",
"chars": 1244,
"preview": "PandasOnDaskDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is the specific implementation of :py:class:`~m"
},
{
"path": "docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition_manager.rst",
"chars": 530,
"preview": "PandasOnDaskDataframePartitionManager\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis class is the specific implementation o"
},
{
"path": "docs/flow/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.rst",
"chars": 1056,
"preview": "PandasOnDaskDataframeVirtualPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is the specific implementation of"
},
{
"path": "docs/flow/modin/core/execution/dispatching.rst",
"chars": 3669,
"preview": ":orphan:\n\n..\n TODO: add links to documentation for mentioned modules.\n\nFactories Module Description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""
},
{
"path": "docs/flow/modin/core/execution/python/implementations/pandas_on_python/dataframe.rst",
"chars": 627,
"preview": "PandasOnPythonDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:class:`~modin.core.datafram"
},
{
"path": "docs/flow/modin/core/execution/python/implementations/pandas_on_python/index.rst",
"chars": 4398,
"preview": ":orphan:\n\nPandasOnPython Execution\n========================\n\nQueries that perform data transformation, data ingress or d"
},
{
"path": "docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/axis_partition.rst",
"chars": 1067,
"preview": "PandasOnPythonDataframeAxisPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:c"
},
{
"path": "docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.rst",
"chars": 1299,
"preview": "PandasOnPythonDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:class:`~m"
},
{
"path": "docs/flow/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition_manager.rst",
"chars": 541,
"preview": "PandasOnPythonDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:class:`~m"
},
{
"path": "docs/flow/modin/core/execution/ray/generic.rst",
"chars": 515,
"preview": ":orphan:\n\nGeneric Ray-based members\n=========================\n\nObjects which are storage format agnostic but require spe"
},
{
"path": "docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/dataframe.rst",
"chars": 580,
"preview": "PandasOnRayDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:class:`~modin.core.dataframe.pand"
},
{
"path": "docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/index.rst",
"chars": 4865,
"preview": ":orphan:\n\nPandasOnRay Execution\n=====================\n\nQueries that perform data transformation, data ingress or data eg"
},
{
"path": "docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/axis_partition.rst",
"chars": 1096,
"preview": "PandasOnRayDataframeVirtualPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis class is the specific implementation of "
},
{
"path": "docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.rst",
"chars": 1227,
"preview": "PandasOnRayDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is the specific implementation of :py:class:`~mod"
},
{
"path": "docs/flow/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.rst",
"chars": 510,
"preview": "PandasOnRayDataframePartitionManager\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis class is the specific implementation of "
},
{
"path": "docs/flow/modin/core/execution/unidist/generic.rst",
"chars": 547,
"preview": ":orphan:\n\nGeneric Unidist-based members\n=============================\n\nObjects which are storage format agnostic but req"
},
{
"path": "docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/dataframe.rst",
"chars": 616,
"preview": "PandasOnUnidistDataframe\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is specific implementation of :py:class:`~modin.core.datafr"
},
{
"path": "docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/index.rst",
"chars": 5065,
"preview": ":orphan:\n\nPandasOnUnidist Execution\n=========================\n\nQueries that perform data transformation, data ingress or"
},
{
"path": "docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/axis_partition.rst",
"chars": 1160,
"preview": "PandasOnUnidistDataframeVirtualPartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis class is the specific implementa"
},
{
"path": "docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.rst",
"chars": 1275,
"preview": "PandasOnUnidistDataframePartition\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class is the specific implementation of :py:cla"
},
{
"path": "docs/flow/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition_manager.rst",
"chars": 542,
"preview": "PandasOnUnidistDataframePartitionManager\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis class is the specific implementa"
},
{
"path": "docs/flow/modin/core/io/index.rst",
"chars": 8618,
"preview": ":orphan:\n\nIO Module Description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nDispatcher Classes Workflow Overview\n''''''''''''''''''''''''''''"
},
{
"path": "docs/flow/modin/core/storage_formats/base/query_compiler.rst",
"chars": 4620,
"preview": "BaseQueryCompiler\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nBrief description\n'''''''''''''''''\n:py:class:`~modin.core.storage_formats.base.que"
},
{
"path": "docs/flow/modin/core/storage_formats/index.rst",
"chars": 3517,
"preview": ":orphan:\n\nStorage Formats\n===============\nStorage format is one of the components that form Modin's execution, it descri"
},
{
"path": "docs/flow/modin/core/storage_formats/pandas/index.rst",
"chars": 589,
"preview": ":orphan:\n\nPandas storage format\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n.. toctree::\n :hidden:\n\n query_compiler\n parsers\n\nHigh-L"
},
{
"path": "docs/flow/modin/core/storage_formats/pandas/parsers.rst",
"chars": 1072,
"preview": "Pandas Parsers Module Description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\nHigh-Level Module Overview\n'''''''''''''''''''''''''"
},
{
"path": "docs/flow/modin/core/storage_formats/pandas/query_compiler.rst",
"chars": 1459,
"preview": "PandasQueryCompiler\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n:py:class:`~modin.core.storage_formats.pandas.query_compiler.PandasQueryCompiler"
},
{
"path": "docs/flow/modin/distributed/dataframe/pandas.rst",
"chars": 760,
"preview": "Pandas partitioning API\n=======================\n\nThis page contains a description of the API to extract partitions from "
},
{
"path": "docs/flow/modin/experimental/batch.rst",
"chars": 282,
"preview": "Batch Pipeline API \n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis API exposes the ability to pipeline row-parallel batch queries on a Modin D"
},
{
"path": "docs/flow/modin/experimental/core/io/index.rst",
"chars": 755,
"preview": ":orphan:\n\nExperimental IO Module Description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe module is used mostly for storing e"
},
{
"path": "docs/flow/modin/experimental/index.rst",
"chars": 763,
"preview": ":orphan:\n\nExperimental Modules Overview\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nIn some cases Modin can give the user the opportu"
},
{
"path": "docs/flow/modin/experimental/pandas.rst",
"chars": 748,
"preview": ":orphan:\n\nExperimental Pandas API\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n.. automodule:: modin.experimental.pandas\n :noindex:\n\nExperi"
},
{
"path": "docs/flow/modin/experimental/range_partitioning_groupby.rst",
"chars": 215,
"preview": ":orphan:\n\n.. redirect to the new page\n.. raw:: html\n\n <script type=\"text/javascript\">\n window.location.href = "
},
{
"path": "docs/flow/modin/experimental/reshuffling_groupby.rst",
"chars": 216,
"preview": ":orphan:\n\n.. redirect to the new page\n.. raw:: html\n\n <script type=\"text/javascript\">\n window.location.href = "
},
{
"path": "docs/flow/modin/experimental/sklearn.rst",
"chars": 226,
"preview": "Scikit-learn module description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThis module holds experimental scikit-learn-specific fu"
},
{
"path": "docs/flow/modin/experimental/xgboost.rst",
"chars": 6380,
"preview": "Modin XGBoost module description\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\nHigh-level Module Overview\n''''''''''''''''''''''''''\n"
},
{
"path": "docs/flow/modin/pandas/base.rst",
"chars": 267,
"preview": "Base pandas Dataset API\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nThe class implements functionality that is common to Modin's pandas API"
},
{
"path": "docs/flow/modin/pandas/dataframe.rst",
"chars": 4518,
"preview": ":orphan:\n\nDataFrame Module Overview\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nModin's ``pandas.DataFrame`` API\n''''''''''''''''''''''''"
},
{
"path": "docs/flow/modin/pandas/series.rst",
"chars": 3583,
"preview": ":orphan:\n\nSeries Module Overview\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nModin's ``pandas.Series`` API\n'''''''''''''''''''''''''''''\n\nMo"
},
{
"path": "docs/flow/modin/utils.rst",
"chars": 209,
"preview": ":orphan:\n\nModin Utils\n\"\"\"\"\"\"\"\"\"\"\"\n\nHere are utilities that can be useful when working with Modin.\n\nPublic API\n''''''''''"
},
{
"path": "docs/getting_started/examples.rst",
"chars": 6519,
"preview": "Examples and Resources\n======================\n\nHere you can find additional resources to learn about Modin. To learn mor"
},
{
"path": "docs/getting_started/faq.rst",
"chars": 9817,
"preview": "Frequently Asked Questions (FAQs)\n=================================\n\nBelow, you will find answers to the most commonly a"
},
{
"path": "docs/getting_started/installation.rst",
"chars": 10144,
"preview": "=============\nInstallation\n=============\n\n.. note:: \n | *Estimated Reading Time: 15 minutes*\n | If you already install"
},
{
"path": "docs/getting_started/quickstart.rst",
"chars": 6345,
"preview": "Getting Started\n===============\n\n.. note:: \n | *Estimated Reading Time: 10 minutes*\n | You can follow along this tutor"
},
{
"path": "docs/getting_started/troubleshooting.rst",
"chars": 14969,
"preview": "Troubleshooting\n===============\n\nWe hope your experience with Modin is bug-free, but there are some quirks about Modin\nt"
},
{
"path": "docs/getting_started/using_modin/using_modin.rst",
"chars": 263,
"preview": "Using Modin\n===========\n\nIn this section, we show how Modin can be used to accelerate your pandas workflows on a \nsingle"
},
{
"path": "docs/getting_started/using_modin/using_modin_cluster.rst",
"chars": 5570,
"preview": "Using Modin in a Cluster\n========================\n\n.. note::\n | *Estimated Reading Time: 15 minutes*\n\nOften in practice"
},
{
"path": "docs/getting_started/using_modin/using_modin_locally.rst",
"chars": 2893,
"preview": "===================\nUsing Modin Locally\n===================\n\n.. note::\n | *Estimated Reading Time: 5 minutes*\n | You c"
},
{
"path": "docs/getting_started/why_modin/modin_vs_dask_vs_koalas.rst",
"chars": 12274,
"preview": "Modin vs. Dask DataFrame vs. Koalas\n===================================\n\nLibraries such as `Dask DataFrame <https://docs"
},
{
"path": "docs/getting_started/why_modin/out_of_core.rst",
"chars": 3139,
"preview": "Out-of-memory data with Modin\n=============================\n\n.. note::\n | *Estimated Reading Time: 10 minutes*\n \nWhen "
},
{
"path": "docs/getting_started/why_modin/pandas.rst",
"chars": 3321,
"preview": "How does Modin differ from pandas?\n==================================\n\n.. note:: \n | *Estimated Reading Time: 10 minute"
},
{
"path": "docs/getting_started/why_modin/why_modin.rst",
"chars": 1349,
"preview": "Why Modin?\n==========\n\nIn this section, we explain the design and motivation behind Modin and why you should use Modin t"
},
{
"path": "docs/index.rst",
"chars": 5532,
"preview": ".. image:: img/MODIN_ver2_hrz.png\n :width: 400px\n :alt: modin logo\n :align: center\n\n====\n\n.. toctree::\n :hidden:"
},
{
"path": "docs/release-procedure.md",
"chars": 6868,
"preview": "## Versioning\n\n### Patch release\n\nModin uses semantic versioning. So when doing a patch release, please make a separate "
},
{
"path": "docs/release_notes/release_notes-0.14.0.rst",
"chars": 5750,
"preview": ":orphan:\n\nModin 0.14.0\n\nKey Features and Updates\n------------------------\n\n* Stability and Bugfixes\n * FIX-#4058: Allow"
},
{
"path": "docs/release_notes/release_notes-0.15.0.rst",
"chars": 4425,
"preview": ":orphan:\n\nModin 0.15.0\n\nKey Features and Updates\n------------------------\n\n* Stability and Bugfixes\n * FIX-#4376: Upgra"
},
{
"path": "docs/release_notes/release_notes-0.16.0.rst",
"chars": 13042,
"preview": ":orphan:\n\nModin 0.16.0\n\nKey Features and Updates\n------------------------\n\n* Stability and Bugfixes\n * FIX-#4570: Repla"
},
{
"path": "docs/release_notes/release_notes-template.rst",
"chars": 445,
"preview": ":orphan:\n\nModin X.X.X\n\nKey Features and Updates\n------------------------\n\n* Stability and Bugfixes\n *\n* Performance enh"
},
{
"path": "docs/requirements-doc.txt",
"chars": 444,
"preview": "# install current modin checkout to bring all required dependencies\n.[all]\n# now install some more optional dependencies"
},
{
"path": "docs/supported_apis/dataframe_supported.rst",
"chars": 86111,
"preview": "``pd.DataFrame`` supported APIs\n===================================\n\nThe following table lists both implemented and not "
},
{
"path": "docs/supported_apis/defaulting_to_pandas.rst",
"chars": 1691,
"preview": "Defaulting to pandas\n====================\n\nCurrently Modin does not support distributed execution for all methods from p"
},
{
"path": "docs/supported_apis/index.rst",
"chars": 1214,
"preview": "Supported APIs\n==============\n\nFor your convenience, we have compiled a list of currently implemented APIs and methods\na"
},
{
"path": "docs/supported_apis/io_supported.rst",
"chars": 7342,
"preview": "``pd.read_<file>`` and I/O APIs\r\n=================================\r\n\r\nA number of IO methods default to pandas. We have "
},
{
"path": "docs/supported_apis/older_pandas_compat.rst",
"chars": 1914,
"preview": "===================================\nPandas backwards compatibility mode\n===================================\n\nModin verio"
},
{
"path": "docs/supported_apis/series_supported.rst",
"chars": 54936,
"preview": "``pd.Series`` supported APIs\r\n============================\r\n\r\nThe following table lists both implemented and not impleme"
},
{
"path": "docs/supported_apis/utilities_supported.rst",
"chars": 7464,
"preview": "pandas Utilities Supported\r\n==========================\r\n\r\nIf you run ``import modin.pandas as pd``, the following operat"
},
{
"path": "docs/usage_guide/advanced_usage/batch.rst",
"chars": 14984,
"preview": "Batch Pipline API Usage Guide\n=============================\n\nModin provides an experimental batching feature that pipeli"
},
{
"path": "docs/usage_guide/advanced_usage/index.rst",
"chars": 7737,
"preview": "Advanced Usage\n==============\n\n.. toctree::\n :titlesonly:\n :hidden:\n\n /flow/modin/distributed/dataframe/pandas\n "
},
{
"path": "docs/usage_guide/advanced_usage/modin_engines.rst",
"chars": 2250,
"preview": "Modin engines\n=============\n\nAs a rule, you don't have to worry about initialization of an execution engine as\nModin its"
},
{
"path": "docs/usage_guide/advanced_usage/modin_logging.rst",
"chars": 4324,
"preview": "Modin Logging\n=============\n\nModin logging offers users greater insight into their queries by logging internal Modin API"
},
{
"path": "docs/usage_guide/advanced_usage/modin_metrics.rst",
"chars": 1990,
"preview": "Modin Metrics\n=============\n\nModin allows for third-party systems to register a metrics handler to collect specific API "
},
{
"path": "docs/usage_guide/advanced_usage/modin_xgboost.rst",
"chars": 3695,
"preview": "Distributed XGBoost on Modin\n============================\n\nModin provides an implementation of `distributed XGBoost`_ ma"
},
{
"path": "docs/usage_guide/advanced_usage/progress_bar.rst",
"chars": 531,
"preview": "Progress Bar\n============\n\nThe progress bar allows users to see the estimated progress and completion time of each line "
},
{
"path": "docs/usage_guide/advanced_usage/spreadsheets_api.rst",
"chars": 4110,
"preview": "Modin Spreadsheets API\n======================\n\nGetting started\n---------------\nInstall Modin-spreadsheet using pip:\n\n.. "
},
{
"path": "docs/usage_guide/benchmarking.rst",
"chars": 7192,
"preview": "Benchmarking Modin\n==================\n\nSummary\n-------\nTo benchmark a single Modin function, often turning on the\n:doc:`"
},
{
"path": "docs/usage_guide/examples/index.rst",
"chars": 2948,
"preview": "Modin Usage Examples\n====================\n\nThis section shows Modin usage examples in different scenarios like Modin on "
},
{
"path": "docs/usage_guide/index.rst",
"chars": 499,
"preview": "Usage Guide\n===========\n\nThis guide describes both basic and advanced Modin usage, including usage examples, \ndetails re"
},
{
"path": "docs/usage_guide/integrations.rst",
"chars": 6836,
"preview": "Third Party Library Integrations\n================================\n\nModin is a drop-in replacement for Pandas, so we want"
},
{
"path": "docs/usage_guide/optimization_notes/index.rst",
"chars": 20121,
"preview": "Optimization Notes\n==================\n\nModin has chosen default values for a lot of the configurations here that provide"
},
{
"path": "docs/usage_guide/optimization_notes/range_partitioning_ops.rst",
"chars": 15503,
"preview": ":orphan:\n\nOperations that support range-partitioning in Modin\n###################################################\n\nThe f"
},
{
"path": "environment-dev.yml",
"chars": 1844,
"preview": "name: modin\nchannels:\n - conda-forge\ndependencies:\n - pip\n\n # required dependencies\n - pandas>=2.2,<2.4\n - numpy>=1"
},
{
"path": "examples/data/boston_housing.csv",
"chars": 41084,
"preview": ",CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE\n0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0"
},
{
"path": "examples/data/census_1k.csv",
"chars": 98409,
"preview": "\"YEAR\",\"DATANUM\",\"SERIAL\",\"CBSERIAL\",\"HHWT\",\"CPI99\",\"GQ\",\"QGQ\",\"PERNUM\",\"PERWT\",\"SEX\",\"AGE\",\"EDUC\",\"EDUCD\",\"INCTOT\",\"SEX"
},
{
"path": "examples/data/nyc-taxi_1k.csv",
"chars": 149567,
"preview": "1460000001,2,2017-12-15 00:00:28,2017-12-15 00:15:43,N,1,,,,,2,1.50,11,0.5,0.5,1.25,0,,0.3,13.55,1,,,,yellow,0.11,1,1.2,"
},
{
"path": "examples/data/plasticc_test_set_1k.csv",
"chars": 36558,
"preview": "object_id,mjd,passband,flux,flux_err,detected\n13,59798.3205,2,-1.299735,1.357315,0\n13,59798.3281,1,-2.095392,1.148654,0\n"
},
{
"path": "examples/data/plasticc_test_set_metadata_1k.csv",
"chars": 82582,
"preview": "object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv\n13,34.453125,-5.229529,1"
},
{
"path": "examples/data/plasticc_training_set_1k.csv",
"chars": 38276,
"preview": "object_id,mjd,passband,flux,flux_err,detected\n615,59750.4229,2,-544.810303,3.622952,1\n615,59750.4306,1,-816.434326,5.553"
},
{
"path": "examples/data/plasticc_training_set_metadata_1k.csv",
"chars": 88029,
"preview": "object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target\n615,349.046051,-6"
},
{
"path": "examples/docker/modin-ray/Dockerfile",
"chars": 2620,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "examples/docker/modin-ray/build-docker-image.sh",
"chars": 2033,
"preview": "#!/bin/bash -e\n\n# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE "
},
{
"path": "examples/docker/modin-ray/census.py",
"chars": 5978,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "examples/docker/modin-ray/nyc-taxi.py",
"chars": 3886,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "examples/docker/modin-ray/plasticc.py",
"chars": 7933,
"preview": "# Licensed to Modin Development Team under one or more contributor license agreements.\n# See the NOTICE file distributed"
},
{
"path": "examples/jupyter/Modin_Taxi.ipynb",
"chars": 2918,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"cc4bd9e9\",\n \"metadata\": {\n \"slideshow\""
},
{
"path": "examples/jupyter/Pandas_Taxi.ipynb",
"chars": 2453,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"5d674ce8\",\n \"metadata\": {},\n \"outputs\":"
},
{
"path": "examples/jupyter/integrations/NLTK.ipynb",
"chars": 9740,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating NLTK Modin Interope"
},
{
"path": "examples/jupyter/integrations/altair.ipynb",
"chars": 1941,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Altair Modin Intero"
},
{
"path": "examples/jupyter/integrations/bokeh.ipynb",
"chars": 5847,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Bokeh Modin Interop"
},
{
"path": "examples/jupyter/integrations/huggingface.ipynb",
"chars": 9587,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Hugging Face Modin "
},
{
"path": "examples/jupyter/integrations/matplotlib.ipynb",
"chars": 8190,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Matplotlib Modin In"
},
{
"path": "examples/jupyter/integrations/plotly.ipynb",
"chars": 7117,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Plotly Modin Intero"
},
{
"path": "examples/jupyter/integrations/seaborn.ipynb",
"chars": 404920,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Seaborn Modin Inter"
},
{
"path": "examples/jupyter/integrations/sklearn.ipynb",
"chars": 10021,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating sklearn Modin Inter"
},
{
"path": "examples/jupyter/integrations/statsmodels.ipynb",
"chars": 6065,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating statsmodels Modin I"
},
{
"path": "examples/jupyter/integrations/tensorflow.ipynb",
"chars": 3621,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating Tensorflow Modin In"
},
{
"path": "examples/jupyter/integrations/xgboost.ipynb",
"chars": 2455,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Demonstrating XGBoost Modin Inter"
},
{
"path": "examples/modin-scikit-learn-example.ipynb",
"chars": 48285,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [\n {\n \"name\":"
},
{
"path": "examples/quickstart.ipynb",
"chars": 40912,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"
About this extraction
This page contains the full source code of the modin-project/modin GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 681 files (6.8 MB), approximately 1.8M tokens, and a symbol index with 6064 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.